diff --git a/.Rbuildignore b/.Rbuildignore
index c434a4b..bce7af1 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,4 +1,10 @@
-^.*\.Rproj$
-^\.Rproj\.user$
-^\.vscode
-^experiments$
+^.*\.Rproj$
+^\.Rproj\.user$
+^.*\.Rcheck$
+^\.vscode$
+^experiments$
+^.*.zip
+^.*.tar.gz
+^\.lintr$
+^CITATIONS.bib$
+^\.github$
diff --git a/.github/workflows/r_check.yml b/.github/workflows/r_check.yml
new file mode 100644
index 0000000..01820a1
--- /dev/null
+++ b/.github/workflows/r_check.yml
@@ -0,0 +1,44 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  pull_request:
+    branches: [main, master]
+
+name: R-CMD-check
+
+jobs:
+  R-CMD-check:
+    runs-on: ${{ matrix.config.os }}
+
+    name: ${{ matrix.config.os }} (${{ matrix.config.r }})
+
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - {os: macOS-latest, r: 'release'}
+          - {os: windows-latest, r: 'release'}
+          - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
+          - {os: ubuntu-latest, r: 'release'}
+          - {os: ubuntu-latest, r: 'oldrel-1'}
+          - {os: ubuntu-latest, r: '3.6'}
+
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      R_KEEP_PKG_SOURCE: yes
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: ${{ matrix.config.r }}
+          http-user-agent: ${{ matrix.config.http-user-agent }}
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::rcmdcheck
+          needs: check
+
+      - uses: r-lib/actions/check-r-package@v2
diff --git a/.gitignore b/.gitignore
index 05ca070..e27745a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,54 +1,61 @@
-.env/
-env/
-__pycache__
-*.so
-*.o
-*.pcm
-*.d
-*.root
-.vscode
-*.cxx_tmp_*
-*.rds
-*ACLiC*
-.Rproj.user
-r/*.html
-*.RData
-*.aux
-*.log
-*.fls
-*.fdb_latexmk
-*.pdf
-*.synctex.*
-*.nb.html
-.Rhistory
-.Rapp.history
-.RData
-*-Ex.R
-/*.tar.gz
-/*.Rcheck/
-vignettes/*.html
-vignettes/*.pdf
-.httr-oauth
-/*_cache/
-/cache/
-*.utf8.md
-*.knit.md
-rsconnect/
-experiments/results/*
-!experiments/results/*.jpg
-experiments/data/*
-!experiments/data/*.r
-!experiments/data/*.R
-!experiments/data/*.py
-!experiments/data/*.cpp
-!experiments/data/*.h
-tmp
-*.html
-*.zip
-*~
-*.~
-todo.md
-.lintr
-!vignettes/paper.pdf
-!vignettes/presentation.pdf
-!vignettes/poster.pdf
+.env/
+env/
+__pycache__
+*.so
+*.o
+*.pcm
+*.d
+*.root
+.vscode
+*.cxx_tmp_*
+*.rds
+*ACLiC*
+.Rproj.user
+*Rproj
+r/*.html
+*.RData
+*.Rcheck
+*.aux
+*.log
+*.fls
+*.fdb_latexmk
+*.pdf
+*.synctex.*
+*.nb.html
+.Rhistory
+.Rapp.history
+.RData
+*-Ex.R
+/*.tar.gz
+/*.Rcheck/
+vignettes/*.html
+vignettes/*.pdf
+.httr-oauth
+/*_cache/
+/cache/
+*.utf8.md
+*.knit.md
+rsconnect/
+experiments/results/*
+!experiments/results/*.png
+!experiments/results/*.jpg
+experiments/data/*
+!experiments/data/*.r
+!experiments/data/*.R
+!experiments/data/*.py
+!experiments/data/*.cpp
+!experiments/data/*.h
+tmp
+*.html
+*.zip
+*.tar.gz
+
+*~
+*.~
+todo.md
+*.dll
+data/*
+log*.txt
+!vignettes/paper.pdf
+!vignettes/presentation.pdf
+!vignettes/poster.pdf
\ No newline at end of file
diff --git a/.lintr b/.lintr
new file mode 100644
index 0000000..42df6c8
--- /dev/null
+++ b/.lintr
@@ -0,0 +1,7 @@
+linters: with_defaults(
+    line_length_linter(120),
+    commented_code_linter = NULL,
+    object_name_linter = NULL,
+    object_usage_linter = NULL
+  )
+encoding: "utf-8"
diff --git a/CITATIONS.bib b/CITATIONS.bib
new file mode 100644
index 0000000..58ce786
--- /dev/null
+++ b/CITATIONS.bib
@@ -0,0 +1,22 @@
+@article{bjorklund2022robust,
+  title = {Robust regression via error tolerance},
+  author = {Bj{\"o}rklund, Anton and Henelius, Andreas and Oikarinen, Emilia and Kallonen, Kimmo and Puolam{\"a}ki, Kai},
+  year = {2022},
+  month = jan,
+  journal = {Data Mining and Knowledge Discovery},
+  issn = {1384-5810, 1573-756X},
+  doi = {10.1007/s10618-022-00819-2}
+}
+
+@inproceedings{bjorklund2019sparse,
+  title = {Sparse Robust Regression for Explaining Classifiers},
+  booktitle = {Discovery Science},
+  author = {Bj{\"o}rklund, Anton and Henelius, Andreas and Oikarinen, Emilia and Kallonen, Kimmo and Puolam{\"a}ki, Kai},
+  year = {2019},
+  series = {Lecture Notes in Computer Science},
+  volume = {11828},
+  pages = {351--366},
+  publisher = {Springer International Publishing},
+  doi = {10.1007/978-3-030-33778-0_27},
+  isbn = {978-3-030-33777-3 978-3-030-33778-0}
+}
diff --git a/DESCRIPTION b/DESCRIPTION
index d954527..b4fb5ce 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,17 +1,18 @@
-Package: slise
-Title: Sparse Linear Subset Explanations
-Version: 1.0.0
-Authors@R: person("Anton", "Björklund", email = "anton.bjorklund@helsinki.fi", role = c("aut", "cre"))
-Description: An implementation of the SLISE
-    algorithm for robust regression and
-    explaining outcomes from black box models.
-Depends: R (>= 3.5), Rcpp
-License: MIT + file LICENSE
-Encoding: UTF-8
-LazyData: true
-Imports: lbfgs, RcppArmadillo, ggplot2, stats, utils, methods, graphics, grDevices
-Suggests: scatterplot3d, grid, gridExtra, reshape2, crayon, wordcloud, testthat, stringr, numDeriv, R.rsp
-URL: https://github.com/edahelsinki/slise
-LinkingTo: Rcpp, RcppArmadillo
-RoxygenNote: 6.1.1.9000
-VignetteBuilder: R.rsp
+Package: slise
+Title: Sparse Linear Subset Explanations
+Version: 2.0.0
+Authors@R: c(
+    person("Anton", "Björklund", email = "anton.bjorklund@helsinki.fi", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-7749-2918")),
+    person("Andreas", "Henelius", role = "aut", comment = c(ORCID = "0000-0002-4040-6967")),
+    person("Kai", "Puolamäki", role = "aut", comment = c(ORCID = "0000-0003-1819-1047")))
+Description: An implementation of the SLISE algorithm (for robust regression and explaining outcomes from black box models).
+Depends: R (>= 3.5), Rcpp
+License: MIT + file LICENSE
+Encoding: UTF-8
+LazyData: true
+Imports: lbfgs, stats, utils, methods, graphics, base
+Suggests: ggplot2, grid, gridExtra, reshape2, wordcloud, testthat, numDeriv, R.rsp
+URL: https://github.com/edahelsinki/slise
+LinkingTo: Rcpp, RcppArmadillo
+RoxygenNote: 7.1.1
+VignetteBuilder: R.rsp
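The new workflow runs R CMD check on every pull request across macOS, Windows, and several Ubuntu/R combinations. As a rough local sketch (not part of this diff; assumes the rcmdcheck and lintr packages are installed), the same check and the new lint configuration can be exercised with:

    # Sketch: approximate the CI checks locally before opening a pull request
    rcmdcheck::rcmdcheck(".", args = "--no-manual", error_on = "warning")
    # Lint the package using the .lintr configuration added above
    lintr::lint_package(".")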
diff --git a/LICENSE b/LICENSE
index 027c213..8aab042 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,21 +1,21 @@
-MIT License
-
-Copyright (c) 2019 Anton Björklund, University of Helsinki
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+MIT License
+
+Copyright (c) 2022 Anton Björklund, University of Helsinki
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/NAMESPACE b/NAMESPACE
index f7a4c8b..c6d4a06 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,14 +1,29 @@
-# Generated by roxygen2: do not edit by hand
-
-S3method(explain,slise)
-S3method(plot,slise)
-S3method(predict,slise)
-S3method(print,slise)
-export(explain)
-export(slise.explain)
-export(slise.explain_comb)
-export(slise.explain_find)
-export(slise.fit)
-export(slise.raw)
-importFrom(Rcpp,sourceCpp)
-useDynLib(slise)
+# Generated by roxygen2: do not edit by hand
+
+S3method(plot,slise)
+S3method(predict,slise)
+S3method(print,slise)
+export(graduated_optimisation)
+export(scale_robust)
+export(simple_pca)
+export(slise.explain)
+export(slise.explain_comb)
+export(slise.explain_find)
+export(slise.fit)
+export(slise_initialisation_candidates)
+export(slise_initialisation_candidates2)
+export(slise_initialisation_lasso)
+export(slise_initialisation_ols)
+export(slise_initialisation_zeros)
+importFrom(Rcpp,sourceCpp)
+importFrom(graphics,legend)
+importFrom(stats,.lm.fit)
+importFrom(stats,lm.wfit)
+importFrom(stats,mad)
+importFrom(stats,median)
+importFrom(stats,predict)
+importFrom(stats,quantile)
+importFrom(stats,sd)
+importFrom(stats,uniroot)
+importFrom(utils,combn)
+useDynLib(slise)
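After installing this branch, the reworked API surface can be compared against the regenerated NAMESPACE; a hedged sketch:

    # Sketch: verify the exported API of the installed package
    sort(getNamespaceExports("slise"))
    # slise.raw and the explain() generic are gone; graduated_optimisation,
    # scale_robust, simple_pca, and the slise_initialisation_* family are new.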
diff --git a/R/RcppExports.R b/R/RcppExports.R
index cf2e54a..8a054e3 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -1,39 +1,43 @@
-# Generated by using Rcpp::compileAttributes() -> do not edit by hand
-# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
-
-sigmoidc <- function(x) {
-    .Call('_slise_sigmoidc', PACKAGE = 'slise', x)
-}
-
-loss_smooth_c <- function(alpha, data, response, beta, epsilon, lambda = 0) {
-    .Call('_slise_loss_smooth_c', PACKAGE = 'slise', alpha, data, response, beta, epsilon, lambda)
-}
-
-loss_smooth_c_dc <- function(xs, dcptr) {
-    .Call('_slise_loss_smooth_c_dc', PACKAGE = 'slise', xs, dcptr)
-}
-
-loss_smooth_grad_c <- function(alpha, data, response, beta, epsilon, lambda = 0) {
-    .Call('_slise_loss_smooth_grad_c', PACKAGE = 'slise', alpha, data, response, beta, epsilon, lambda)
-}
-
-loss_smooth_grad_c_dc <- function(xs, dcptr) {
-    .Call('_slise_loss_smooth_grad_c_dc', PACKAGE = 'slise', xs, dcptr)
-}
-
-lg_combined_smooth_c_dc <- function(xs, dcptr) {
-    .Call('_slise_lg_combined_smooth_c_dc', PACKAGE = 'slise', xs, dcptr)
-}
-
-lg_getgrad_c_dc <- function(xs, dcptr) {
-    .Call('_slise_lg_getgrad_c_dc', PACKAGE = 'slise', xs, dcptr)
-}
-
-loss_smooth_c_ptr <- function() {
-    .Call('_slise_loss_smooth_c_ptr', PACKAGE = 'slise')
-}
-
-loss_smooth_grad_c_ptr <- function() {
-    .Call('_slise_loss_smooth_grad_c_ptr', PACKAGE = 'slise')
-}
-
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+sigmoidc <- function(x) {
+    .Call('_slise_sigmoidc', PACKAGE = 'slise', x)
+}
+
+log_sigmoidc <- function(x) {
+    .Call('_slise_log_sigmoidc', PACKAGE = 'slise', x)
+}
+
+loss_smooth_c <- function(alpha, data, response, epsilon, beta, lambda1, lambda2, weight) {
+    .Call('_slise_loss_smooth_c', PACKAGE = 'slise', alpha, data, response, epsilon, beta, lambda1, lambda2, weight)
+}
+
+loss_smooth_c_dc <- function(xs, dcptr) {
+    .Call('_slise_loss_smooth_c_dc', PACKAGE = 'slise', xs, dcptr)
+}
+
+loss_smooth_grad_c <- function(alpha, data, response, epsilon, beta, lambda1, lambda2, weight) {
+    .Call('_slise_loss_smooth_grad_c', PACKAGE = 'slise', alpha, data, response, epsilon, beta, lambda1, lambda2, weight)
+}
+
+loss_smooth_grad_c_dc <- function(xs, dcptr) {
+    .Call('_slise_loss_smooth_grad_c_dc', PACKAGE = 'slise', xs, dcptr)
+}
+
+lg_combined_smooth_c_dc <- function(xs, dcptr) {
+    .Call('_slise_lg_combined_smooth_c_dc', PACKAGE = 'slise', xs, dcptr)
+}
+
+lg_getgrad_c_dc <- function(xs, dcptr) {
+    .Call('_slise_lg_getgrad_c_dc', PACKAGE = 'slise', xs, dcptr)
+}
+
+loss_smooth_c_ptr <- function() {
+    .Call('_slise_loss_smooth_c_ptr', PACKAGE = 'slise')
+}
+
+loss_smooth_grad_c_ptr <- function() {
+    .Call('_slise_loss_smooth_grad_c_ptr', PACKAGE = 'slise')
+}
+
diff --git a/R/cpp.R b/R/cpp.R
new file mode 100644
index 0000000..9440b37
--- /dev/null
+++ b/R/cpp.R
@@ -0,0 +1,10 @@
+#' @useDynLib slise
+#' @importFrom Rcpp sourceCpp
+"_PACKAGE"
+NULL
+
+.onUnload <- function(libpath) {
+    library.dynam.unload("slise", libpath)
+}
+
+Rcpp::loadModule("slise_mod", TRUE)
diff --git a/R/data.R b/R/data.R
index ba74c2b..c29c47e 100644
--- a/R/data.R
+++ b/R/data.R
@@ -1,214 +1,166 @@
-# This script contains some data preprocessing (scaling, centering, et.c.)
-
-#' Robust Scale
-#' A scale that can handle zero variance
-#'
-#' @param x the vector/matrix to normalise
-#' @param center Should constant columns be centered (TRUE)
-#' @param scale Should constant columns be scaled (TRUE)
-#' @param remove_constant Should constant columns be removed (TRUE)
-#'
-#' @return a list(scaled, center, scale, mask)
-#'
-scale_robust <- function(x, center=TRUE, scale = TRUE, remove_constant = TRUE) {
-    if (is.null(dim(x))) dim(x) <- c(length(x), 1)
-    if (center) {
-        meanx <- colMeans(x)
-        x <- sweep(x, 2, meanx, "-")
-    }
-    else {
-        meanx <- rep(0, ncol(x))
-    }
-    if (scale) {
-        stddv <- sqrt(colSums(x ^ 2) / (nrow(x) - 1))
-        if (remove_constant && any(stddv == 0)) {
-            mask <- stddv != 0
-            stddv <- stddv[mask]
-            meanx <- meanx[mask]
-            scaled <- sweep(x[, mask, drop = FALSE], 2, stddv, "/")
-        }
-        else {
-            stddv[stddv == 0] <- 1
-            scaled <- sweep(x, 2, stddv, "/")
-            mask <- NULL
-        }
-        list(scaled = scaled, center = meanx, scale = stddv, mask = mask)
-    }
-    else {
-        if (remove_constant) {
-            mask <- colSums(if(center) x^2 else sweep(x, 2, colMeans(x, na.rm=TRUE), "-")^2) != 0
-            if (sum(mask, na.rm = TRUE) < length(mask)) {
-                meanx <- meanx[mask]
-                x <- x[, mask, drop = FALSE]
-            }
-            else mask <- NULL
-        }
-        else mask <- NULL
-        list(scaled = x, center = meanx, scale = rep(1, length(meanx)), mask = mask)
-    }
-}
-
-#' Scale a vector to a range of approx one
-#'
-#' @param x the vector to normalise
-#'
-#' @return a list(scaled, center, scale)
-#'
-scale_one_range <- function(x) {
-    qs <- stats::quantile(x, c(0.05, 0.95), names = FALSE)
-    if (qs[1] >= 0 && qs[2] <= 1 && qs[1] < 0.5 && qs[2] > 0.5) {
-        list(scaled = x - 0.5, scale = 1, center = 0.5)
-    } else {
-        m <- mean(x)
-        qs <- qs - m
-        s <- abs(qs[2] - qs[1])
-        if (s == 0) s <- 1
-        list(scaled = (x - m) / s, scale = s, center = m)
-    }
-}
-
-#' Create a scale list without any changes
-#'
-#' @param x the vector to (not) scale
-#'
-#' @return a list(scaled = x, center = 0, scale = 1)
-#'
-scale_identity <- function(x) list(scaled = x, scale = rep(1, length(x)), center = rep(0, length(x)))
-
-#' SLISE Data Preprocessing
-#' Scales data and optionally adds intercept
-#'
-#' @param X the data matrix
-#' @param Y the response vector
-#' @param scale should the data be (columnwise) normalised (FALSE)
-#' @param intercept should an intercept column be added (FALSE)
-#' @param logit_tr should the response be logit-transformed (FALSE)
-#' @param scale_y should the response be scaled (FALSE)
-#'
-#' @return list(X, Y, scaling_functions...)
-#'
-data_preprocess <- function(X, Y, scale=FALSE, intercept=FALSE, logit_tr=FALSE, scale_y = TRUE) {
-    if (logit_tr) {
-        Y <- logit(Y)
-    }
-    scx <- scale_robust(X, scale = scale, center = scale)
-    scy <- if (scale_y) scale_one_range(Y) else scale_identity(Y)
-    X <- scx$scaled
-    Y <- scy$scaled
-    if (intercept) X <- cbind(rep(1, nrow(X)), X)
-    list(
-        X = X,
-        Y = Y,
-        scale_x = function(x) {
-            if (!is.null(scx$mask))
-                x <- x[scx$mask]
-            if (length(x) > length(scx$center))
-                sweep(sweep(x, 2, scx$center, "-"), 2, scx$scale, "/")
-            else
-                (x - scx$center) / scx$scale
-        },
-        scale_y = function(y) {
-            if (logit_tr) y <- logit(y)
-            (y - scy$center) / scy$scale
-        },
-        scale_alpha = function(alpha) {
-            if (!is.null(scx$mask)) {
-                if (length(alpha) > length(scx$mask)) {
-                    inter <- alpha[[1]]
-                    alpha <- alpha[-1]
-                }
-                alpha <- alpha[scx$mask]
-            } else if (length(alpha) > length(scx$center)) {
-                inter <- alpha[[1]]
-                alpha <- alpha[-1]
-            } else
-                inter <- 0
-            inter <- (inter - scy$center + sum(alpha * scx$center)) / scy$scale
-            alpha <- alpha / scy$scale * scx$scale
-            c(inter, alpha)
-        },
-        unscale_alpha = function(alpha) {
-            if (length(alpha) == length(scx$center) + 1) {
-                inter <- alpha[[1]]
-                alpha <- alpha[-1]
-            } else {
-                inter <- 0 #Always returns with intercept
-            }
-            if (length(alpha) != length(scx$center))
-                stop(paste(length(alpha), "!=", length(scx$center)))
-            inter <- (inter - sum(alpha * scx$center / scx$scale)) * scy$scale + scy$center
-            alpha <- alpha / scx$scale * scy$scale
-            if (is.null(scx$mask))
-                c(inter, alpha)
-            else {
-                a2 <- rep(0, length(scx$mask))
-                a2[scx$mask] <- alpha
-                c(inter, a2)
-            }
-        },
-        expand_alpha = function(alpha) {
-            if (is.null(scx$mask))
-                alpha
-            else if (length(alpha) == length(scx$center) + 1) {
-                a2 <- rep(0, length(scx$mask))
-                a2[scx$mask] <- alpha[-1]
-                c(alpha[[1]], a2)
-            } else {
-                a2 <- rep(0, length(scx$mask))
-                a2[scx$mask] <- alpha
-                a2
-            }
-        },
-        unscale_y = function(y) {
-            y <- y * scy$scale + scy$center
-            if (logit_tr) sigmoid(y) else y
-        }
-    )
-}
-
-#' SLISE Data Preprocessing without changing anything
-#'
-#' @param X the data matrix
-#' @param Y the response vector
-#'
-#' @return list(X, Y, scaling_functions...)
-#'
-data_identity <- function(X, Y) {
-    list(
-        X = X,
-        Y = Y,
-        scale_x = function(x) x,
-        scale_y = function(y) y,
-        scale_alpha = function(alpha) alpha,
-        unscale_alpha = function(alpha) alpha,
-        expand_alpha = function(alpha) alpha,
-        unscale_y = function(y) y
-    )
-}
-
-
-#' Shift the data to be local around a point
-#'
-#' @param X data matrix
-#' @param Y response vector
-#' @param x local data point
-#' @param y local response
-#'
-#' @return list(X = X - x, Y = Y - y, scaling_functions...)
-#'
-data_local <- function(X, Y, x, y=NULL) {
-    if (is.null(y)) {
-        # x is index
-        y <- Y[[x]]
-        x <- X[x,]
-    }
-    X <- sweep(X, 2, x)
-    Y <- Y - y
-    list(
-        X = X,
-        Y = Y,
-        scale_x = function(z) z - x,
-        scale_y = function(z) z - y,
-        unscale_alpha = function(alpha) c(y - sum(x * alpha), alpha)
-    )
-}
+# This script contains some data preprocessing (scaling, centering, etc.)
+
+add_intercept_column <- function(x) {
+    if (is.null(dim(x))) {
+        dim(x) <- c(length(x), 1)
+    }
+    x2 <- cbind(rep(1, nrow(x)), x)
+    attr(x2, "scaled:center") <- attr(x, "scaled:center")
+    attr(x2, "scaled:scale") <- attr(x, "scaled:scale")
+    attr(x2, "constant_columns") <- attr(x, "constant_columns")
+    attr(x2, "intercept") <- TRUE
+    x2
+}
+
+remove_intercept_column <- function(x) {
+    x2 <- x[, -1]
+    attr(x2, "scaled:center") <- attr(x, "scaled:center")
+    attr(x2, "scaled:scale") <- attr(x, "scaled:scale")
+    attr(x2, "constant_columns") <- attr(x, "constant_columns")
+    x2
+}
+
+#'
+#' @importFrom stats sd
+remove_constant_columns <- function(x, epsilon = .Machine$double.eps) {
+    stddv <- apply(x, 2, sd)
+    mask <- which(stddv < epsilon)
+    if (length(mask) == 0) {
+        return(x)
+    }
+    x <- x[, -mask, drop = FALSE]
+    attr(x, "constant_columns") <- mask
+    x
+}
+
+add_constant_columns <- function(x, columns) {
+    if (length(columns) == 0) {
+        x
+    } else if (is.null(dim(x))) {
+        x2 <- rep(0, length(x) + length(columns))
+        x2[-columns] <- x
+        x2
+    } else {
+        x2 <- matrix(0, nrow(x), ncol(x) + length(columns))
+        x2[, -columns] <- x
+        x2
+    }
+}
+
+unscale_alpha <- function(alpha, x_center, x_scale, y_center=NULL, y_scale=NULL) {
+    if (is.null(y_center) && is.null(y_scale)) {
+        if (!hasattr(x_scale, "scaled:scale") && !hasattr(x_center, "scaled:scale")) {
+            stop("X and Y must have the scaled attributes")
+        }
+        y_center <- attr(x_scale, "scaled:center")
+        y_scale <- attr(x_scale, "scaled:scale")
+        x_scale <- attr(x_center, "scaled:scale")
+        x_center <- attr(x_center, "scaled:center")
+    }
+    if (length(alpha) == length(x_center)) {
+        alpha <- c(0, alpha)
+    }
+    alpha[1] <- (alpha[1] - sum(alpha[-1] * x_center / x_scale)) * y_scale + y_center
+    alpha[-1] <- alpha[-1] / x_scale * y_scale
+    alpha
+}
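A minimal sketch of how the constant-column helpers compose (illustrative only; these helpers are internal, so this assumes they are in scope, e.g. via devtools::load_all()):

    # Sketch: drop a constant column, then pad results back to the original shape
    x <- cbind(rnorm(5), rep(2, 5), rnorm(5))
    xr <- remove_constant_columns(x)     # drops the constant second column
    attr(xr, "constant_columns")         # index of the dropped column: 2
    add_constant_columns(colMeans(xr), attr(xr, "constant_columns"))
    # length-3 vector with a 0 where the constant column used to be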
+#' Robust Scale
+#' A variant of 'scale' that is based on median and mad (instead of mean and sd).
+#' It can handle zero variance without producing NaNs.
+#'
+#' @param x the vector/matrix to normalise
+#' @param th threshold for the scale being zero
+#'
+#' @return scaled_x (with attributes "scaled:center" and "scaled:scale")
+#' @export
+#'
+#' @importFrom stats median
+#' @importFrom stats mad
+#'
+scale_robust <- function(x, th = .Machine$double.eps) {
+    if (is.null(dim(x))) {
+        # Vector
+        center <- median(x)
+        scale <- mad(x, center)
+        if (scale < th) {
+            scale <- 1
+        }
+        x <- (x - center) / scale
+    } else {
+        # Matrix
+        center <- apply(x, 2, median)
+        x <- sweep(x, 2, center)
+        scale <- apply(x, 2, mad, 0)
+        scale[scale < th] <- 1
+        x <- sweep(x, 2, scale, `/`)
+    }
+    attr(x, "scaled:center") <- c(center)
+    attr(x, "scaled:scale") <- c(scale)
+    x
+}
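scale_robust is exported; a short usage sketch (values illustrative, not from the diff):

    # Sketch: median/MAD scaling that tolerates constant columns
    x <- cbind(rnorm(10, mean = 5), rep(1, 10))
    xs <- scale_robust(x)
    attr(xs, "scaled:center")  # per-column medians
    attr(xs, "scaled:scale")   # per-column MADs; falls back to 1 for the constant column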
+
+
+#' A variant of `scale` that only adds the attributes
+#'
+#' @param x the vector to (not) scale
+#'
+#' @return x (with attributes "scaled:center" and "scaled:scale")
+#'
+scale_identity <- function(x) {
+    attr(x, "scaled:center") <- 0
+    attr(x, "scaled:scale") <- 1
+    x
+}
+
+scale_same <- function(x, center=NULL, scale=NULL, constant_columns=NULL) {
+    if (is.null(center)) {
+        return(x)
+    }
+    if (is.null(scale) && is.null(constant_columns) && hasattr(center, "scaled:scale")) {
+        constant_columns <- attr(center, "constant_columns")
+        scale <- attr(center, "scaled:scale")
+        center <- attr(center, "scaled:center")
+    }
+    if (is.null(dim(x))) {
+        if (!is.null(constant_columns)) {
+            x <- x[-constant_columns]
+        }
+        x <- (x - center) / scale
+    } else {
+        if (!is.null(constant_columns)) {
+            x <- x[, -constant_columns, drop = FALSE]
+        }
+        x <- sweep(sweep(x, 2, center), 2, scale, `/`)
+    }
+    attr(x, "scaled:scale") <- scale
+    attr(x, "scaled:center") <- center
+    attr(x, "constant_columns") <- constant_columns
+    x
+}
+
+#' Calculate the PCA rotation matrix
+#' The implementation is based on stats::prcomp.
+#' Assumes the data has already been centered and scaled (if that is desired).
+#'
+#' @param X the matrix to reduce
+#' @param dimensions the number of dimensions after PCA
+#' @param tolerance remove components with variance less than the tolerance
+#'
+#' @return pca rotation matrix
+#' @export
+#'
+simple_pca <- function(X, dimensions, tolerance = 1e-10) {
+    # PCA to a desired number of dimensions
+    dimensions <- min(dimensions, ncol(X))
+    s <- svd(X, nu = 0, nv = dimensions)
+    dimensions <- min(dimensions, length(s$d))
+    pca_rotation <- s$v
+    # Remove columns with too little variance
+    rank <- sum(s$d[1:dimensions] > (s$d[[1]] * tolerance), na.rm = TRUE)
+    if (rank < ncol(pca_rotation)) {
+        pca_rotation <- pca_rotation[, 1:rank, drop = FALSE]
+    }
+    # Return the rotation matrix (only)
+    pca_rotation
+}
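simple_pca is also exported; a sketch of the intended call pattern (scale first, then rotate):

    # Sketch: project robustly scaled data onto its first two principal components
    X <- matrix(rnorm(200), 50, 4)
    Xs <- scale_robust(X)
    rotation <- simple_pca(Xs, 2)  # 4 x 2 rotation matrix
    X2 <- Xs %*% rotation          # 50 x 2 projected data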
diff --git a/R/explain_image.R b/R/explain_image.R
deleted file mode 100644
index 18614bb..0000000
--- a/R/explain_image.R
+++ /dev/null
@@ -1,214 +0,0 @@
-# This script contains helper functions for plotting explanations for images
-
-
-explain_slise_color_bw <- function() c("white", "black")
-explain_slise_color_rg <- function() c("red2", "green3")
-explain_slise_color_rg_cb <- function() c("#d01c8b", "#4dac26")
-explain_slise_color_cb <- function() c("#e66101", "#5e3c99")
-
-# Scale the color intensity to increase visibility when printed
-explain_slise_scale_colors <- function(x) {
-    x <- x / max(abs(x))
-    sigmoid(x * 4) * 2 - 1
-}
-
-# Plot a single image with optional outline
-explain_img_slise_image <- function(img, contour = NULL, width = 28, height = 28,
-        colors = explain_slise_color_cb(), class_labels = NULL, ..., scale_colors = TRUE) {
-    if (!requireNamespace("reshape2", quietly = TRUE)) {
-        stop("Package \"reshape2\" needed for the function to work. Please install it.",
-            call. = FALSE)
-    }
-    image <- reshape2::melt(matrix(img, height, width))
-    if (scale_colors)
-        image$value <- explain_slise_scale_colors(image$value)
-    limits2 <- c(-1, 1) * max(abs(image$value))
-    plt <- ggplot2::ggplot(image, ggplot2::aes(image$Var2, image$Var1)) + ggplot2::geom_raster(ggplot2::aes(fill = image$value), interpolate = FALSE)
-    if (is.null(class_labels))
-        plt <- plt + ggplot2::scale_fill_gradient2(low = colors[[1]], mid = "white", high = colors[[2]], name = NULL)
-    else
-        plt <- plt + ggplot2::scale_fill_gradient2(low = colors[[1]], mid = "white", high = colors[[2]],
-            limits = limits2, labels = class_labels, breaks = limits2, guide = ggplot2::guide_legend(title = NULL))
-    if (!is.null(contour)) {
-        contour <- reshape2::melt(matrix(contour, height, width))
-        plt <- plt + ggplot2::stat_contour(ggplot2::aes(x = contour$Var2, y = contour$Var1, z = contour$value), data = contour, col = "black", bins = 1)
-    }
-    explain_img_slise_theme(plt, ...)
-}
-
-# Plot a lineup of images with optional outlines
-explain_img_slise_lineup <- function(imgs, labels, contours = NULL, width = 28, height = 28,
-        colors = explain_slise_color_cb(), class_labels = NULL, ..., nrow = 3, scale_colors = TRUE) {
-    if (!requireNamespace("reshape2", quietly = TRUE)) {
-        stop("Package \"reshape2\" needed for the function to work. Please install it.",
-            call. = FALSE)
-    }
-    stopifnot(length(labels) == nrow(imgs))
-    labels <- factor(labels, levels = labels)
-    images <- do.call(rbind, lapply(seq_along(labels), function(i) {
-        image <- reshape2::melt(matrix(imgs[i,], width, height))
-        if (scale_colors)
-            image$value <- explain_slise_scale_colors(image$value)
-        image$label <- labels[[i]]
-        if (!is.null(contours)) {
-            cim <- reshape2::melt(contours[i,], width, height)
-            image$contour <- cim$value
-        }
-        image
-    }))
-    limits2 <- c(-1, 1) * max(abs(images$value))
-    plt_img <- ggplot2::ggplot(images, ggplot2::aes(images$Var2, images$Var1)) + ggplot2::geom_raster(ggplot2::aes(fill = images$value), interpolate = FALSE) +
-        ggplot2::facet_wrap(ggplot2::vars(images$label), scales = "free", nrow = nrow)
-    if (is.null(class_labels))
-        plt_img <- plt_img + ggplot2::scale_fill_gradient2(low = colors[[1]], mid = "white", high = colors[[2]], name = NULL)
-    else
-        plt_img <- plt_img + ggplot2::scale_fill_gradient2(low = colors[[1]], mid = "white", high = colors[[2]],
-            limits = limits2, labels = class_labels, breaks = limits2, guide = ggplot2::guide_legend(title = NULL))
-    if (!is.null(contours)) {
-        plt_img <- plt_img + ggplot2::stat_contour(ggplot2::aes(z = contours), col = "black", bins = 1)
-    }
-    explain_img_slise_theme(plt_img, ...) + ggplot2::theme(strip.background = ggplot2::element_rect(fill = "white"), strip.text = ggplot2::element_text(color = "black"))
-}
-
-# Plot a scatterplot of images
-explain_img_slise_scatter <- function(slise, width = 28, height = 28, lineup = NULL, ..., scatter_size = 0.03, num_scatter = 100, logits = FALSE) {
-    if (!requireNamespace("grid", quietly = TRUE)) {
-        stop("Package \"grid\" needed for the function to work. Please install it.",
-            call. = FALSE)
-    }
-    if (!is.null(slise$logit) && slise$logit) {
-        Y_black_box <- slise$scaled$Y
-        Y_slise <- slise$scaled$X %*% slise$alpha[-1] + slise$alpha[[1]]
-        scaled_y <- slise$scaled$scale_y(slise$y)
-        mask <- Y_slise >= -0.55 & Y_slise <= 0.55
-    } else {
-        Y_black_box <- slise$Y
-        Y_slise <- slise$scaled$unscale_y(slise$scaled$X %*% slise$alpha[-1] + slise$alpha[[1]])
-        scaled_y <- slise$y
-        mask <- Y_slise >= -0.05 & Y_slise <= 1.05
-    }
-    X_mask <- -slise$X * 0.5 + 0.5
-    selected <- explain_slise_select_overlap(Y_black_box, Y_slise, scatter_size, scatter_size, sample(which(mask)), num_scatter)
-
-    plt <- ggplot2::ggplot() +
-        ggplot2::geom_tile(ggplot2::aes(scaled_y, scaled_y, width = scatter_size / 3, height = scatter_size / 3, fill = "Explained")) +
-        lapply(selected, function(i) {
-            im <- matrix(X_mask[i,], width, height)
-            g <- grid::rasterGrob(im, name=i)
-            ggplot2::annotation_custom(g, xmin = Y_slise[[i]] - scatter_size, xmax = Y_slise[[i]] + scatter_size,
-                ymin = Y_black_box[[i]] - scatter_size, ymax = Y_black_box[[i]] + scatter_size)
-        }) +
-        #geom_density2d(ggplot2::aes(Y_slise, Y_black_box, col="Dense Areas"), bins=5, linetype=3, size = 0.8) +
-        ggplot2::geom_abline(ggplot2::aes(intercept = 0, slope = 1, col = "Subset"), size = 1) +
-        ggplot2::geom_abline(ggplot2::aes(intercept = slise$epsilon, slope = 1, col = "Subset"), linetype = "dashed", size = 1) +
-        ggplot2::geom_abline(ggplot2::aes(intercept = -slise$epsilon, slope = 1, col = "Subset"), linetype = "dashed", size = 1) +
-        ggplot2::theme_light() + ggplot2::theme(aspect.ratio = 1) +
-        ggplot2::scale_fill_manual(guide = ggplot2::guide_legend(title = NULL), values = c("Explained" = "black")) +
-        ggplot2::scale_color_manual(guide = ggplot2::guide_legend(title = NULL),
-            breaks = c("Subset", "In Lineup", "Dense Areas"),
-            values = c("Subset" = "#1b9e77", "In Lineup" = "#d95f02", "Dense Areas" = "#7570b3"))
-
-    if (!is.null(lineup)) {
-        sels <- lineup$selection[lineup$selection != -1]
-        plt <- plt +
-            ggplot2::geom_tile(ggplot2::aes(Y_slise[sels], Y_black_box[sels], width = scatter_size * 2, height = scatter_size * 2, col = "In Lineup"), fill = "white", size = 1) +
-            lapply(sels, function(i) {
-                im <- matrix(X_mask[i,], width, height)
-                g <- grid::rasterGrob(im, name=-i)
-                ggplot2::annotation_custom(g, xmin = Y_slise[[i]] - scatter_size, xmax = Y_slise[[i]] + scatter_size,
-                    ymin = Y_black_box[[i]] - scatter_size, ymax = Y_black_box[[i]] + scatter_size)
-            })
-        if (length(sels) < length(lineup$selection)) {
-            plt <- plt + ggplot2::geom_tile(ggplot2::aes(scaled_y, scaled_y, width = scatter_size * 2.4, height = scatter_size * 2.4, col = "In Lineup"), fill = "white", size = 1)
-        }
-    }
-
-    plt <- plt + lapply(1, function(i) {
-        im <- matrix(slise$x * 0.5 + 0.5, width, height)
-        g <- grid::rasterGrob(im, name=0)
-        ggplot2::annotation_custom(g, xmin = scaled_y - scatter_size * 1.2, xmax = scaled_y + scatter_size * 1.2,
-            ymin = scaled_y - scatter_size * 1.2, ymax = scaled_y + scatter_size * 1.2)
-    })
-
-    if (!is.null(slise$logit) && slise$logit) {
-        if (logits) {
-            plt + ggplot2::xlab("SLISE Logits") + ggplot2::ylab("Classifier Logits") +
-                ggplot2::scale_x_continuous(breaks = seq(-0.5, 0.5, 0.2), labels = function(x) round(logit(slise$scaled$unscale_y(x)), 2)) +
-                ggplot2::scale_y_continuous(breaks = seq(-0.5, 0.5, 0.2), labels = function(x) round(logit(slise$scaled$unscale_y(x)), 2))
-        } else {
-            plt + ggplot2::xlab("SLISE Approximation") + ggplot2::ylab("Classifier Prediction") +
-                ggplot2::scale_x_continuous(breaks = seq(-0.5, 0.5, 0.2), labels = function(x) round(slise$scaled$unscale_y(x), 2)) +
-                ggplot2::scale_y_continuous(breaks = seq(-0.5, 0.5, 0.2), labels = function(x) round(slise$scaled$unscale_y(x), 2))
-        }
-    } else {
-        plt + ggplot2::xlab("SLISE Approximation") + ggplot2::ylab("Classifier Prediction") +
-            ggplot2::scale_x_continuous(limits = c(0, 1)) + ggplot2::scale_y_continuous(limits = c(0, 1))
-    }
-}
-
-# Add theming to image plots
-explain_img_slise_theme <- function(plt, rotate = FALSE, flip_x = FALSE, flip_y = TRUE, aspect = 1, legend = "none") {
-    plt <- plt + ggplot2::theme_light() +
-        ggplot2::theme(legend.position = legend,
-            axis.title.y = ggplot2::element_blank(), axis.text.y = ggplot2::element_blank(), axis.ticks.y = ggplot2::element_blank(),
-            axis.title.x = ggplot2::element_blank(), axis.text.x = ggplot2::element_blank(), axis.ticks.x = ggplot2::element_blank(),
-            aspect.ratio = aspect)
-    if (rotate)
-        plt <- plt + ggplot2::coord_flip()
-    if (flip_x)
-        plt <- plt + ggplot2::scale_x_reverse(expand = c(0, 0))
-    else
-        plt <- plt + ggplot2::scale_x_continuous(expand = c(0, 0))
-    if (flip_y)
-        plt <- plt + ggplot2::scale_y_reverse(expand = c(0, 0))
-    else
-        plt <- plt + ggplot2::scale_y_continuous(expand = c(0, 0))
-    plt
-}
-
-# Create the subset lineup
-explain_slise_get_lineup <- function(slise, num_examples = 6, include_explained = TRUE, logits = FALSE) {
-    inter <- 1 / num_examples
-    if (logits) {
-        ys <- stats::quantile(slise$scaled$Y[slise$subset], c(inter / 2, 1 - inter / 2))
-        ys <- seq(ys[1], ys[2], (ys[2] - ys[1]) / (num_examples - 1))
-        sels <- which(slise$subset)[sapply(ys, function(y) which.min(abs(slise$scaled$Y[slise$subset] - y)))]
-    } else {
-        ys <- seq(inter / 2, 1 - inter / 2, inter)
-        sels <- which(slise$subset)[sapply(ys, function(y) which.min(abs(slise$Y[slise$subset] - y)))]
-    }
-    imgs <- slise$X[sels,]
-    probs <- slise$Y[sels]
-    if (include_explained) {
-        close <- which.min((ys - slise$scaled$scale_y(slise$y)) ^ 2)
-        imgs[close,] <- slise$x
-        probs[close] <- slise$y
-        sels[close] <- -1
-    }
-    probs <- round(probs, 3)
-    list(probabilities = probs, images = imgs, selection = sels)
-}
-
-# Randomly select samples for the scatter, but avoid overlapping
-# x,y: vector
-# w,h: width,height
-# o: order (index)
-# num: amount
-explain_slise_select_overlap <- function(x, y, w, h, o, num = 50) {
-    sel <- list()
-    for (i in o) {
-        add <- TRUE
-        for (j in sel) {
-            if (abs(x[i] - x[j]) < w && abs(y[i] - y[j]) < h) {
-                add <- FALSE
-                break()
-            }
-        }
-        if (add) {
-            sel[length(sel) + 1] <- i
-            if (length(sel) >= num)
-                break()
-        }
-    }
-    unlist(sel)
-}
\ No newline at end of file
diff --git a/R/initialisation.R b/R/initialisation.R
new file mode 100644
index 0000000..1a17710
--- /dev/null
+++ b/R/initialisation.R
@@ -0,0 +1,260 @@
+
+
+#' OLS solver that falls back to an optimisation if ncol(X) is huge
+#' Also supports LASSO via optimisation
+#'
+#' @param X data matrix
+#' @param Y response vector
+#' @param weight weight vector (default: NULL)
+#' @param lambda LASSO regularisation (default: 0)
+#' @param max_iterations if ncol(X) is huge, then ols is replaced with optimisation (default: 300)
+#'
+#' @return coefficient vector
+#'
+#' @importFrom stats lm.wfit
+#' @importFrom stats .lm.fit
+#'
+fast_ols <- function(X, Y, weight = NULL, lambda = 0, max_iterations = 300) {
+    # If the number of dimensions is very large, don't use the exact OLS solver
+    if (lambda > 0 || ncol(X) > max_iterations * 20) {
+        # 20 comes from the number of linesearch steps in lbfgs
+        if (is.null(weight)) {
+            loss <- function(alpha) sum((X %*% alpha - Y)^2) / 2
+            grad <- function(alpha) colSums(c(X %*% alpha - Y) * X)
+        } else {
+            loss <- function(alpha) sum((X %*% alpha - Y)^2 * weight) / 2
+            grad <- function(alpha) colSums((c(X %*% alpha - Y) * weight) * X)
+        }
+        lbfgs::lbfgs(
+            loss,
+            grad,
+            rep(0, ncol(X)),
+            invisible = TRUE,
+            max_iterations = max_iterations,
+            orthantwise_c = lambda
+        )$par
+    } else if (is.null(weight)) {
+        .lm.fit(X, Y)$coefficients
+    } else {
+        lm.wfit(X, Y, weight)$coefficients
+    }
+}
+
+
+#' Initialise the graduated optimisation with a LASSO solution
+#'
+#' @param X data matrix
+#' @param Y response vector
+#' @param weight weight vector (default: NULL)
+#' @param lambda1 L1 regularisation (default: 0)
+#' @param max_iterations if ncol(X) is huge, then ols is replaced with optimisation (default: 300)
+#' @param ... unused parameters
+#'
+#' @return list(alpha, beta)
+#' @export
+#'
+slise_initialisation_lasso <- function(X, Y, weight = NULL, lambda1 = 0, max_iterations = 300, ...) {
+    return(list(alpha = fast_ols(X, Y, weight, lambda1, max_iterations), beta = 0))
+}
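A sketch of the exported LASSO initialisation in use (the returned beta is always 0, so the graduated optimisation starts from the convex end):

    # Sketch: LASSO-based starting point for the graduated optimisation
    X <- matrix(rnorm(300), 100, 3)
    Y <- rnorm(100)
    init <- slise_initialisation_lasso(X, Y, lambda1 = 0.1)
    init$alpha  # starting linear model
    init$beta   # 0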
+#' Initialise the graduated optimisation with an "Ordinary Least Squares" solution
+#'
+#' @param X data matrix
+#' @param Y response vector
+#' @param epsilon error tolerance
+#' @param weight weight vector (default: NULL)
+#' @param beta_max the maximum starting sigmoid steepness (default: 20/epsilon^2)
+#' @param max_approx the target approximation ratio (default: 1.15)
+#' @param max_iterations if ncol(X) is huge, then ols is replaced with optimisation (default: 300)
+#' @param beta_max_init the maximum sigmoid steepness in the initialisation
+#' @param ... unused parameters
+#'
+#' @return list(alpha, beta)
+#' @export
+#'
+slise_initialisation_ols <- function(X,
+                                     Y,
+                                     epsilon,
+                                     weight = NULL,
+                                     beta_max = 20 / epsilon^2,
+                                     max_approx = 1.15,
+                                     max_iterations = 300,
+                                     beta_max_init = 2.5 / epsilon^2,
+                                     ...) {
+    beta_max <- min(beta_max_init, beta_max)
+    alpha <- fast_ols(X, Y, weight, 0, max_iterations)
+    res <- (Y - X %*% alpha)^2
+    beta <- next_beta(res, epsilon^2, 0, weight, beta_max, log(max_approx))
+    return(list(alpha = alpha, beta = beta))
+}
+
+
+#' Initialise the graduated optimisation with a zero-vector
+#'
+#' @param X data matrix
+#' @param Y response vector
+#' @param epsilon error tolerance
+#' @param weight weight vector (default: NULL)
+#' @param beta_max the maximum starting sigmoid steepness (default: 20/epsilon^2)
+#' @param max_approx the target approximation ratio (default: 1.15)
+#' @param beta_max_init the maximum sigmoid steepness in the initialisation
+#' @param ... unused parameters
+#'
+#' @return list(alpha, beta)
+#' @export
+#'
+slise_initialisation_zeros <- function(X,
+                                       Y,
+                                       epsilon,
+                                       weight = NULL,
+                                       beta_max = 20 / epsilon^2,
+                                       max_approx = 1.15,
+                                       beta_max_init = 2.5 / epsilon^2,
+                                       ...) {
+    beta_max <- min(beta_max_init, beta_max)
+    alpha <- c(rep(0, ncol(X)))
+    beta <- next_beta(Y^2, epsilon^2, 0, weight, beta_max, log(max_approx))
+    return(list(alpha = alpha, beta = beta))
+}
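The OLS variant additionally picks a starting beta via next_beta (the reworked next_beta is defined elsewhere in this diff); a hedged sketch, assuming the package is loaded:

    # Sketch: OLS starting point plus a matching starting sigmoid steepness
    X <- matrix(rnorm(300), 100, 3)
    Y <- c(X %*% c(1, -1, 0)) + rnorm(100, sd = 0.1)
    init <- slise_initialisation_ols(X, Y, epsilon = 0.1)
    length(init$alpha)  # 3 coefficients
    init$beta           # between 0 and beta_max_init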
+
+
+# Create a candidate for slise_initialisation_candidates
+.create_candidate <- function(X, Y, weight = NULL, pca_treshold = 10, max_iterations = 300) {
+    if (ncol(X) <= pca_treshold) {
+        subset <- sample.int(nrow(X), max(3, ncol(X) + 1), FALSE, weight)
+        fast_ols(X[subset, , drop = FALSE], Y[subset])
+    } else {
+        subset <- sample.int(nrow(X), pca_treshold + 1, FALSE, weight)
+        X <- X[subset, , drop = FALSE]
+        pca <- simple_pca(X, pca_treshold)
+        pca %*% fast_ols(X %*% pca, Y[subset], max_iterations = max_iterations)
+    }
+}
+
+#' Initialise the graduated optimisation by sampling candidates
+#'
+#' The procedure starts with creating num_init subsets of size d.
+#' For each subset a linear model is fitted and the model that has
+#' the smallest loss is selected.
+#'
+#' The chance that one of these subsets contains only "clean" data is:
+#' $$ 1-(1-(1-noise_fraction)^d)^num_init $$
+#' This means that high-dimensional data (large d) can cause issues,
+#' which is solved by using PCA (allows for sampling smaller subsets
+#' than d).
+#'
+#' @param X data matrix
+#' @param Y response vector
+#' @param epsilon error tolerance
+#' @param weight weight vector (default: NULL)
+#' @param beta_max the maximum sigmoid steepness (default: 20/epsilon^2)
+#' @param num_init the number of initial subsets to generate (default: 500)
+#' @param max_approx the target approximation ratio (default: 1.15)
+#' @param beta_max_init the maximum sigmoid steepness in the initialisation
+#' @param pca_treshold the maximum number of columns without using PCA (default: 10)
+#' @param max_iterations if ncol(X) is huge, then ols is replaced with optimisation (default: 300)
+#' @param ... unused parameters
+#'
+#' @return list(alpha, beta)
+#' @export
+#'
+slise_initialisation_candidates <- function(X,
+                                            Y,
+                                            epsilon,
+                                            weight = NULL,
+                                            beta_max = 20 / epsilon^2,
+                                            max_approx = 1.15,
+                                            num_init = 500,
+                                            beta_max_init = 2.5 / epsilon^2,
+                                            pca_treshold = 10,
+                                            max_iterations = 300,
+                                            ...) {
+    beta_max <- min(beta_max_init, beta_max)
+    max_approx <- log(max_approx)
+    epsilon <- epsilon^2
+    # Initial model (zeros)
+    alpha <- c(rep(0, ncol(X)))
+    beta <- next_beta(Y^2, epsilon, 0, weight, beta_max, max_approx)
+    loss <- loss_smooth_res(alpha, Y^2, epsilon, beta, 0.0, 0.0)
+    # Find the candidate with the best loss for the next_beta
+    for (i in 2:num_init) {
+        model <- .create_candidate(X, Y, weight = NULL, pca_treshold, max_iterations)
+        residuals2 <- (Y - X %*% model)^2
+        loss2 <- loss_smooth_res(model, residuals2, epsilon, beta, 0, 0)
+        if (loss2 < loss) {
+            alpha <- model
+            beta <- next_beta(residuals2, epsilon, 0, weight, beta_max, max_approx)
+            loss <- loss_smooth_res(model, residuals2, epsilon, beta, 0, 0)
+        }
+    }
+    list(alpha = alpha, beta = beta, loss = loss)
+}
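The clean-subset probability quoted in the docstring can be checked numerically; this illustrates why large d needs the PCA projection (values are illustrative):

    # Sketch: chance that at least one sampled subset contains no outliers
    p_clean <- function(noise_fraction, d, num_init) {
        1 - (1 - (1 - noise_fraction)^d)^num_init
    }
    p_clean(0.3, 4, 500)   # ~1, low-dimensional data is unproblematic
    p_clean(0.3, 50, 500)  # ~0, hence projecting down to pca_treshold dimensions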
+
+# Create a candidate for slise_initialisation_candidates2
+.create_candidate2 <- function(X, Y, weight = NULL, max_iterations = 300) {
+    subset <- sample.int(nrow(X), 3, FALSE, weight)
+    X <- X[subset, , drop = FALSE]
+    Y <- Y[subset]
+    fast_ols(X, Y, NULL, .Machine$double.eps * 2, max_iterations)
+}
+
+#' Initialise the graduated optimisation by sampling candidates
+#'
+#' The procedure starts with creating num_init subsets of size 3
+#' (rather than d). For each subset a linear model is fitted and
+#' the model that has the smallest loss is selected.
+#'
+#' The chance that one of these subsets contains only "clean" data is:
+#' $$ 1-(1-(1-noise_fraction)^d)^num_init $$
+#' This means that high-dimensional data (large d) can cause issues,
+#' which is solved by using LASSO-regularisation (which enables fitting
+#' of linear models with smaller subsets than d).
+#'
+#' @param X data matrix
+#' @param Y response vector
+#' @param epsilon error tolerance
+#' @param weight weight vector (default: NULL)
+#' @param beta_max the maximum sigmoid steepness (default: 20/epsilon^2)
+#' @param num_init the number of initial subsets to generate (default: 500)
+#' @param max_approx the target approximation ratio (default: 1.15)
+#' @param beta_max_init the maximum sigmoid steepness in the initialisation
+#' @param max_iterations if ncol(X) is huge, then ols is replaced with optimisation (default: 300)
+#' @param ... unused parameters
+#'
+#' @return list(alpha, beta)
+#' @export
+#'
+slise_initialisation_candidates2 <- function(X,
+                                             Y,
+                                             epsilon,
+                                             weight = NULL,
+                                             beta_max = 20 / epsilon^2,
+                                             max_approx = 1.15,
+                                             num_init = 500,
+                                             beta_max_init = 2.5 / epsilon^2,
+                                             max_iterations = 300,
+                                             ...) {
+    beta_max <- min(beta_max_init, beta_max)
+    max_approx <- log(max_approx)
+    epsilon <- epsilon^2
+    # Initial model (zeros)
+    alpha <- c(rep(0, ncol(X)))
+    beta <- next_beta(Y^2, epsilon, 0, weight, beta_max, max_approx)
+    loss <- loss_smooth_res(alpha, Y^2, epsilon, beta, 0.0, 0.0)
+    # Find the candidate with the best loss for the next_beta
+    for (i in 2:num_init) {
+        model <- .create_candidate2(X, Y, weight = NULL, max_iterations)
+        residuals2 <- (Y - X %*% model)^2
+        loss2 <- loss_smooth_res(model, residuals2, epsilon, beta, 0, 0)
+        if (loss2 < loss) {
+            alpha <- model
+            beta <- next_beta(residuals2, epsilon, 0, weight, beta_max, max_approx)
+            loss <- loss_smooth_res(model, residuals2, epsilon, beta, 0, 0)
+        }
+    }
+    list(alpha = alpha, beta = beta, loss = loss)
+}
\ No newline at end of file
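A sketch of this cheaper variant in use (it always samples three points and relies on a tiny L1 penalty instead of the PCA projection):

    # Sketch: candidate-based initialisation, LASSO flavour
    X <- matrix(rnorm(500), 100, 5)
    Y <- c(X %*% rnorm(5)) + rnorm(100, sd = 0.1)
    init <- slise_initialisation_candidates2(X, Y, epsilon = 0.1)
    str(init)  # list(alpha, beta, loss)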
diff --git a/R/main.R b/R/main.R
deleted file mode 100644
index 9f052ce..0000000
--- a/R/main.R
+++ /dev/null
@@ -1,288 +0,0 @@
-# This script contains the SLISE functions (slise.fit and slise.explain)
-
-#' SLISE Regression
-#' Use SLISE for robust regression.
-#'
-#' @param X matrix of independent variables
-#' @param Y vector of the response variable
-#' @param epsilon error tolerance (will be scaled to represent a percentage, e.g. 0.1 == 10\%)
-#' @param lambda sparsity reguraliser
-#' @param ... other parameters to the optimiser
-#' @param scale Scale X by mean and standard deviation (FALSE)
-#' @param logit Should Y be logit-transformed (recommended for probabilities) (FALSE)
-#' @param intercept Should an intercept be added (TRUE)
-#' @param max_approx Target approximation ratio for selecting graduated optimisation step size (1.2)
-#' @param beta_max Stopping sigmoid steepness (25)
-#' @param beta_start_max Maximum beta-step during the initialisation (1.0)
-#' @param max_iterations Maximum number of OWL-QN steps per graduated optimisation step (250)
-#' @param scale_y Scales Y to roughly be in [-0.5, 0.5] (based on 95th and 5th quantile if not in [0, 1]) (TRUE)
-#'
-#' @return slise object (coefficients, subset, value, X, Y, lambda, epsilon, scaled, alpha)
-#' @export
-#'
-#' @examples
-#' X <- matrix(rnorm(200), 100, 2)
-#' Y <- rnorm(100)
-#' model <- slise.fit(X, Y)
-#' prediction <- predict(model, X)
-slise.fit <- function(X, Y, epsilon = 0.1, lambda = 0, ..., scale = FALSE, logit = FALSE, intercept = TRUE, scale_y = TRUE) {
-    # Setup
-    matprod_default <- options(matprod = "blas") # Use faster math
-    X <- as.matrix(X)
-    data <- data_preprocess(X, Y, scale = scale, intercept = intercept, logit_tr = logit, scale_y = scale_y)
-    # Initialisation
-    alpha <- stats::.lm.fit(data$X, data$Y)$coefficients
-    beta <- 0
-    # Optimisation
-    alpha <- graduated_optimisation(alpha, data$X, data$Y, epsilon = epsilon, lambda = lambda, ...)$par
-    # Output
-    out <- create_slise(alpha, X, Y, epsilon, lambda, data, NULL, NULL)
-    options(matprod_default) # Reset options
-    out
-}
-
-#' SLISE Regression
-#' The raw interface for SLISE, initialisation and scaling
-#' (including locality) has to be done in advance.
-#' This function essentially wraps graduated_optimisation
-#' and create_slise.
-#'
-#' @param X matrix of independent variables
-#' @param Y vector of the response variable
-#' @param alpha Starting alpha
-#' @param epsilon error tolerance (will be scaled to represent a percentage, e.g. 0.1 == 10\%)
-#' @param lambda sparsity reguraliser
-#' @param beta Starting sigmoid steepness
-#' @param ... other parameters to the optimiser
-#' @param max_approx Target approximation ratio for selecting graduated optimisation step size (1.2)
-#' @param beta_max Stopping sigmoid steepness (25)
-#' @param max_iterations Maximum number of OWL-QN steps per graduated optimisation step (250)
-#'
-#' @return slise object (coefficients, subset, value, X, Y, lambda, epsilon, scaled, alpha)
-#' @export
-#'
-#' @examples
-#' X <- matrix(rnorm(200), 100, 2)
-#' Y <- rnorm(100)
-#' model <- slise.raw(X, Y)
-slise.raw <- function(X, Y, alpha = rep(0, ncol(X)), epsilon = 0.1, lambda = 0, beta = 0, ...) {
-    # Setup
-    matprod_default <- options(matprod = "blas") # Use faster math
-    X <- as.matrix(X)
-    # Optimisation
-    alpha <- graduated_optimisation(alpha, X, Y, epsilon = epsilon, lambda = lambda, ...)$par
-    # Output
-    out <- create_slise(alpha, X, Y, epsilon, lambda, data_identity(X, Y), NULL, NULL)
-    options(matprod_default) # Reset options
-    out
-}
-
-#' SLISE Black Box Explainer
-#' Use SLISE for explaining predictions made by a black box.
-#'
-#' @param X matrix of independent variables
-#' @param Y vector of the dependent variable
-#' @param x the sample to be explained (or index if y is null)
-#' @param y the prediction to be explained
-#' @param epsilon error tolerance (will be scaled to represent a percentage, e.g. 0.1 == 10\%)
-#' @param lambda sparsity reguraliser
-#' @param ... other parameters to the optimiser
-#' @param scale Scale X by mean and standard deviation (FALSE)
-#' @param logit Should Y be logit-transformed (recommended for probabilities) (FALSE)
-#' @param max_approx Target approximation ratio for selecting graduated optimisation step size (1.2)
-#' @param beta_max Stopping sigmoid steepness (25)
-#' @param beta_start_max Maximum beta-step during the initialisation (1.0)
-#' @param max_iterations Maximum number of OWL-QN steps per graduated optimisation step (250)
-#' @param scale_y Scales Y to roughly be in [-0.5, 0.5] (based on 95th and 5th quantile if not in [0, 1]) (TRUE)
-#'
-#' @return slise object (coefficients, subset, value, X, Y, lambda, epsilon, scaled, alpha, x, y)
-#' @export
-#'
-#' @examples
-#' X <- matrix(rnorm(200), 100, 2)
-#' Y <- rnorm(100)
-#' index <- 10
-#' model <- slise.explain(X, Y, index)
-slise.explain <- function(X, Y, x, y = NULL, epsilon = 0.1, lambda = 0, ..., scale = FALSE, logit = FALSE, scale_y = TRUE) {
-    # Setup
-    matprod_default <- options(matprod = "blas") # Use faster math
-    X <- as.matrix(X)
-    if (all(is.null(y))) {
-        y <- Y[[x]]
-        x <- X[x, ]
-    }
-    data <- data_preprocess(X, Y, scale = scale, intercept = FALSE, logit_tr = logit, scale_y = scale_y)
-    xs <- data$scale_x(x)
-    local <- data_local(data$X, data$Y, xs, data$scale_y(y))
-    # Initialisation
-    alpha <- stats::.lm.fit(local$X, local$Y)$coefficients
-    beta <- 0
-    # Optimisation
-    alpha <- graduated_optimisation(alpha, local$X, local$Y, epsilon = epsilon, lambda = lambda, beta=beta, ...)$par
-    # Output
-    out <- create_slise(local$unscale_alpha(alpha), X, Y, epsilon, lambda, data, x = x, y = y, logit = logit)
-    options(matprod_default) # Reset options
-    out
-}
-
-#' SLISE Black Box Explainer
-#' Use SLISE for explaining predictions made by a black box.
-#' BUT with a binary search for sparsity!
-#'
-#' @param ... parameters to slise.explain
-#' @param lambda the starting value of the search
-#' @param variables number of non-zero coefficients
-#' @param iters number of search iterations
-#' @param treshold treshold for zero coefficient
-#'
-#' @return SLISE object
-#' @export
-#'
-#' @examples
-#' X <- matrix(rnorm(800), 100, 8)
-#' Y <- rnorm(100)
-#' index <- 10
-#' model <- slise.explain_find(X, Y, index, variables = 4)
-slise.explain_find <- function(..., lambda = 5, variables = 4, iters = 10, treshold = 1e-4) {
-    lower <- 0
-    upper <- -1
-    upper_best <- NULL
-    lower_best <- NULL
-    for (j in 1:iters) {
-        slise <- slise.explain(lambda = lambda, ...)
-        s <- sparsity(slise$alpha[-1], treshold)
-        if (s > variables) {
-            lower_best <- slise
-            lower <- lambda
-        } else {
-            upper <- lambda
-            upper_best <- slise
-        }
-        if (upper < 0)
-            lambda <- lambda * 2
-        else
-            lambda <- (upper + lower) * 0.5
-    }
-    if (!is.null(upper_best) && sparsity(upper_best$alpha[-1], treshold) == variables)
-        upper_best
-    else if (is.null(lower_best))
-        slise <- slise.explain(lambda = lower, ...)
-    else
-        lower_best
-}
-
-#' SLISE Black Box Explainer
-#' Use SLISE for explaining predictions made by a black box.
-#' BUT with sparsity from a combinatorial search rather than Lasso!
-#'
-#' @param X matrix of independent variables
-#' @param Y vector of the dependent variable
-#' @param x the sample to be explained (or index if y is null)
-#' @param y the prediction to be explained
-#' @param ... other parameters to slise.explain
-#' @param variables the number of non-zero coefficients
-#'
-#' @return SLISE object
-#' @export
-#'
-#' @examples
-#' X <- matrix(rnorm(400), 100, 4)
-#' Y <- rnorm(100)
-#' index <- 10
-#' model <- slise.explain_comb(X, Y, index, variables = 2)
-slise.explain_comb <- function(X, Y, x, y=NULL, ..., variables = 4) {
-    len <- ncol(X)
-    combs <- factorial(len) / factorial(variables) / factorial(len - variables)
-    if (combs >= 30)
-        warning(sprintf("The combinatorial search will take a long time (requires %d iterations)", combs))
-    if (all(is.null(y))) {
-        y <- Y[[x]]
-        x <- X[x, ]
-    }
-    res <- utils::combn(1:len, variables, function(s) {
-        X2 <- X
-        for (i in (1:len)[-s])
-            X2[, i] <- x[i]
-        slise.explain(X2, Y, x, y, ...)
-    }, simplify = FALSE)
-    expl <- res[[which.min(sapply(res, function(r) r$value))]]
-    expl$X <- X
-    expl
-}
-
-#' Create a result object for SLISE that is similar to other regression method results
-#'
-#' @param alpha linear model
-#' @param X data matrix
-#' @param Y response vector
-#' @param epsilon error tolerance
-#' @param lambda L1 weight
-#' @param data data_preprocess(X, Y)
-#' @param ... other variables to add to the SLISE object
-#'
-#' @return list(coefficients=unscale(alpha), X, Y, scaled=data, lambda, alpha, subset=[r_i<eps], value=loss, epsilon, ...)
-#'
-create_slise <- function(alpha, X, Y, epsilon, lambda, data, ...) {
-    if (is.null(data)) {
-        coeff <- alpha
-        if (length(alpha) > ncol(X))
-            dist <- (c(X %*% alpha[-1] + alpha[[1]]) - Y) ^ 2
-        else
-            dist <- (c(X %*% alpha) - Y) ^ 2
-    } else {
-        coeff <- data$unscale_alpha(alpha)
-        if (length(alpha) > ncol(data$X))
-            dist <- (c(data$X %*% alpha[-1] + alpha[[1]]) - data$Y) ^ 2
-        else
-            dist <- (c(data$X %*% alpha) - data$Y) ^ 2
-    }
-    if (is.null(colnames(X))) {
-        nams <- paste(1:ncol(X))
-    } else {
-        nams <- colnames(X)
-    }
-    if (length(coeff) > ncol(X)) {
-        names(coeff) <- c("Intercept", nams)
-    } else {
-        names(coeff) <- nams
-    }
-    mask <- dist <= epsilon ^ 2
-    loss <- sum(mask * (dist / nrow(X) - epsilon ^ 2)) + lambda * sum(abs(alpha))
-    structure(list(coefficients = coeff, X = X, Y = Y, scaled = data,
-        lambda = lambda, alpha = alpha, subset = mask, value = loss,
-        epsilon = epsilon, loss = loss, ...), class = "slise")
-}
-
-#' Predict with a SLISE
-#'
-#' @param object SLISE object
-#' @param newdata data matrix
-#' @param ... not used
-#'
-#' @return prediction vector
-#' @export
-#'
-#' @examples
-#' X <- matrix(rnorm(200), 100, 2)
-#' Y <- rnorm(100)
-#' index <- 10
-#' model <- slise.explain(X, Y, index)
-#' prediction <- predict(model, X)
-predict.slise <- function(object, newdata = NULL, ...) {
-    if (is.null(newdata)) {
-        newdata <- object$scaled$X
-    } else {
-        newdata <- as.matrix(newdata)
-        newdata <- object$scaled$scale_x(newdata)
-        if (length(newdata) <= length(object$alpha))
-            dim(newdata) <- c(1, length(newdata))
-    }
-    if (ncol(newdata) == length(object$alpha)) {
-        y <- newdata %*% object$alpha
-    } else if (ncol(newdata) == length(object$alpha) - 1) {
-        y <- newdata %*% object$alpha[-1] + object$alpha[[1]]
-    } else {
-        stop("Wrong number of columns in data")
-    }
-    object$scaled$unscale_y(c(y))
-}
distances <- ifelse(f < 0, distances, 0) #phi(x) ~ pmin(0, x) - - if (lambda > 0) - (t(distances * X) %*% ((s * k1) + (f * k2))) + lambda * sign(alpha) - else - (t(distances * X) %*% ((s * k1) + (f * k2))) -} - - -#' Sharp Loss Function -#' -#' @param alpha The vector to calculate loss for -#' @param X The data matrix -#' @param Y The response vector -#' @param epsilon (Optional) The acceptable error -#' @param lambda (Optional) The sparsity reguraliser -#' -#' @return The loss value -#' -loss_sharp <- function(alpha, X, Y, epsilon = 0.1, lambda = 0) { - epsilon <- epsilon ^ 2 - distances <- (X %*% alpha - Y) ^ 2 - mask <- distances <= epsilon - subsize_loss <- sum(mask) * epsilon - regression_loss <- sum(distances[mask]) / length(Y) - - if (lambda > 0) - -subsize_loss + regression_loss + lambda * sum(abs(alpha)) - else - -subsize_loss + regression_loss -} - -#' OWL-QN for optimising loss_smooth (Cpp implementation) -#' -#' @param alpha linear model to optimise -#' @param X data matrix -#' @param Y response vector -#' @param epsilon error tolerance -#' @param lambda L1 coefficient -#' @param beta sigmoid steepness -#' @param max_iterations number of OWL-QN iterations -#' @param ... other parameters to OWL-QN -#' -#' @return lbfgs object -#' -owlqn_c <- function(alpha, X, Y, epsilon = 0.1, lambda = 0, beta = 3, max_iterations = 250, ...) { - dc <- methods::new(DataContainer, data = X, response = Y, beta = beta, epsilon = epsilon, lambda = 0) - lbfgs::lbfgs(loss_smooth_c_ptr(), loss_smooth_grad_c_ptr(), alpha, dc$.pointer, ..., - max_iterations = max_iterations, invisible = TRUE, orthantwise_c = lambda) -} - -#' OWL-QN for optimising loss_smooth (R implementation) -#' -#' @param alpha linear model to optimise -#' @param X data matrix -#' @param Y response vector -#' @param epsilon error tolerance -#' @param lambda L1 coefficient -#' @param beta sigmoid steepness -#' @param max_iterations number of OWL-QN iterations -#' @param ... other parameters to OWL-QN -#' -#' @return lbfgs object -#' -owlqn_r <- function(alpha, X, Y, epsilon = 0.1, lambda = 0, beta = 3, max_iterations = 250, ...) { - fn <- function(alpha) loss_smooth(alpha, X, Y, epsilon, 0, beta) - gd <- function(alpha) loss_smooth_grad(alpha, X, Y, epsilon, 0, beta) - lbfgs::lbfgs(fn, gd, alpha, ..., max_iterations = max_iterations, invisible = TRUE, orthantwise_c = lambda) -} - -#' Calculate the Logarithm of the approximation ratio -#' (logarithms are used for numerically stable calculations) -#' See Theorem 3 from the paper for more details -#' -#' @param residuals squared residuals -#' @param epsilon error tolerance -#' @param beta1 current sigmoid steepness -#' @param beta2 next sigmoid steepness -#' -#' @return log(approximation_ratio) -#' -log_approximation_ratio <- function(residuals, epsilon, beta1, beta2) { - if (beta1 >= beta2) return(0) - epsilon <- epsilon ^ 2 - beta1 <- beta1 / epsilon - beta2 <- beta2 / epsilon - ## log(f(r, beta)), assuming squared r, the phi is calculated separately - lf <- function(r, beta) log_sigmoid(beta * (epsilon - r)) - phi <- pmax(0, epsilon - residuals / length(residuals)) # = -pmin(0, r^2/n-e^2) - ## derivative of log(f(r, beta1)/f(r, beta2)) - lg <- function(r) - beta1 * dlog_sigmoid(beta1 * (epsilon - r)) + beta2 * dlog_sigmoid(beta2 * (epsilon - r)) - # Calculate log(k) (see Thm. 
3) - g.zero <- lg(0) - if (g.zero < 0) { - ## If derivative at r=0 is negative the minimum is within r>0 and r<epsilon (since lg(epsilon)>0) - a <- stats::uniroot(lg, lower = 0, upper = epsilon, f.lower = g.zero)$root - lK <- min(lf(0, beta1) - lf(0, beta2), lf(a, beta1) - lf(a, beta2)) - } else { - ## If derivative at r=0 is positive the function has minimum at r<0 and hence the minimum - ## can be found at r=0 (negative values of the squared residual are not allowed). - lK <- lf(0, beta1) - lf(0, beta2) - } - # Calculate log(K) (see Thm. 3) - if (sum(phi) < 0) { - lphi <- log(phi) - log_sum(lf(residuals, beta1) + lphi) - lK - log_sum(lf(residuals, beta2) + lphi) - } else { - # phi is constant (0) and can be removed from the division - log_sum(lf(residuals, beta1)) - lK - log_sum(lf(residuals, beta2)) - } -} - -#' Find the matching *epsilon -#' -#' @param residuals squared residuals -#' @param epsilon error tolerance -#' @param beta sigmoid steepness -#' -#' @return *epsilon -#' -matching_epsilon <- function(residuals, epsilon, beta) { - epsilon <- epsilon ^ 2 - residuals <- sort(residuals) - loss <- sigmoid(beta / epsilon * (epsilon - residuals)) - i <- which.max(seq_along(residuals) * loss) - sqrt(residuals[i]) -} - -#' Print debug statement for how the graduated optimisation is going -#' -#' @param alpha linear model -#' @param X data matrix -#' @param Y response vector -#' @param epsilon error tolerance -#' @param lambda L1 weight -#' @param beta current sigmoid steepness -#' @param beta_max max sigmoid steepness -#' -grad_opt_debug <- function(alpha, X, Y, epsilon, lambda, beta, beta_max) { - residuals <- c(Y - X %*% alpha)^2 - approx <- exp(log_approximation_ratio(residuals, epsilon, beta, beta_max)) - m_epsilon <- matching_epsilon(residuals, epsilon, beta) - loss_sm <- loss_smooth(alpha, X, Y, epsilon, lambda, beta) - loss_sh <- loss_smooth(alpha, X, Y, epsilon, lambda, beta_max) - loss_ha <- loss_sharp(alpha, X, Y, epsilon, lambda) - cat(sprintf("Graduated Optimisation: beta = %6.3f eps* = %.3f approx = %5.3f Ls = %g Lh = %g L = %g\n", - beta, m_epsilon, approx, loss_sm, loss_sh, loss_ha)) -} - - -#' Find the next beta according to: -#' ¤ approximation_ratio(alpha, beta_old, beta_new) == max_approx -#' ¤ beta_new >= beta_old + min_increase -#' -#' @param alpha linear model -#' @param X data matrix -#' @param Y response vector -#' @param epsilon error tolerance -#' @param beta current sigmoid steepness -#' @param beta_max max sigmoid steepnsess -#' @param max_approx approximation ratio target for increasing beta -#' @param beta_min_increase minimum beta step -#' -#' @return beta_new -#' -next_beta <- function(alpha, X, Y, epsilon = 0.1, beta = 0, beta_max = 25, - max_approx = 1.2, beta_min_increase = beta_max * 0.0005) { - if (beta >= beta_max) return(beta) - residuals <- c(Y - X %*% alpha)^2 - max_approx <- log(max_approx) - log_approx <- log_approximation_ratio(residuals, epsilon, beta, beta_max) - if (log_approx <= max_approx) { - beta_max - } else { - f <- function(b) log_approximation_ratio(residuals, epsilon, beta, b) - max_approx - beta_new <- stats::uniroot(f, lower = beta, upper = beta_max, f.lower = -max_approx, - f.upper = log_approx - max_approx)$root - max(beta_new, beta + beta_min_increase) - } -} - -#' Graduated Optimisation to solve the SLISE problem -#' -#' @param alpha initial linear model (if NULL then OLS) -#' @param X data matrix -#' @param Y response vector -#' @param epsilon error tolerance -#' @param lambda L1 coefficient (0) -#' @param beta starting sigmoid steepness (0 => convex
problem) -#' @param beta_max stopping sigmoid steepness (25) -#' @param max_approx approximation ratio when selecting the next beta (1.2) -#' @param max_iterations maximum number of OWL-QN iterations (100) -#' @param debug should debug statement be printed each iteration (FALSE) -#' @param ... Additional parameters to OWL-QN -#' @param beta_min_increase the minimum increase of beta each iteration (beta_max * 0.0005) -#' @param beta_start_max Ignored -#' -#' @return lbfgs object with beta (max) and the number of iteration steps -#' -graduated_optimisation <- function(alpha, X, Y, epsilon = 0.1, lambda = 0, beta = 0, beta_max = 25, - max_approx = 1.2, max_iterations = 100, debug = FALSE, - ..., beta_min_increase = beta_max * 0.0005, beta_start_max = NULL) { - res <- list(par = if (is.null(alpha)) stats::.lm.fit(X, Y)$coefficients else alpha) - while (beta < beta_max) { - res <- owlqn_c(res$par, X, Y, epsilon, lambda, beta, max_iterations, ...) - if (debug) grad_opt_debug(res$par, X, Y, epsilon, lambda, beta, beta_max) - beta <- next_beta(res$par, X, Y, epsilon, beta, beta_max, max_approx, beta_min_increase) - } - res <- owlqn_c(res$par, X, Y, epsilon, lambda, beta, max_iterations * 2, ...) - if (debug) grad_opt_debug(res$par, X, Y, epsilon, lambda, beta, beta_max) - res -} +# This script contains the optimisations for SLISE (Graduated Optimisation and OWL-QN) + +#' Smooth Loss +#' A loss function for when you want gradients +#' +#' @param alpha The vector to calculate loss for +#' @param X The data matrix +#' @param Y The response vector +#' @param epsilon The acceptable error +#' @param beta The steepness of the sigmoid +#' @param lambda1 The L1 regulariser (default: 0) +#' @param lambda2 The L2 regulariser (default: 0) +#' @param weight weight vector (default: NULL) +#' +#' @return The loss value +#' +loss_smooth <- function(alpha, X, Y, epsilon, beta, lambda1 = 0, lambda2 = 0, weight = NULL) { + epsilon <- epsilon^2 + distances <- c(X %*% alpha - Y)^2 + subsize <- sigmoidc(beta * (epsilon - distances)) + if (is.null(weight)) { + loss <- pmin(0, distances - epsilon * length(Y)) # phi(x) ~ pmin(0, x) + loss <- sum(subsize * loss) / length(Y) + } else { + len <- sum(weight) + loss <- pmin(0, distances - epsilon * len) + loss <- sum(subsize * loss * weight) / len + } + if (lambda1 > 0) { + loss <- loss + lambda1 * sum(abs(alpha)) + } + if (lambda2 > 0) { + loss <- loss + lambda2 * sum(alpha^2) + } + loss +} + +#' Smooth Loss +#' A loss function for when you want gradients and the residuals are already calculated +#' +#' @param alpha The vector to calculate loss for +#' @param residuals2 Vector of squared residuals +#' @param epsilon2 The squared acceptable error +#' @param beta The steepness of the sigmoid +#' @param lambda1 The L1 regulariser (default: 0) +#' @param lambda2 The L2 regulariser (default: 0) +#' @param weight weight vector (default: NULL) +#' +#' @return The loss value +#' +loss_smooth_res <- function(alpha, residuals2, epsilon2, beta, lambda1 = 0, lambda2 = 0, weight = NULL) { + subsize <- sigmoidc(beta * (epsilon2 - residuals2)) + if (is.null(weight)) { + loss <- pmin(0, residuals2 - epsilon2 * length(residuals2)) # phi(x) ~ pmin(0, x) + loss <- sum(subsize * loss) / length(residuals2) + } else { + len <- sum(weight) + loss <- pmin(0, residuals2 - epsilon2 * len) + loss <- sum(subsize * loss * weight) / len + } + if (lambda1 > 0) { + loss <- loss + lambda1 * sum(abs(alpha)) + } + if (lambda2 > 0) { + loss <- loss + lambda2 * sum(alpha^2) + } + loss +} + +#' Smooth Loss 
Gradient +#' Gradient for the smooth loss function +#' +#' @param alpha The vector to calculate loss-gradient for +#' @param X The data matrix +#' @param Y The response vector +#' @param epsilon The acceptable error +#' @param beta The steepness of the sigmoid +#' @param lambda1 The L1 regulariser (default: 0) +#' @param lambda2 The L2 regulariser (default: 0) +#' @param weight weight vector (default: NULL) +#' +#' @return The gradients for alpha +#' +loss_smooth_grad <- function(alpha, X, Y, epsilon, beta, lambda1 = 0, lambda2 = 0, weight = NULL) { + epsilon <- epsilon^2 + distances <- c(X %*% alpha - Y) + distances2 <- distances^2 + len <- if (is.null(weight)) { + length(Y) + } else { + sum(weight) + } + + f <- distances2 - epsilon * len + s <- sigmoidc(beta * (epsilon - distances2)) + k1 <- 2 / len + k2 <- (-2 * beta / len) * (s - s^2) + distances <- ifelse(f < 0, distances, 0) # phi(x) ~ pmin(0, x) + + if (length(weight) > 1) { + grad <- (t((distances * weight) * X) %*% ((s * k1) + (f * k2))) + } else if (length(weight) == 1 && weight != 0) { + grad <- (t(distances * X) %*% ((s * k1) + (f * k2))) * weight + } else { + grad <- (t(distances * X) %*% ((s * k1) + (f * k2))) + } + + if (lambda1 > 0) { + grad <- grad + lambda1 * sign(alpha) + } + if (lambda2 > 0) { + grad <- grad + (lambda2 * 2) * alpha + } + grad +} + + +#' Sharp Loss Function +#' Exact loss function without gradients +#' +#' @param alpha The vector to calculate loss for +#' @param X The data matrix +#' @param Y The response vector +#' @param epsilon The acceptable error +#' @param lambda1 The L1 regulariser (default: 0) +#' @param lambda2 The L2 regulariser (default: 0) +#' @param weight weight vector (default: NULL) +#' +#' @return The loss value +#' +loss_sharp <- function(alpha, X, Y, epsilon, lambda1 = 0, lambda2 = 0, weight = NULL) { + epsilon <- epsilon^2 + distances <- (X %*% alpha - Y)^2 + subsize <- distances <= epsilon + if (is.null(weight)) { + loss <- sum(subsize * (distances - epsilon * length(Y))) / length(Y) + } else { + len <- sum(weight) + loss <- sum(subsize * (distances - epsilon * len) * weight) / len + } + if (lambda1 > 0) { + loss <- loss + lambda1 * sum(abs(alpha)) + } + if (lambda2 > 0) { + loss <- loss + lambda2 * sum(alpha^2) + } + loss +} + +#' Sharp Loss Function +#' Exact loss function without gradients for when the residuals are already calculated +#' +#' @param alpha The vector to calculate loss for +#' @param residuals2 The squared error vector: (X \%*\% alpha - Y)^2 +#' @param epsilon2 The squared acceptable error +#' @param lambda1 The L1 regulariser (default: 0) +#' @param lambda2 The L2 regulariser (default: 0) +#' @param weight weight vector (default: NULL) +#' +#' @return The loss value +#' +loss_sharp_res <- function(alpha, residuals2, epsilon2, lambda1 = 0, lambda2 = 0, weight = NULL) { + subsize <- residuals2 <= epsilon2 + if (is.null(weight)) { + len <- length(residuals2) + loss <- sum(subsize * (residuals2 - epsilon2 * len)) / len + } else { + len <- sum(weight) + loss <- sum(subsize * (residuals2 - epsilon2 * len) * weight) / len + } + if (lambda1 > 0) { + loss <- loss + lambda1 * sum(abs(alpha)) + } + if (lambda2 > 0) { + loss <- loss + lambda2 * sum(alpha^2) + } + loss +} + +#' Wrapper for creating a C++ DataContainer that parses parameter names +#' +#' @param X data matrix +#' @param Y response vector +#' @param epsilon error tolerance +#' @param beta sigmoid steepness +#' @param weight weight vector (default: NULL) +#' @param lambda1 L1 regularisation (default: 0) +#' @param 
lambda2 L2 regularisation (default: 0) +#' +#' @return DataContainer +#' +data_container <- function(X, Y, epsilon, beta, lambda1 = 0.0, lambda2 = 0.0, weight = NULL) { + if (is.null(weight)) { + methods::new(DataContainer, data = X, response = Y, epsilon = epsilon, beta = beta, lambda1 = lambda1, lambda2 = lambda2) + } else { + methods::new(DataContainer, data = X, response = Y, epsilon = epsilon, beta = beta, lambda1 = lambda1, lambda2 = lambda2, weight = weight) + } +} + +#' OWL-QN for optimising loss_smooth +#' Cpp implementation +#' +#' @param alpha linear model to optimise +#' @param dc DataContainer containing the data and parameters +#' @param lambda1 L1 coefficient (default: 0) +#' @param max_iterations number of OWL-QN iterations (default: 300) +#' @param ... other parameters to OWL-QN +#' @param invisible no terminal output (default: TRUE) +#' +#' @return lbfgs object +#' +owlqn_c <- function(alpha, dc, lambda1 = 0, max_iterations = 300, ..., invisible = TRUE) { + lbfgs::lbfgs(loss_smooth_c_ptr(), loss_smooth_grad_c_ptr(), alpha, dc$.pointer, ..., + max_iterations = max_iterations, invisible = invisible, orthantwise_c = lambda1 + ) +} + +#' OWL-QN for optimising loss_smooth +#' R implementation +#' +#' @param alpha linear model to optimise +#' @param X data matrix +#' @param Y response vector +#' @param epsilon error tolerance +#' @param beta sigmoid steepness +#' @param lambda1 L1 coefficient (default: 0) +#' @param lambda2 L2 coefficient (default: 0) +#' @param weight weight vector (default: NULL) +#' @param max_iterations number of OWL-QN iterations (default: 300) +#' @param ... other parameters to OWL-QN +#' @param invisible no terminal output (default: TRUE) +#' +#' @return lbfgs object +#' +owlqn_r <- function(alpha, X, Y, epsilon, beta, lambda1 = 0, lambda2 = 0, + weight = NULL, max_iterations = 300, ..., invisible = TRUE) { + fn <- function(alpha) loss_smooth(alpha, X, Y, epsilon, beta, 0, lambda2, weight) + gd <- function(alpha) loss_smooth_grad(alpha, X, Y, epsilon, beta, 0, lambda2, weight) + lbfgs::lbfgs(fn, gd, alpha, ..., max_iterations = max_iterations, invisible = invisible, orthantwise_c = lambda1) +} + +#' Calculate the Logarithm of the approximation ratio +#' (logarithms are used for numerically stable calculations) +#' See Theorem 3 from the paper for more details +#' +#' @param residuals2 squared residuals +#' @param epsilon2 squared error tolerance +#' @param beta1 current sigmoid steepness +#' @param beta2 next sigmoid steepness +#' @param weight weight vector (default: NULL) +#' +#' @return log(approximation_ratio) +#' +#' @importFrom stats uniroot +#' +log_approximation_ratio <- function(residuals2, epsilon2, beta1, beta2, weight = NULL) { + if (beta1 >= beta2) { + return(0) + } + # phi = -pmin(0, r^2/n-e^2) + phi <- if (is.null(weight)) { + pmax(0, epsilon2 - residuals2 / length(residuals2)) + } else { + pmax(0, epsilon2 - residuals2 / sum(weight)) * weight + } + ## log(f(r, beta)), assuming squared r, the phi is calculated separately + lf <- function(r, beta) log_sigmoidc(beta * (epsilon2 - r)) + ## derivative of log(f(r, beta1)/f(r, beta2)) + lg <- function(r) -beta1 * dlog_sigmoid(beta1 * (epsilon2 - r)) + beta2 * dlog_sigmoid(beta2 * (epsilon2 - r)) + # Calculate log(k) (k = min_r f_1(r) / f_2(r), see Thm.
3) + g.zero <- lg(0) + if (g.zero < 0) { + ## If derivative at r=0 is negative the minimum is within r>0 and r<epsilon2 (since lg(epsilon2)>0) + a <- uniroot(lg, lower = 0, upper = epsilon2, f.lower = g.zero)$root + lk <- min(lf(0, beta1) - lf(0, beta2), lf(a, beta1) - lf(a, beta2)) + } else { + ## If derivative at r=0 is positive the function has minimum at r<0 and hence the minimum + ## can be found at r=0 (negative values of the squared residual are not allowed). + lk <- lf(0, beta1) - lf(0, beta2) + } + # Calculate log(K) (K = G_1(a_1) / (G_2(a_1) k), see Thm. 3) + log_sum_special(lf(residuals2, beta1), phi) - lk - log_sum_special(lf(residuals2, beta2), phi) +} + +#' Find the matching *epsilon +#' +#' @param residuals2 squared residuals +#' @param epsilon2 squared error tolerance +#' @param beta sigmoid steepness +#' @param weight weight vector (default: NULL) +#' +#' @return *epsilon +#' +matching_epsilon <- function(residuals2, epsilon2, beta, weight = NULL) { + if (is.null(weight)) { + residuals2 <- sort(residuals2) + loss <- sigmoid(beta * (epsilon2 - residuals2)) + i <- which.max(seq_along(residuals2) * loss) + } else { + ord <- order(residuals2) + residuals2 <- residuals2[ord] + weight <- weight[ord] + loss <- sigmoid(beta * (epsilon2 - residuals2)) + i <- which.max(cumsum(weight) * loss) + } + sqrt(residuals2[i]) +} + +#' Print debug statement for how the graduated optimisation is going +#' +#' @param alpha linear model +#' @param X data matrix +#' @param Y response vector +#' @param epsilon error tolerance +#' @param beta current sigmoid steepness +#' @param lambda1 L1 coefficients +#' @param lambda2 L2 coefficients +#' @param weight weight vector +#' @param beta_max max sigmoid steepness +#' +grad_opt_debug <- function(alpha, X, Y, epsilon, beta, lambda1, lambda2, weight, beta_max) { + residuals <- c(Y - X %*% alpha)^2 + approx <- exp(log_approximation_ratio(residuals, epsilon^2, beta, beta_max, weight)) + m_epsilon <- matching_epsilon(residuals, epsilon^2, beta, weight) + loss_sm <- loss_smooth(alpha, X, Y, epsilon, beta, lambda1, lambda2, weight) + loss_sh <- loss_smooth(alpha, X, Y, epsilon, beta_max, lambda1, lambda2, weight) + loss_ha <- loss_sharp(alpha, X, Y, epsilon, lambda1, lambda2, weight) + cat(sprintf( + "Graduated Optimisation: beta = %6.3f eps* = %.3f approx = %5.3f Ls = %g Lh = %g L = %g\n", + beta * epsilon^2, m_epsilon, approx, loss_sm, loss_sh, loss_ha + )) +} + + +#' Find the next beta according to: +#' ¤ approximation_ratio(alpha, beta_old, beta_new) == max_approx +#' ¤ beta_new >= beta_old + min_increase +#' ¤ beta_new <= beta_max +#' +#' @param residuals2 squared residuals +#' @param epsilon2 squared error tolerance +#' @param beta current sigmoid steepness +#' @param weight weight vector (default: NULL) +#' @param beta_max max sigmoid steepness +#' @param log_max_approx logarithm of the approximation ratio target for increasing beta +#' @param beta_min_increase minimum beta step +#' +#' @return beta_new +#' +#' @importFrom stats uniroot +#' +next_beta <- function(residuals2, epsilon2, beta = 0, weight = NULL, + beta_max = 20 / epsilon2, log_max_approx = log(1.15), + beta_min_increase = (beta_max + beta) * 0.0005) { + if (beta >= beta_max) { + return(beta) + } + log_approx <- log_approximation_ratio(residuals2, epsilon2, beta, beta_max, weight) + if (log_approx <= log_max_approx) { + beta_max + } else { + f <- function(b) log_approximation_ratio(residuals2, epsilon2, beta, b, weight) - log_max_approx + beta_new <- uniroot(f, + lower = beta, upper = beta_max, f.lower =
-log_max_approx, + f.upper = log_approx - log_max_approx + )$root + max(beta_new, beta + beta_min_increase) + } +} + +#' Graduated Optimisation to solve the SLISE problem +#' +#' @param alpha Initial linear model (if NULL then a zero vector) +#' @param X Data matrix +#' @param Y Response vector +#' @param epsilon Error tolerance +#' @param beta Starting sigmoid steepness (default: 0 == convex problem) +#' @param lambda1 L1 coefficient (default: 0) +#' @param lambda2 L2 coefficient (default: 0) +#' @param weight Weight vector (default: NULL == no weights) +#' @param beta_max Stopping sigmoid steepness (default: 20 / epsilon^2) +#' @param max_approx Approximation ratio when selecting the next beta (default: 1.15) +#' @param max_iterations Maximum number of OWL-QN iterations (default: 300) +#' @param debug Should debug statement be printed each iteration (default: FALSE) +#' @param beta_min_increase Minimum amount to increase beta (default: beta_max * 0.0005) +#' @param ... Additional parameters to OWL-QN +#' +#' @return lbfgs object with beta (max) and the number of iteration steps +#' @export +#' +graduated_optimisation <- function(alpha, X, Y, epsilon, beta = 0, lambda1 = 0, lambda2 = 0, + weight = NULL, beta_max = 20 / epsilon^2, max_approx = 1.15, + max_iterations = 300, beta_min_increase = beta_max * 0.0005, + debug = FALSE, ...) { + stopifnot(epsilon > 0) + stopifnot(beta >= 0) + stopifnot(lambda1 >= 0) + stopifnot(lambda2 >= 0) + stopifnot(beta_max > 0) + stopifnot(max_approx > 1) + stopifnot(max_iterations > 0) + res <- list(par = if (is.null(alpha)) rep(0, ncol(X)) else alpha) + max_approx <- log(max_approx) + dc <- data_container(X = X, Y = Y, epsilon = epsilon, beta = beta, lambda1 = 0, lambda2 = lambda2, weight = weight) + while (beta < beta_max) { + res <- owlqn_c(res$par, dc, lambda1, max_iterations, ...) + if (debug) grad_opt_debug(res$par, X, Y, epsilon, beta, lambda1, lambda2, weight, beta_max) + beta <- next_beta((X %*% res$par - Y)^2, epsilon^2, beta, weight, beta_max, max_approx, beta_min_increase) + dc$setBeta(beta) + } + res <- owlqn_c(res$par, dc, lambda1, max_iterations * 4, ...) + if (debug) grad_opt_debug(res$par, X, Y, epsilon, beta, lambda1, lambda2, weight, beta_max) + res +} \ No newline at end of file diff --git a/R/plot.R b/R/plot.R index d7044a2..93fa390 100644 --- a/R/plot.R +++ b/R/plot.R @@ -1,504 +1,924 @@ -# This script contains functions for plotting, printing and showing SLISE regressions and explanations - -#' Plot the robust regression or explanation from slise -#' -#' @param slise The slise object -#' @param cols The columns in the data to plot -#' @param title (Optional) The title of the plot (and result names when using other) -#' @param labels (Optional) The labels for the x, y, and legend (in that order, can be partial) -#' @param other list of other slise objects to include in the plot -#' @param threed plot in 3D with two columns -#' @param ... not used -#' -#' @export -#' -#' @examples -#' data <- matrix(rnorm(200), 100, 2) -#' response <- rnorm(100) -#' slise <- slise.fit(data, response, epsilon=0.1) -#' plot(slise, 1:2, threed = TRUE) -plot.slise <- function(slise, cols = 1, title = "SLISE", labels = NULL, other = NULL, threed = FALSE, ...)
{ - if (length(cols) == 1) { - x <- c(min(slise$X[, cols]), max(slise$X[, cols])) - if (length(labels) == 1) - labels <- c(colnames(slise$data)[[cols]]) - if (is.null(other)) { - y <- slise$coefficients[[1]] + slise$coefficients[[cols + 1]] * x - plt <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(slise$X[, cols], slise$Y)) + - ggplot2::geom_line(ggplot2::aes(x, y), color = "#e66101", size = 1) + - ggplot2::geom_point(ggplot2::aes(slise$X[slise$subset, cols], slise$Y[slise$subset]), color = "#5e3c99") + - ggplot2::ggtitle(title) + ggplot2::theme_light() - } else { - other <- append(list(slise), other) - if (is.null(title)) title <- "" - if (length(title) < length(other) + 1) - title <- c(title, paste(length(title):length(other))) - explanations <- data.frame( - y = c(sapply(other, function(e) e$coefficients[[1]] + e$coefficients[[cols + 1]] * x)), - x = c(rep(x, length(other))), - title = title[-1]) - plt <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(slise$X[, cols], slise$Y)) + - ggplot2::geom_point(ggplot2::aes(slise$X[slise$subset, cols], slise$Y[slise$subset]), color = "gray42") + - ggplot2::geom_line(ggplot2::aes(explanations$x, explanations$y, col = explanations$title)) + - ggplot2::ggtitle(title[[1]]) - } - if (length(labels) == 1) - plt <- plt + ggplot2::labs(x = labels[[1]]) - else if (length(labels) == 2) - plt <- plt + ggplot2::labs(x = labels[[1]], y = labels[[2]]) - else if (length(labels) == 3) - plt <- plt + ggplot2::labs(x = labels[[1]], y = labels[[2]], col = labels[[3]]) - if (!is.null(slise$x) && !is.null(slise$y)) { - plt <- plt + ggplot2::geom_point(ggplot2::aes(slise$x[[cols]], slise$y), col = "red") - } - graphics::plot(plt) - } else if (length(cols) == 2) { - if (!is.null(other)) - warning("Multiple slise objects only allowed when plotting one column") - x_var <- cols[[1]] - y_var <- cols[[2]] - zlab <- labels[[1]] - if (length(labels) > 1) xlab <- labels[[2]] - else xlab <- colnames(slise$data)[[x_var]] - if (length(labels) > 2) ylab <- labels[[3]] - else ylab <- colnames(slise$data)[[y_var]] - if (!threed) { - plt <- ggplot2::ggplot() + - ggplot2::geom_point(ggplot2::aes(slise$X[, x_var], slise$X[, y_var], col = slise$Y)) + - ggplot2::geom_point(ggplot2::aes(slise$X[slise$subset, x_var], slise$X[slise$subset, y_var]), col = "green") + - ggplot2::ggtitle(title) + ggplot2::labs(x = xlab, y = ylab, col = zlab) - if (!is.null(slise$x)) { - plt <- plt + ggplot2::geom_point(ggplot2::aes(slise$x[[x_var]], slise$x[[y_var]]), col = "red") - } - graphics::plot(plt) - } else { - if (!requireNamespace("scatterplot3d", quietly = TRUE)) { - stop("Package \"scatterplot3d\" needed for 3D plots. Please install it.", - call. = FALSE) - } - plt <- scatterplot3d::scatterplot3d(slise$X[, x_var], slise$X[, y_var], slise$Y, xlab = xlab, ylab = ylab, - zlab = zlab, pch = 16, color = grDevices::rgb(0.2, 0.2, 0.8, 0.3), main = title) - plt$points3d(slise$X[slise$subset, x_var], slise$X[slise$subset, y_var], - slise$Y[slise$subset], pch = 16, col = "green") - if (!is.null(slise$x) && !is.null(slise$y)) { - plt$points3d(slise$x[[x_var]], slise$x[[y_var]], slise$y, pch = 16, col = "red") - } - plt$plane3d(slise$coefficients[c(1, x_var + 1, y_var + 1)]) - } - } else { - stop("plot.slise not defined for more than two variables") - } -} - -#' Print the robust regression or explanation from slise -#' -#' @param slise The slise object -#' @param title (Optional) The title of the result -#' @param ... 
not used -#' -#' @export -#' -#' @examples -#' data <- matrix(rnorm(200), 100, 2) -#' response <- rnorm(100) -#' slise <- slise.fit(data, response, epsilon=0.1) -#' print(slise) -print.slise <- function(slise, ..., title = "SLISE") { - Coefficients <- slise$coefficients - Alpha <- slise$scaled$expand_alpha(slise$alpha) - if (length(Alpha) < length(Coefficients)) - Alpha <- c(NA, Alpha) - if (!is.null(slise$x)) { - Explained <- c(NA, slise$x) - xs <- slise$scaled$scale_x(slise$x) - Contribution <- c(slise$alpha[[1]], slise$scaled$expand_alpha(slise$alpha[-1] * xs)) - data <- rbind(Coefficients, Alpha, Explained, Contribution) - } - else { - data <- rbind(Coefficients, Alpha) - } - colnames(data) <- names(slise$coefficients) - if (ncol(data) > 20) { - data <- data[, c(TRUE, slise$x != 0)] - if (ncol(data) > 20) data <- data[, 1:20] - } - if (!is.null(title)) { - cat(title, "\n", sep = "") - } - print(data) - cat(sprintf("Subset size: %6.2f\n", mean(slise$subset))) - cat(sprintf("Loss: %6.2f\n", slise$value)) - cat(sprintf("Epsilon: %6.2f\n", slise$epsilon)) - cat(sprintf("Lambda: %6.2f\n", slise$lambda)) - cat(sprintf("Non-Zero: %3d\n", sparsity(slise$alpha))) - if (length(slise$logit) == 1 && slise$logit) - cat(sprintf("Class Balance: %.1f%% / %.1f%%\n", mean(slise$scaled$Y[slise$subset] > 0)*100, mean(slise$scaled$Y[slise$subset] < 0)*100)) - invisible(slise) -} - -summary.slise <- print.slise - -#' Explain an Object -#' -#' @param x the object to explain -#' @param ... additional parameters -#' -#' @export -#' -#' @examples -#' data <- matrix(rnorm(200), 100, 2) -#' response <- rnorm(100) -#' slise <- slise.explain(data, response, 10, epsilon=0.1) -#' explain(slise) -explain <- function(x, ...) { - UseMethod("explain", x) -} - -#' Show a SLISE explanation -#' -#' @param slise the slise object to show -#' @param type Type of explanation (bar, distribution, image, image2, text, image_scatter) -#' @param class_labels (Optional) names of the two classes -#' @param real_value (Optional) the real response for the explained instance -#' @param title (Optional) A title to add to the explanation -#' @param probability (Optional) is The prediction a probability (TRUE) -#' @param ... Additional parameters to the visualisation -#' -#' @export -#' -#' @examples -#' data <- matrix(rnorm(200), 100, 2) -#' response <- rnorm(100) -#' slise <- slise.explain(data, response, 10, epsilon=0.1) -#' explain(slise) -explain.slise <- function(slise, type = "bar", class_labels = c("Class 0", "Class 1"), - title = "SLISE", real_value = NULL, probability = TRUE, ...) { - if (is.null(type)) type <- ifelse(ncol(slise$X) > 20, "image", "bar") - if (is.null(slise$x) || is.null(slise$y)) { - stop("Can only show explanations for actual explanations (use slise.explain)") - } - title <- explain_slise_title(slise, main_title = title, class_labels = class_labels, real_value = real_value, probability = probability, ...) - if (identical(type, "bar")) { - explain_slise_bar(slise, title = title, class_labels = class_labels) - } else if (identical(type, "distribution") || identical(type, "dist")) { - explain_slise_distribution(slise, title = title, class_labels = class_labels) - } else if (identical(type, "image")) { - explain_slise_image(slise, title = title, class_labels = class_labels, ...) - } else if (identical(type, "image2")) { - explain_slise_image2(slise, title = title, class_labels = class_labels, ...) 
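# The explain() visualisations removed here map roughly onto the new plot()
# types defined further down in this diff's rewritten R/plot.R. A sketch of
# the correspondence, reusing the data from the roxygen examples above (the
# mapping is approximate, and slise.explain's signature may change in this
# diff):
#   data <- matrix(rnorm(200), 100, 2)
#   response <- rnorm(100)
#   expl <- slise.explain(data, response, 10, epsilon = 0.1)
#   plot(expl, type = "bar")           # roughly explain(expl, "bar")
#   plot(expl, type = "distribution")  # roughly explain(expl, "dist")
#   plot(expl, type = "image")         # routed to plot.slise_mnist below
#   plot(expl, type = "wordcloud")     # roughly explain(expl, "text")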
- } else if (identical(type, "image_scatter")) { - explain_slise_image_scatter(slise, title = title, class_labels = class_labels, ...) - } else if (identical(type, "text")) { - explain_slise_text(slise, title = title, class_labels = class_labels, ...) - } else { - stop("Unknown explanation visualisation type") - } -} - -# Plot a tabular explanation with bar graphs -explain_slise_bar <- function(slise, class_labels = c("", ""), title = "") { - if (!requireNamespace("gridExtra", quietly = TRUE)) { - stop("Package \"gridExtra\" needed for the function to work. Please install it.", - call. = FALSE) - } - if (!requireNamespace("grid", quietly = TRUE)) { - stop("Package \"grid\" needed for the function to work. Please install it.", - call. = FALSE) - } - X <- slise$scaled$X - Y <- slise$scaled$Y - x <- slise$scaled$scale_x(slise$x) - y <- slise$scaled$scale_y(slise$y) - alpha <- slise$alpha - # Setup - mask <- abs(alpha[-1]) > 1e-6 - X <- X[, mask] - x <- x[mask] - inter <- alpha[[1]] - alpha <- alpha[-1][mask] - var_names <- names(alpha) - if (is.null(var_names)) var_names <- colnames(X) - if (is.null(var_names)) var_names <- seq_along(alpha) - ord <- rev(order(abs(alpha))) - iord <- rev(c(1, ord + 1)) - # Subset - dataset_low <- c(stats::quantile(Y, 0.05) * 3, sapply(1:ncol(X), function(i) stats::quantile(X[, i], 0.05))) - dataset_high <- c(stats::quantile(Y, 0.95) * 3, sapply(1:ncol(X), function(i) stats::quantile(X[, i], 0.95))) - subset_low <- c(stats::quantile(Y[slise$subset], 0.05) * 3, sapply(1:ncol(X), function(i) stats::quantile(X[slise$subset, i], 0.05))) - subset_high <- c(stats::quantile(Y[slise$subset], 0.95) * 3, sapply(1:ncol(X), function(i) stats::quantile(X[slise$subset, i], 0.95))) - subset_point <- c(y * 3, x) - subset_names <- factor(c(sprintf("Predicted (%.3f)", slise$y), mapply(function(n, v) sprintf("%s (%.3f)", n, v), var_names, slise$x[mask]))) - plt_subset <- ggplot2::ggplot() + - ggplot2::geom_hline(yintercept = 0, color = "gray32") + - ggplot2::geom_segment(ggplot2::aes(x = subset_names, xend = subset_names, y = dataset_low, yend = dataset_high, col = "gray"), size = 12) + - ggplot2::geom_segment(ggplot2::aes(x = subset_names, xend = subset_names, y = subset_low, yend = subset_high, col = "orange"), size = 8) + - ggplot2::geom_point(ggplot2::aes(subset_names, subset_point, col = "black"), size = 6) + ggplot2::scale_x_discrete(limits = subset_names[iord]) + - ggplot2::scale_color_identity(name = "", guide = "legend", labels = c("Explained Sample", "Dataset (95%)", "Subset (95%)")) + - ggplot2::ggtitle(sprintf("Subset Size (%.1f%%)", 100 * mean(slise$subset))) + ggplot2::xlab("") + ggplot2::ylab("") + - ggplot2::coord_flip() + ggplot2::theme_minimal(14) + - ggplot2::theme(legend.position = "bottom", axis.ticks.x = ggplot2::element_blank(), axis.text.x = ggplot2::element_blank(), legend.box.margin = ggplot2::margin(-40, 80, -10, -40, "pt")) - # Model - model <- c(inter, alpha) - model_sign <- ifelse(model > 0, "+", "-") - model_names <- factor(mapply(function(l, v) sprintf("%s (%.3f)", l, v), c("Intercept", var_names), model)) - plt_model <- ggplot2::ggplot(mapping = ggplot2::aes(model_names, model, fill = model_sign)) + ggplot2::scale_x_discrete(limits = model_names[iord]) + - ggplot2::geom_bar(stat = "identity") + ggplot2::scale_fill_manual(values = c("-" = "red2", "+" = "green3"), name = "", labels = c("+" = class_labels[[2]], "-" = class_labels[[1]])) + - ggplot2::ggtitle("Local Linear Model") + ggplot2::ylab("") + ggplot2::xlab("") + ggplot2::ylim(-max(abs(model)), 
max(abs(model))) + - ggplot2::coord_flip() + ggplot2::theme_minimal(14) + ggplot2::theme(legend.position = "bottom", legend.box.margin = ggplot2::margin(-40, 0, -10, 0, "pt")) - # Impact - impact <- c(inter, alpha * x) - impact_sign <- ifelse(impact > 0, "+", "-") - plt_impact <- ggplot2::ggplot(mapping = ggplot2::aes(model_names, impact, fill = impact_sign)) + ggplot2::scale_x_discrete(limits = model_names[iord]) + - ggplot2::geom_bar(stat = "identity") + ggplot2::scale_fill_manual(values = c("+" = "green3", "0" = "white", "-" = "red2"), name = "", labels = c("+" = class_labels[[2]], "0" = "", "-" = class_labels[[1]])) + - ggplot2::ggtitle("Actual Impact") + ggplot2::ylab("") + ggplot2::xlab("") + ggplot2::ylim(-max(abs(impact)), max(abs(impact))) + - ggplot2::coord_flip() + ggplot2::theme_minimal(14) + ggplot2::theme(legend.position = "bottom", legend.box.margin = ggplot2::margin(-40, 0, -10, 0, "pt"), - axis.text.y = ggplot2::element_blank(), axis.ticks.y = ggplot2::element_blank(), plot.title = ggplot2::element_text(hjust = 0.5)) - gridExtra::grid.arrange(plt_model, plt_impact, plt_subset, ncol = 3, widths = c(3, 2, 3), top = grid::textGrob(title, gp = grid::gpar(cex = 1.2))) -} - -# Plot an EMNIST explanation with a saliency map and outline of the digit -explain_slise_image <- function(slise, title = "", class_labels = c("", ""), width = NULL, height = NULL, ...) { - if (is.null(width) && is.null(height)) - width <- height <- floor(sqrt(length(slise$x))) - stopifnot(width * height == length(slise$x)) - alpha <- slise$scaled$expand_alpha(slise$alpha)[-1] - plt <- explain_img_slise_image(alpha, slise$x, width, height, ..., class_labels = class_labels, legend = "bottom") - graphics::plot(plt + ggplot2::ggtitle(title)) -} - -# Plot an EMNIST explanation with a saliency map and the image -explain_slise_image2 <- function(slise, title = "", class_labels = c("", ""), width = NULL, height = NULL, ...) { - if (!requireNamespace("gridExtra", quietly = TRUE)) { - stop("Package \"gridExtra\" needed for the function to work. Please install it.", - call. = FALSE) - } - if (!requireNamespace("grid", quietly = TRUE)) { - stop("Package \"grid\" needed for the function to work. Please install it.", - call. = FALSE) - } - if (is.null(width) && is.null(height)) - width <- height <- floor(sqrt(length(slise$x))) - stopifnot(width * height == length(slise$x)) - alpha <- slise$scaled$expand_alpha(slise$alpha)[-1] - plt <- explain_img_slise_image(alpha, NULL, width, height, ..., class_labels = class_labels, legend = "left") - plt2 <- explain_img_slise_image(slise$x, NULL, width, height, ..., legend = "right", colors = explain_slise_color_bw(), scale_colors = FALSE) - gridExtra::grid.arrange(plt + ggplot2::ggtitle("Saliency Map"), plt2 + ggplot2::ggtitle("Image"), ncol=2, top = grid::textGrob(title, gp = grid::gpar(cex = 1.2))) -} - -# Plot an EMNIST explanation with a scatterplot and a lineup -explain_slise_image_scatter <- function(slise, title = "", class_labels = c("", ""), width = NULL, height = NULL, num_examples = 5, ...) { - if (!requireNamespace("gridExtra", quietly = TRUE)) { - stop("Package \"gridExtra\" needed for the function to work. Please install it.", - call. = FALSE) - } - if (!requireNamespace("grid", quietly = TRUE)) { - stop("Package \"grid\" needed for the function to work. Please install it.", - call. 
= FALSE) - } - if (is.null(width) && is.null(height)) - width <- height <- floor(sqrt(length(slise$x))) - stopifnot(width * height == length(slise$x)) - alpha <- slise$scaled$expand_alpha(slise$alpha)[-1] - lineup <- explain_slise_get_lineup(slise, num_examples, TRUE) - images <- do.call(rbind, lapply(lineup$probabilities, function(p) alpha)) - plt_cmp <- explain_img_slise_lineup(images, paste("p ==", lineup$probabilities), lineup$images, width, height, ..., class_labels = class_labels, legend = "bottom", nrow = 1) + - ggplot2::ggtitle(paste0("Numbers from the subset with different probabilities ('", class_labels[[1]], "' to '", class_labels[[2]], "')")) - plt_scatter <- explain_img_slise_scatter(slise, width, height, lineup, ...) - plt_scatter <- plt_scatter + ggplot2::ggtitle(title) - sub_bal <- mean(slise$scaled$Y[slise$subset] > 0) - layout <- matrix(c(1, 1, 2, 1, 1, 2), 3, 2) - if (sub_bal <= 0.1 || sub_bal >= 0.9) { - warning <- grid::textGrob("WARNING: Subset is very unbalanced", gp = grid::gpar(col = "dark red")) - gridExtra::grid.arrange(plt_scatter, plt_cmp, layout_matrix = layout, bottom = warning) - } else { - gridExtra::grid.arrange(plt_scatter, plt_cmp, layout_matrix = layout) - } -} - -# Print a text explanation, with optional weights and plotted wordcloud -explain_slise_text <- function(slise, title, class_labels, text = NULL, tokens = NULL, treshold = 1e-2, print_weights = TRUE, print_weights_all = FALSE, wordcloud = TRUE, ...) { - if (!requireNamespace("crayon", quietly = TRUE)) { - stop("Package \"crayon\" needed for coloring words. Please install it.", - call. = FALSE) - } - if (is.null(text)) { - text <- names(slise$x)[slise$x > 0] - } else if (length(text) == 1) { - text <- stringr::str_split(text, " ")[[1]] - } - if (is.null(tokens)) - tokens <- text - cat(stringr::str_replace_all(title, ", ", " \n"), " \n") - ns <- mean(slise$Y[slise$subset] < 0.5) - ps <- mean(slise$Y[slise$subset] > 0.5) - cat(" Subset Size: ", sprintf("%.2f%% (", mean(slise$subset) * 100), - crayon::make_style(grDevices::rgb(0.8, 0, 0))(sprintf("%.1f%%", ns * 100)), " + ", - crayon::make_style(grDevices::rgb(0, 0.7, 0))(sprintf("%.1f%%", ps * 100)), ")", - ifelse(ns < 0.1 || ps < 0.1, crayon::make_style(grDevices::rgb(1, 0.5, 0))(" UNBALANCED SUBSET!"), ""), - " \n", sep = "") - cat(" Legend:", crayon::make_style(grDevices::rgb(0.8, 0, 0))(class_labels[1]), "Neutral", - crayon::make_style(grDevices::rgb(0, 0.7, 0))(class_labels[2]), crayon::silver("Unknown"), " \n") - vmax <- max(abs(slise$alpha[-1])) - vmed <- stats::median(abs(slise$alpha[-1])) - th <- vmax * 0.5 + vmed * 0.5 - treshold <- th * treshold - color <- function(v, t) { - if (is.na(v)) crayon::silver(t) - else if (abs(v) < treshold) t - else if (v > 0) { - if (v > th) { - crayon::make_style(grDevices::rgb(0, 0.7, 0))(t) - } else { - v <- (1 - v / th) ^ 2 - crayon::make_style(grDevices::rgb(v, 1, v))(t) - } - } else { - if (v < -th) { - crayon::make_style(grDevices::rgb(0.8, 0, 0))(t) - } else { - v <- (1 + v / th) ^ 2 - crayon::make_style(grDevices::rgb(1, v, v))(t) - } - } - } - do.call(cat, lapply(seq_along(text), function(i) { - t <- text[[i]] - j <- which(tokens[[i]] == names(slise$coefficients)) - v <- if (length(j) == 1) { slise$alpha[j] } else { NA } - color(v, t) - })) - cat(" \n") - if (print_weights || print_weights_all) { - ord <- c(1, rev(order(abs(slise$alpha[-1]))) + 1) - if (!print_weights_all) { - ord <- ord[c(1, slise$x)[ord] != 0] - } - mapply(function(t, v) { - if (abs(v) < treshold) return(); - t <- 
sprintf("%10s", t) - t <- color(v, t) - cat(sprintf("%s: %7.4f", t, v), " \n") - }, names(slise$coefficients)[ord], slise$alpha[ord]) - } - if (wordcloud) { - if (!requireNamespace("wordcloud", quietly = TRUE)) { - stop("Package \"wordcloud\" needed for plotting wordclouds. Please install it.", - call. = FALSE) - } - mask <- c(FALSE, abs(slise$alpha[-1]) > treshold & slise$x != 0) - wordcloud::wordcloud(names(slise$coefficients)[mask], abs(slise$alpha[mask]), - colors = ifelse(slise$alpha[mask] > 0, "#4dac26", "#d01c8b"), - ordered.colors = TRUE, use.r.layout = TRUE) - graphics::par(mar = rep(0, 4)) - } - invisible(slise) -} - -# Plot a tabular explanation with density plots -explain_slise_distribution <- function(slise, title, class_labels) { - if (!requireNamespace("gridExtra", quietly = TRUE)) { - stop("Package \"gridExtra\" needed for the function to work. Please install it.", - call. = FALSE) - } - if (!requireNamespace("grid", quietly = TRUE)) { - stop("Package \"grid\" needed for the function to work. Please install it.", - call. = FALSE) - } - num_variables <- sparsity(slise$alpha[-1]) - variable_names <- names(slise$coefficients) - ord <- order(abs(slise$alpha[-1]), decreasing = TRUE)[1:num_variables] - #Distributions - plts <- lapply(ord, function(i) { - ggplot2::ggplot() + ggplot2::xlab(variable_names[[i + 1]]) + - ggplot2::geom_density(ggplot2::aes(slise$X[, i], ggplot2::stat("count")), color = "red2") + - ggplot2::geom_density(ggplot2::aes(slise$X[slise$subset, i], ggplot2::stat("count")), color = "cyan3") + - ggplot2::geom_vline(xintercept = slise$x[[i]], color = "black") + - ggplot2::geom_point(ggplot2::aes(slise$x[[i]], 0), color = "black", size = 2.5) + ggplot2::theme_minimal() + - ggplot2::theme(axis.text.y = ggplot2::element_blank(), axis.ticks.y = ggplot2::element_blank(), - axis.title.y = ggplot2::element_blank(), legend.position = "none", - panel.grid.minor.y = ggplot2::element_blank(), panel.grid.minor.x = ggplot2::element_blank()) - }) - plts <- append(list( - ggplot2::ggplot() + ggplot2::xlab("response") + - ggplot2::geom_density(ggplot2::aes(slise$Y, ggplot2::stat("count"), col = "c1")) + - ggplot2::geom_density(ggplot2::aes(slise$Y[slise$subset], ggplot2::stat("count"), col = "c2")) + - ggplot2::geom_vline(ggplot2::aes(xintercept = slise$y, col = "c3"), show.legend = TRUE) + - ggplot2::geom_point(ggplot2::aes(slise$y, 0, col = "c3"), size = 2.5) + - ggplot2::theme_minimal() + ggplot2::scale_color_manual(name = "Legend: ", - values = c("red2", "cyan3", "black"), - labels = c("Dataset", "Subset", "Explained Instance")) + - ggplot2::theme(axis.text.y = ggplot2::element_blank(), axis.ticks.y = ggplot2::element_blank(), - axis.title.y = ggplot2::element_blank(), legend.position = "bottom", - panel.grid.minor.y = ggplot2::element_blank(), panel.grid.minor.x = ggplot2::element_blank()) - ), plts) - #Meta - impact <- slise$alpha[-1] * slise$scaled$scale_x(slise$x) - ml <- max(c(max(abs(impact)), max(abs(slise$alpha)))) - labs <- rev(c("Subset ", "Size ", "", "Intercept")) - labs <- factor(labs, levels = labs) - mp <- mean(slise$subset) * 2 * ml - ml - plts2 <- list( - ggplot2::ggplot(mapping = ggplot2::aes(labs, c(0, 0, 0, 0))) + ggplot2::geom_col() + - ggplot2::theme_minimal() + ggplot2::scale_y_continuous(limits = c(-ml, ml), breaks = c(-ml, -0.5 * ml, 0, 0.5 * ml, ml) * 0.8,) + - ggplot2::geom_col(ggplot2::aes(labs[[1]], slise$alpha[[1]], fill = ifelse(slise$alpha[[1]] > 0, "green3", "red2"))) + - ggplot2::geom_rect(ggplot2::aes(xmin = 1.7, xmax = 2.8, ymin = -ml, 
ymax = ml), size = 10, fill = "white") + - ggplot2::geom_vline(xintercept = 2, color = "white", size = 10) + - ggplot2::geom_text(ggplot2::aes(x = 2.5, y = c(-ml * 0.975, 0, ml * 0.95), label = paste(c(0, 50, 100), "%")), size = 3) + - ggplot2::geom_rect(ggplot2::aes(xmin = labs[[4]], xmax = labs[[3]], ymin = -ml, ymax = mp), fill = "orange") + - ggplot2::geom_text(ggplot2::aes(3.5, mp, label = sprintf(" %.1f %%", mean(slise$subset) * 100), hjust = 0)) + - ggplot2::scale_fill_manual(labels = class_labels, values = c("red2", "green3"), limits = c("red2", "green3"), drop = FALSE, guide = ggplot2::guide_legend(title = NULL)) + - ggplot2::coord_flip() + ggplot2::theme(panel.grid.minor.y = ggplot2::element_blank(), panel.grid.minor.x = ggplot2::element_blank(), - axis.title = ggplot2::element_blank(), rect = ggplot2::element_blank(), legend.position = "bottom", axis.text.x = ggplot2::element_blank()) - - ) - # Model - plts2 <- append(plts2, lapply(ord, function(i) { - val <- c(slise$alpha[[i + 1]], impact[[i]]) - col <- ifelse(val > 0, "green3", "red2") - ggplot2::ggplot(mapping = ggplot2::aes(c("Weight ", "Impact "), val)) + - ggplot2::theme_minimal() + ggplot2::scale_y_continuous(limits = c(-ml, ml), breaks = c(-ml, -0.5 * ml, 0, 0.5 * ml, ml) * 0.8) + - ggplot2::geom_col(ggplot2::aes("Weight ", val[[1]]), fill = col[[1]]) + - ggplot2::geom_col(ggplot2::aes("Impact ", val[[2]]), fill = col[[2]]) + - ggplot2::coord_flip() + ggplot2::theme(axis.title = ggplot2::element_blank(), axis.text.x = ggplot2::element_blank(), - panel.grid.minor.x = ggplot2::element_blank()) - })) - plts <- append(plts, plts2)[order(c(seq_along(plts) * 2 - 1, seq_along(plts) * 2))] - do.call(gridExtra::grid.arrange, append(plts, list(ncol = 2, nrow = length(plts) / 2, heights = c(2, rep(1, length(plts) / 2 - 1)), - top = grid::textGrob(title, gp = grid::gpar(cex = 1.2))))) -} - -# Generate title with additional information -explain_slise_title <- function(slise, real_value = NULL, class_labels = NULL, main_title = NULL, probability = TRUE, ...) { - title <- list(sep = ", ") - if (!is.null(main_title)) { - title <- c(title, main_title) - } - if (!is.null(slise$y)) { - pred <- slise$y - if (probability) { - if (is.null(class_labels)) - pred <- sprintf("%.1f %%", pred * 100) - else if (pred >= 0.5) - pred <- sprintf("%.1f%% %s", pred * 100, class_labels[[2]]) - else - pred <- sprintf("%.1f%% %s", 100 - pred * 100, class_labels[[1]]) - } - else - pred <- sprintf("%.2f", pred) - title <- c(title, paste("Predicted:", pred)) - } - if (!is.null(real_value)) { - if (is.character(real_value)) { } - else if (is.integer(real_value)) - real_value <- class_labels[[as.integer(real_value + 1)]] - else if (probability) - real_value <- sprintf("%.1f %%", real_value * 100) - else - real_value <- sprintf("%.2f", real_value) - title <- c(title, paste("Real:", real_value)) - } - if (length(title) > 1) - do.call(paste, title) - else - "SLISE Explanation" -} +# This script contains functions for plotting, printing and showing SLISE regressions and explanations + +SLISE_ORANGE <- "#fda411" +SLISE_PURPLE <- "#9887cb" +SLISE_WEAKPURPLE <- "#998ec344" +SLISE_DARKORANGE <- "#e66101" +SLISE_DARKPURPLE <- "#5e3c99" + + +#' Print the robust regression or explanation from slise +#' +#' @param x The slise object +#' @param num_vars Minimum number of variables to show without filtering (default: 10) +#' @param ... 
Ignored additional parameters +#' +#' @return invisible(x) +#' @export +#' +#' @examples +#' X <- matrix(rnorm(30), 15, 2) +#' Y <- runif(15, 0, 1) +#' print(slise.fit(X, Y, epsilon = 0.1)) +print.slise <- function(x, num_vars = 10, ...) { + check_package("stringr") + slise <- x + if (is.null(slise$x)) { + cat("SLISE Regression:\n") + } else { + cat("SLISE Explanation:\n") + } + # Table of item, model and impact + data <- rbind( + `Explained Item` = c(slise$y, slise$x), + `Linear Model` = slise$coefficients, + `Prediction Impact` = slise$impact, + `Normalised Item` = c(slise$normalised_y, slise$normalised_x), + `Normalised Model` = slise$normalised, + `Normalised Impact` = slise$normalised_impact + ) + colnames(data) <- names(slise$coefficients) + if (!is.null(slise$x)) { + colnames(data)[1] <- "Response/Intercept" + } + ord <- order_coefficients(slise, FALSE, if (ncol(data) > num_vars) .Machine$double.eps else -1) + if (slise$intercept) { + ord <- c(1, ord + 1) + } + data <- data[, ord] + print(data) + # Other Values + cat(sprintf("Subset size: %7.2f\n", mean(slise$subset))) + cat(sprintf("Loss: %7.2f\n", slise$value)) + cat(sprintf("Epsilon: %7.2f\n", slise$epsilon)) + if (slise$lambda1 > 0) { + cat(sprintf("Lambda1: %7.2f\n", slise$lambda1)) + } + if (slise$lambda2 > 0) { + cat(sprintf("Lambda2: %7.2f\n", slise$lambda2)) + } + if (sparsity(slise$coefficients) < length(slise$coefficients)) { + cat(sprintf( + "Non-Zero: %2d / %2d\n", + sparsity(slise$coefficients), + length(slise$coefficients) + )) + } + if (slise$logit) { + cat(sprintf("Class Balance: %.1f%% <> %.1f%%\n", mean(slise$Y[slise$subset] > 0) * 100, mean(slise$Y[slise$subset] < 0) * 100)) + } + invisible(slise) +} + +order_coefficients <- function(slise, hightolow = FALSE, minimum = .Machine$double.eps) { + alpha <- slise$coefficients + if (!is.null(slise$normalised)) { + alpha <- slise$normalised + } + if (slise$intercept) { + alpha <- alpha[-1] + } + ord <- which(abs(alpha) > minimum) + if (hightolow) { + ord <- ord[order(alpha[ord], decreasing = TRUE)] + } else { + ord <- ord[order(abs(alpha[ord]), decreasing = TRUE)] + } + ord +} + +summary.slise <- print.slise + +#' Plot the robust regression or explanation from slise +#' +#' @param x The slise object +#' @param type The type of plot ("2D", "bar", "distribution", "mnist", "prediction", "wordcloud") +#' @param title The title of the plot +#' @param ... Other parameters to the plotting functions +#' @inheritDotParams plot.slise_2d labels partial size +#' @inheritDotParams plot.slise_bar labels partial size +#' @inheritDotParams plot.slise_distribution labels partial signif +#' @inheritDotParams plot.slise_mnist labels partial width height plots +#' @inheritDotParams plot.slise_prediction labels partial +#' @inheritDotParams plot.slise_wordcloud labels treshold local +#' +#' @return plot or ggplot2 objects +#' @export +#' +#' @examples +#' X <- matrix(rnorm(30), 30, 1) +#' Y <- runif(30, 0, 1) +#' plot(slise.fit(X, Y, epsilon = 0.1)) +plot.slise <- function(x, + type = NULL, + title = NULL, + ...) 
{ + slise <- x + if (length(type) == 0) { + if (length(slise$X) == length(slise$Y)) { + type <- "2d" + } else if (sparsity(slise$coefficients) > 10) { + type <- "bar" + } else { + type <- "distribution" + } + } else { + type <- tolower(type) + } + if (is.null(title)) { + if (is.null(slise$x)) { + title <- "SLISE Regression" + } else if (slise$logit) { + title <- sprintf("SLISE Explanation (p = %g)", sigmoid(slise$y)) + } else { + title <- sprintf("SLISE Explanation (y = %g)", slise$y) + } + } + if (type == "plot" || type == "2d") { + plot.slise_2d(slise, title, ...) + } else if (type == "distribution" || type == "dist") { + plot.slise_distribution(slise, title, ...) + } else if (type == "bar") { + plot.slise_bar(slise, title, ...) + } else if (type == "mnist" || type == "emnist" || type == "image" || type == "img") { + plot.slise_mnist(slise, title, ...) + } else if (type == "pred" || type == "prediction") { + plot.slise_prediction(slise, title, ...) + } else if (type == "wordcloud" || type == "word" || type == "words") { + plot.slise_wordcloud(slise, title, ...) + } else { + stop("[plot.slise] Unknown plot type") + } +} + +#' Plot the robust regression or explanation from slise in 2D +#' +#' @param slise The slise object +#' @param title The title of the plot +#' @param labels The axis labels (default: c("X", "Y") or c("x", "f(x)")) +#' @param partial Should the raw ggplot2 objects be returned instead of directly plotting (default: FALSE) +#' @param size The size of the plot elements (default: 2) +#' @param ... Ignored parameters +#' +#' @return ggplot object or plot +#' +plot.slise_2d <- function(slise, + title, + labels = NULL, + partial = FALSE, + size = 2, + ...) { + check_package("ggplot2") + minx <- min(slise$X) + maxx <- max(slise$X) + deltax <- maxx - minx + miny <- min(slise$Y) + maxy <- max(slise$Y) + px <- c( + minx - deltax * 0.1, + maxx + deltax * 0.1, + maxx + deltax * 0.1, + minx - deltax * 0.1 + ) + py <- c( + sum(c(1, px[1]) * slise$coefficients) + slise$epsilon, + sum(c(1, px[2]) * slise$coefficients) + slise$epsilon, + sum(c(1, px[3]) * slise$coefficients) - slise$epsilon, + sum(c(1, px[4]) * slise$coefficients) - slise$epsilon + ) + if (is.null(labels)) { + if (is.null(slise$x)) { + labels <- c("X", "Y") + } else { + labels <- c("x", "f(x)") + } + } + + gg <- ggplot2::ggplot() + + ggplot2::ggtitle(if (is.null(title) || title == "") NULL else title) + + ggplot2::theme_bw() + + ggplot2::xlab(labels[1]) + + ggplot2::ylab(labels[2]) + + ggplot2::coord_cartesian(xlim = c(minx, maxx), ylim = c(miny, maxy)) + + ggplot2::geom_polygon(ggplot2::aes_string(x = "px", y = "py"), fill = SLISE_WEAKPURPLE) + + ggplot2::geom_abline(ggplot2::aes( + intercept = slise$coefficients[1], + slope = slise$coefficients[2], + color = "SLISE", + linetype = "SLISE" + ), size = size) + + ggplot2::geom_point(ggplot2::aes(x = slise$X, y = slise$Y), size = size) + + ggplot2::scale_linetype_manual( + limits = c("SLISE", "Explained Point"), + values = c("solid", "blank"), + name = NULL + ) + + ggplot2::scale_color_manual( + limits = c("SLISE", "Explained Point"), + values = c(SLISE_PURPLE, SLISE_ORANGE), + name = NULL + ) + + ggplot2::scale_shape_manual( + limits = c("SLISE", "Explained Point"), + values = c(NA, 16), + name = NULL + ) + if (!is.null(slise$x)) { + gg <- gg + ggplot2::geom_point(ggplot2::aes( + x = slise$x, + y = slise$y, + color = "Explained Point", + shape = "Explained Point" + ), size = size * 2) + } else { + gg <- gg + ggplot2::guides(shape = "none", color = "none", linetype = "none") 
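# A minimal end-to-end sketch of this 2D view, following the slise.fit call
# from the roxygen examples above (the epsilon value here is illustrative):
# with a single input column, plot() auto-selects type "2d", and the shaded
# polygon marks the +/- epsilon tube around the fitted line.
#   X <- matrix(rnorm(50), 50, 1)
#   Y <- 2 * X[, 1] + rnorm(50, sd = 0.3)
#   plot(slise.fit(X, Y, epsilon = 0.5), title = "Robust fit")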
+ } + if (partial) { + gg + } else { + plot(gg) + } +} + +#' Plot the robust regression or explanation from slise with distributions +#' +#' @param slise The slise object +#' @param title The title of the plot +#' @param labels The class labels (vector with two strings: c(y_low, y_high), default: c("Low", "High")) +#' @param partial Should the raw ggplot2 objects be returned instead of directly plotting (default: FALSE) +#' @param signif The number of significant digits to display (default: 3) +#' @param ... Ignored parameters +#' +#' @return List of ggplot objects or plot +#' +#' @importFrom stats predict +#' +plot.slise_distribution <- function(slise, + title, + labels = c("Low", "High"), + partial = FALSE, + signif = 3, + ...) { + check_package("ggplot2") + # Distribution plot + variable_names <- names(slise$coefficients) + ord <- order_coefficients(slise) + ord1 <- c(1, ord + 1) + # Distributions + label_factors <- c("All", "Subset", "Predicted") + label_factors <- factor(label_factors, label_factors) + subset_label <- label_factors[c(rep(1, length(slise$Y)), rep(2, sum(slise$subset)))] + df <- data.frame( + x = c(slise$Y, slise$Y[slise$subset], predict(slise, slise$X[slise$subset, ])), + p = ifelse(is.null(slise$y), NA, slise$y), + s = label_factors[c(subset_label, rep(3, sum(slise$subset)))], + l = "Response" + ) + for (i in ord) { + df2 <- data.frame( + x = c(slise$X[, i], slise$X[slise$subset, i]), + p = ifelse(is.null(slise$x), NA, slise$x[i]), + s = subset_label, + l = variable_names[i + slise$intercept] + ) + df <- rbind(df, df2) + } + df$l <- factor(df$l, unique(df$l)) + gg1 <- ggplot2::ggplot(df) + + ggplot2::facet_wrap( + "l", + ncol = 1, + scales = "free", + strip.position = "right" + ) + + ggplot2::theme_bw() + + ggplot2::xlab(NULL) + + ggplot2::ylab(NULL) + + ggplot2::geom_density(ggplot2::aes_string("x", "..count..", fill = "s", linetype = "s"), adjust = 0.5) + + ggplot2::theme( + legend.position = "bottom", + strip.text.y = ggplot2::element_text(angle = 90) + ) + + ggplot2::ggtitle("Dataset Distribution") + + ggplot2::scale_fill_manual( + values = c("white", SLISE_PURPLE, SLISE_WEAKPURPLE), + limits = c("All", "Subset", "Predicted"), + name = NULL + ) + + ggplot2::scale_linetype_manual( + values = c(1, 1, 2), + limits = c("All", "Subset", "Predicted"), + name = NULL + ) + if (!is.null(slise$x)) { + gg1 <- gg1 + + ggplot2::geom_vline( + ggplot2::aes_string(xintercept = "p", color = '"Explained"') + ) + + ggplot2::scale_color_manual( + limits = "Explained", + values = SLISE_ORANGE, + name = NULL + ) + } + # Bars + if (slise$intercept) { + mv <- max(abs(slise$coefficients)) + df <- data.frame( + l = variable_names[ord1], + x = slise$coefficients[ord1] / mv, + v = slise$coefficients[ord1], + r = "Coefficients" + ) + if (!is.null(slise$normalised)) { + mv <- max(abs(slise$normalised[ord1])) + df <- rbind(df, data.frame( + l = variable_names[ord1], + x = slise$normalised[ord1] / mv, + v = slise$normalised[ord1], + r = "Normalised\nModel" + )) + } + } else { + mv <- max(abs(slise$coefficients)) + df <- data.frame( + l = c(" ", variable_names[ord]), + x = c(0, slise$coefficients[ord] / mv), + v = c(0, slise$coefficients[ord]), + r = "Coefficients" + ) + if (!is.null(slise$normalised)) { + mv <- max(abs(slise$normalised[ord])) + df <- rbind(df, data.frame( + l = c(" ", variable_names[ord]), + x = c(0, slise$normalised[ord] / mv), + v = c(0, slise$normalised[ord]), + r = "Normalised\nModel" + )) + } + } + if (!is.null(slise$impact)) { + mv <- max(abs(slise$impact[-1][ord])) + df 
<- rbind(df, data.frame( + l = variable_names[-1][ord], + x = slise$impact[-1][ord] / mv, + v = slise$impact[-1][ord], + r = "Prediction\nImpact" + )) + if (!is.null(slise$normalised_impact)) { + mv <- max(abs(slise$normalised_impact[-1][ord])) + df <- rbind(df, data.frame( + l = variable_names[-1][ord], + x = slise$normalised_impact[-1][ord] / mv, + v = slise$normalised_impact[-1][ord], + r = "Normalised\nImpact" + )) + } + } + df$r <- factor(df$r, c("Normalised\nImpact", "Prediction\nImpact", "Normalised\nModel", "Coefficients")) + df$l <- factor(df$l, df$l[seq_along(ord1)]) + if (nrow(df) <= length(ord1)) { + df$f <- labels[(df$x > 0) + 1] + fill <- ggplot2::scale_fill_manual( + values = c(SLISE_ORANGE, SLISE_PURPLE), + name = "Towards: " + ) + } else { + df$f <- df$r + fill <- ggplot2::scale_fill_manual( + values = c(SLISE_PURPLE, SLISE_DARKPURPLE, SLISE_ORANGE, SLISE_DARKORANGE), + breaks = rev(levels(df$r)), + name = NULL + ) + } + df$lab <- signif2(df$v, signif) + gg2 <- ggplot2::ggplot(df) + + ggplot2::facet_wrap(ggplot2::vars(df$l), ncol = 1, strip.position = "left") + + ggplot2::theme_bw() + + ggplot2::xlab(NULL) + + ggplot2::ylab(NULL) + + ggplot2::ggtitle("Linear Model") + + ggplot2::geom_col(ggplot2::aes_string("r", "x", fill = "f")) + + ggplot2::coord_flip() + + fill + + ggplot2::scale_y_continuous(limits = c(-1, 1)) + + ggplot2::geom_text(ggplot2::aes_string("r", "x", label = "lab"), hjust = "inward") + + ggplot2::theme( + panel.grid.minor.x = ggplot2::element_blank(), + legend.position = "bottom", + axis.text.x = ggplot2::element_blank(), + axis.ticks.x = ggplot2::element_blank(), + strip.text.y = ggplot2::element_text(angle = 180) + ) + if (is.null(slise$normalised) && is.null(slise$x)) { + gg2 <- gg2 + ggplot2::theme( + axis.text.y = ggplot2::element_blank(), + axis.ticks.y = ggplot2::element_blank() + ) + } + # Output + if (partial) { + list(gg1, gg2) + } else { + check_package("grid") + check_package("gridExtra") + gridExtra::grid.arrange( + gg1, + gg2, + ncol = 2, + top = grid::textGrob(title, gp = grid::gpar(cex = 1.2)) + ) + } +} + +#' Plot the robust regression or explanation from slise based on predictions +#' +#' @param slise The slise object +#' @param title The title of the plot +#' @param labels The axis labels (default: c("Response", "Count")) +#' @param partial Should the raw ggplot2 objects be returned instead of directly plotting (default: FALSE) +#' @param approximation Should the approximation density be added (default: TRUE) +#' @param signif The number of significant digits to display (default: 3) +#' @param ... Ignored parameters +#' +#' @return ggplot object or plot +#' +#' @importFrom stats predict +#' +plot.slise_prediction <- function(slise, + title, + labels = c("Response", "Count"), + partial = FALSE, + approximation = TRUE, + signif = 3, + ...) 
{ + check_package("ggplot2") + # Distributions + label_factors <- c("Dataset", "Subset", if (approximation) "Approximation" else NULL) + label_factors <- factor(label_factors, label_factors) + df <- data.frame( + x = c( + slise$Y, + slise$Y[slise$subset], + if (approximation) predict(slise, slise$X[slise$subset, ], logit = TRUE) else NULL + ), + p = ifelse(is.null(slise$y), NA, slise$y), + s = label_factors[c( + rep(1, length(slise$Y)), + rep(2, sum(slise$subset)), + if (approximation) rep(3, sum(slise$subset)) else NULL + )] + ) + gg1 <- ggplot2::ggplot(df) + + ggplot2::theme_bw() + + ggplot2::xlab(labels[1]) + + ggplot2::ylab(labels[2]) + + ggplot2::geom_density(ggplot2::aes_string("x", "..count..", fill = "s", linetype = "s")) + + ggplot2::theme( + legend.position = "right", + strip.text.y = ggplot2::element_text(angle = 0) + ) + + ggplot2::scale_fill_manual( + values = c("white", SLISE_PURPLE, if (approximation) SLISE_WEAKPURPLE else NULL), + limits = c("Dataset", "Subset", if (approximation) "Approximation" else NULL), + name = NULL + ) + + ggplot2::scale_linetype_manual( + values = c(1, 1, if (approximation) 2 else NULL), + limits = c("Dataset", "Subset", if (approximation) "Approximation" else NULL), + name = NULL + ) + + ggplot2::ggtitle(if (is.null(title) || title == "") NULL else title) + + ggplot2::scale_x_continuous(labels = function(x) { + if (slise$logit) base::signif(sigmoid(x), signif) else base::signif(x, signif) + }) + # Explained Line + if (!is.null(slise$x)) { + gg1 <- gg1 + + ggplot2::geom_vline( + ggplot2::aes_string(xintercept = "p", color = '"Explained\nPrediction"') + ) + + ggplot2::scale_color_manual( + limits = "Explained\nPrediction", + values = SLISE_ORANGE, + name = NULL + ) + } + # Output + if (partial) { + gg1 + } else { + plot(gg1) + } +} + +#' Plot the robust regression or explanation from slise as bar plots +#' +#' @param slise The slise object +#' @param title The title of the plot +#' @param labels The class labels (vector with two strings: c(y_low, y_high), default: c("Low", "High")) +#' @param partial Should the raw ggplot2 objects be returned instead of directly plotting (default: FALSE) +#' @param size The size of the segments (default: 8) +#' @param ... Ignored parameters +#' +#' @return List of ggplot objects or plot +#' +#' @importFrom stats quantile +#' +plot.slise_bar <- function(slise, + title, + labels = c("Low", "High"), + partial = FALSE, + size = 8, + ...) 
{ + check_package("ggplot2") + ord <- order_coefficients(slise) + ord1 <- c(1, ord + 1) + # Dataset + if (slise$intercept) { + variable_names <- c("Response", names(slise$coefficients)[ord + 1]) + } else { + variable_names <- names(slise$coefficients)[ord] + } + variable_names <- factor(variable_names, rev(variable_names)) + df <- data.frame( + low = simplify2array(c( + quantile(slise$Y, 0.05), + sapply(ord, function(i) quantile(slise$X[, i], 0.05)), + quantile(slise$Y, 0.25), + sapply(ord, function(i) quantile(slise$X[, i], 0.25)), + quantile(slise$Y[slise$subset], 0.05), + sapply(ord, function(i) quantile(slise$X[slise$subset, i], 0.05)), + quantile(slise$Y[slise$subset], 0.25), + sapply(ord, function(i) quantile(slise$X[slise$subset, i], 0.25)) + )), + high = simplify2array(c( + quantile(slise$Y, 0.95), + sapply(ord, function(i) quantile(slise$X[, i], 0.95)), + quantile(slise$Y, 0.75), + sapply(ord, function(i) quantile(slise$X[, i], 0.75)), + quantile(slise$Y[slise$subset], 0.95), + sapply(ord, function(i) quantile(slise$X[slise$subset, i], 0.95)), + quantile(slise$Y[slise$subset], 0.75), + sapply(ord, function(i) quantile(slise$X[slise$subset, i], 0.75)) + )), + variable = c(variable_names, variable_names, variable_names, variable_names), + col = c( + rep(c("Data 90%", "Data 50%"), each = length(variable_names)), + rep(c("Subset 90%", "Subset 50%"), each = length(variable_names)) + ), + height = c(rep(size, length(variable_names) * 2), rep(size * 0.5, length(variable_names) * 2)), + stringsAsFactors = TRUE + ) + if (!is.null(slise$x)) { + df$point <- c(slise$y, slise$x[ord]) + col <- c("black", "grey", SLISE_ORANGE, SLISE_DARKPURPLE, SLISE_PURPLE) + } else { + col <- c("black", "grey", SLISE_DARKPURPLE, SLISE_PURPLE) + } + gg1 <- ggplot2::ggplot(df) + + ggplot2::geom_segment( + ggplot2::aes_string( + x = "low", + xend = "high", + y = "variable", + yend = "variable", + col = "col" + ), + size = df$height + ) + + ggplot2::scale_color_manual(values = col, name = "Distribution") + + ggplot2::scale_y_discrete(limits = rev(variable_names)) + + ggplot2::theme_bw() + + ggplot2::theme(legend.position = "left") + + ggplot2::xlab(NULL) + + ggplot2::ylab(NULL) + + ggplot2::ggtitle("Dataset") + if (!is.null(slise$x)) { + gg1 <- gg1 + ggplot2::geom_point( + ggplot2::aes_string("point", "variable", col = '"Explained"'), + size = size * 0.5 + ) + } + # Model + if (slise$intercept) { + levels(variable_names)[length(variable_names)] <- "Intercept" + df <- data.frame( + alpha = slise$coefficients[ord1], + names = variable_names + ) + } else { + levels(variable_names)[length(variable_names)] <- "" + df <- data.frame( + alpha = c(0, slise$coefficients[ord]), + names = variable_names + ) + } + df$labels <- labels[(df$alpha > 0) + 1] + gg2 <- ggplot2::ggplot(df) + + ggplot2::geom_col( + ggplot2::aes_string("names", "alpha", fill = "labels") + ) + + ggplot2::coord_flip() + + ggplot2::theme_bw() + + ggplot2::theme() + + ggplot2::xlab(NULL) + + ggplot2::ylab(NULL) + + ggplot2::ggtitle("Linear Model") + + ggplot2::scale_fill_manual( + values = c("#e9a3c9", "#a1d76a"), + limits = labels, + name = "Towards: " + ) + # Normalised + if (!is.null(slise$normalised)) { + if (slise$intercept) { + df <- data.frame( + alpha = slise$normalised[ord1], + names = variable_names + ) + } else { + df <- data.frame( + alpha = c(0, slise$normalised[ord]), + names = variable_names + ) + } + df$labels <- labels[(df$alpha > 0) + 1] + gg3 <- ggplot2::ggplot(df) + + ggplot2::geom_col( + ggplot2::aes_string("names", "alpha", fill = 
"labels") + ) + + ggplot2::coord_flip() + + ggplot2::theme_bw() + + ggplot2::theme() + + ggplot2::xlab(NULL) + + ggplot2::ylab(NULL) + + ggplot2::ggtitle("Normalised Model") + + ggplot2::scale_fill_manual( + values = c("#e9a3c9", "#a1d76a"), + limits = labels, + name = "Towards: " + ) + + ggplot2::theme( + axis.text.y = ggplot2::element_blank(), + axis.ticks.y = ggplot2::element_blank() + ) + gg2 <- gg2 + ggplot2::theme(legend.position = "none") + # Output + out <- list(gg1, gg2, gg3) + } else { + out <- list(gg1, gg2) + } + if (partial) { + out + } else { + check_package("grid") + check_package("gridExtra") + out$ncol <- length(out) + out$top <- grid::textGrob(title, gp = grid::gpar(cex = 1.2)) + do.call(gridExtra::grid.arrange, out) + } +} + +#' Plot the robust regression or explanation from slise as an image +#' +#' @param slise The slise object +#' @param title The title of the plot +#' @param labels The class labels (vector with two strings: c(y_low, y_high), default: c("Low", "High")) +#' @param partial Should the raw ggplot2 objects be returned instead of directly plotting (default: FALSE) +#' @param width The width of the image (width * height == ncol(X)) +#' @param height The height of the image (width * height == ncol(X)) +#' @param plots The number of plots to split the explanation into (default: 1) +#' @param enhance_colours Increse the saturation of the explanation (default: TRUE) +#' @param ... Ignored parameters +#' +#' @return ggplot object(s) or plot +#' +plot.slise_mnist <- function(slise, + title, + labels = c("Low", "High"), + partial = FALSE, + width = floor(sqrt(ncol(slise$X))), + height = width, + plots = 1, + enhance_colours = TRUE, + ...) { + check_package("ggplot2") + if (is.null(slise$x)) { + plots <- 1 + } + if (plots == 1) { + gg <- plot_mnist( + matrix(slise$coefficients[-1], height, width), + if (is.null(slise$x)) NULL else matrix(slise$x, height, width), + labels, + enhance_colours = enhance_colours + ) + + ggplot2::ggtitle(if (is.null(title) || title == "") NULL else title) + if (partial) { + gg + } else { + plot(gg) + } + } else if (plots == 2) { + gg1 <- plot_mnist( + matrix(slise$x, height, width), + colours = c("white", "black"), + enhance_colours = FALSE + ) + + ggplot2::ggtitle("Explained Image") + + ggplot2::theme(legend.position = "left") + gg2 <- plot_mnist( + matrix(slise$coefficients[-1], height, width), + matrix(slise$x, height, width), + labels, + enhance_colours = enhance_colours + ) + + ggplot2::ggtitle("Explanation") + + ggplot2::theme(legend.position = "right") + if (partial) { + list(gg1, gg2) + } else { + check_package("grid") + check_package("gridExtra") + gridExtra::grid.arrange( + gg1, + gg2, + ncol = 2, + top = grid::textGrob(title, gp = grid::gpar(cex = 1.2)) + ) + } + } else if (plots == 3) { + gg1 <- plot_mnist( + matrix(slise$x, height, width), + colours = c("white", "black"), + enhance_colours = FALSE + ) + + ggplot2::ggtitle("Explained Image") + + ggplot2::theme( + legend.position = "none", + plot.title = ggplot2::element_text(hjust = 0.5) + ) + gg2 <- plot_mnist( + matrix(pmin(slise$coefficients[-1], 0), height, width), + matrix(slise$x, height, width), + labels, + enhance_colours = enhance_colours + ) + + ggplot2::ggtitle(paste("Towards", labels[1])) + + ggplot2::theme( + legend.position = "none", + plot.title = ggplot2::element_text(hjust = 0.5) + ) + gg3 <- plot_mnist( + matrix(pmax(slise$coefficients[-1], 0), height, width), + matrix(slise$x, height, width), + labels, + enhance_colours = enhance_colours + ) + + 
ggplot2::ggtitle(paste("Towards", labels[2])) + + ggplot2::theme( + legend.position = "none", + plot.title = ggplot2::element_text(hjust = 0.5) + ) + if (partial) { + list(gg1, gg2, gg3) + } else { + check_package("grid") + check_package("gridExtra") + gridExtra::grid.arrange( + gg1, + gg3, + gg2, + ncol = 3, + top = grid::textGrob(title, gp = grid::gpar(cex = 1.2)) + ) + } + } else { + stop("Unimplemented number of plots") + } +} + +#' @importFrom stats median +plot_mnist <- function(image, + contour = NULL, + labels = NULL, + colours = c(SLISE_DARKORANGE, "white", SLISE_DARKPURPLE), + enhance_colours = TRUE) { + check_package("reshape2") + check_package("ggplot2") + shape <- dim(image) + if (enhance_colours) { + image <- sigmoid(3 * image / max(abs(image))) * 2 - 1 + dim(image) <- shape + } + if (length(colours) == 3) { + limits <- c(-1, 1) * max(abs(image)) + } else { + limits <- c(min(image), max(image)) + } + gg <- ggplot2::ggplot(reshape2::melt(image)) + + ggplot2::geom_raster( + if (length(shape) == 3) { + ggplot2::aes_string("Var3", "Var2", fill = "value") + } else { + ggplot2::aes_string("Var2", "Var1", fill = "value") + }, + interpolate = FALSE + ) + + ggplot2::theme( + axis.title = ggplot2::element_blank(), + axis.text = ggplot2::element_blank(), + axis.ticks = ggplot2::element_blank(), + aspect.ratio = shape[length(shape) - 1] / shape[length(shape)] + ) + + ggplot2::scale_y_reverse(expand = c(0, 0)) + + ggplot2::scale_x_continuous(expand = c(0, 0)) + if (is.null(labels)) { + gg <- gg + ggplot2::scale_fill_gradientn( + colours = colours, + name = NULL, + limits = limits, + breaks = if (enhance_colours) NULL else ggplot2::waiver() + ) + } else { + gg <- gg + ggplot2::scale_fill_gradientn( + colours = colours, + name = NULL, + limits = limits, + labels = labels, + breaks = limits * 0.8, + guide = ggplot2::guide_legend(title = NULL, reverse = TRUE) + ) + } + if (!is.null(contour)) { + if (length(dim(image)) == 3 && length(contour) == dim(image)[2] * dim(image)[3]) { + contour <- rep(contour, dim(image)[1]) + dim(contour) <- dim(image)[c(2, 3, 1)] + contour <- aperm(contour, c(3, 1, 2)) + } + stopifnot(dim(contour) == dim(image)) + gg <- gg + ggplot2::stat_contour( + if (length(shape) == 3) { + ggplot2::aes_string("Var3", "Var2", z = "value") + } else { + ggplot2::aes_string("Var2", "Var1", z = "value") + }, + data = reshape2::melt(contour), + col = "black", + bins = 2 + ) + } + gg +} + +#' Plot the robust regression or explanation from slise as a wordcloud +#' +#' @param slise The slise object +#' @param title The title of the plot +#' @param labels The class labels (vector with two strings: c(y_low, y_high), default: c("Low", "High")) +#' @param treshold Treshold for ignored value (default: 1e-8) +#' @param local Only display the words relevant for the explained item (default: TRUE) +#' @param ... Ignored parameters +#' +#' @return plot +#' +#' @importFrom graphics legend +#' +plot.slise_wordcloud <- function(slise, + title, + labels = c("Low", "High"), + treshold = 1e-8, + local = TRUE, + ...) 
{ + check_package("wordcloud") + mask <- abs(slise$alpha) > treshold + if (local && !is.null(slise$x)) { + if (length(mask) > length(slise$x)) { + mask[1] <- FALSE + mask[-1] <- mask[-1] & (abs(slise$x) > treshold) + } else { + mask <- mask & (abs(slise$x) > treshold) + } + } else if (slise$intercept) { + mask[1] <- FALSE + } + wordcloud::wordcloud( + names(slise$coefficients)[mask], + abs(slise$alpha[mask]), + colors = ifelse(slise$alpha[mask] > 0, SLISE_PURPLE, SLISE_ORANGE), + ordered.colors = TRUE + ) + legend( + "bottom", + legend = labels, + text.col = c(SLISE_ORANGE, SLISE_PURPLE), + horiz = TRUE, + title = "Towards:", + title.col = "black", + bty = "n", + adj = 0.5, + x.intersp = 2 + ) + title(title) +} diff --git a/R/slise.R b/R/slise.R index dd6918b..f4cbcc5 100644 --- a/R/slise.R +++ b/R/slise.R @@ -1,10 +1,428 @@ -#' @useDynLib slise -#' @importFrom Rcpp sourceCpp -"_PACKAGE" -NULL - -.onUnload <- function (libpath) { - library.dynam.unload("slise", libpath) -} - -Rcpp::loadModule("slise_mod", TRUE) +# This script contains the SLISE functions (slise.fit and slise.explain) + + +#' SLISE Regression +#' Use SLISE for robust regression. +#' +#' It is highly recommended that you normalise the data, +#' either before using SLISE or by setting normalise = TRUE. +#' +#' @param X Matrix of independent variables +#' @param Y Vector of the response variable +#' @param epsilon Error tolerance +#' @param lambda1 L1 regularisation coefficient (default: 0) +#' @param lambda2 L2 regularisation coefficient (default: 0) +#' @param weight Optional weight vector (default: NULL) +#' @param intercept Should an intercept be added (default: TRUE) +#' @param normalise Preprocess X and Y by scaling, note that epsilon is not scaled (default: FALSE) +#' @param initialisation Function that gives the initial alpha and beta, or a list containing the initial alpha and beta (default: slise_initialisation_candidates) +#' @param ... Other parameters to the optimiser and initialiser +#' @inheritDotParams graduated_optimisation max_approx beta_max max_iterations debug +#' @inheritDotParams slise_initialisation_candidates num_init beta_max_init pca_treshold +#' +#' @return slise object (coefficients, subset, value, X, Y, lambda1, lambda2, epsilon, scaled, alpha) +#' @export +#' +#' @examples +#' # Assuming data is a data.frame with the first column containing the response +#' # Further assuming newdata is a similar data.frame with the response missing +#' X <- matrix(rnorm(32), 8, 4) +#' Y <- rnorm(8) +#' model <- slise.fit(X, Y, (max(Y) - min(Y)) * 0.1) +#' predicted <- predict(model, X) +slise.fit <- function(X, + Y, + epsilon, + lambda1 = 0, + lambda2 = 0, + weight = NULL, + intercept = TRUE, + normalise = FALSE, + initialisation = slise_initialisation_candidates, + ...) { + # Setup + matprod_default <- options(matprod = "blas") # Use faster math + X <- as.matrix(X) + Y <- c(Y) + X_orig <- X + Y_orig <- Y + if (any(weight < 0)) { + stop("Weights must not be negative!") + } + stopifnot(epsilon > 0) + stopifnot(lambda1 >= 0) + stopifnot(lambda2 >= 0) + # Preprocessing + if (normalise) { + X <- remove_constant_columns(X) + X <- scale_robust(X) + Y <- scale_robust(Y) + if (!intercept) { + stop("Normalisation requires intercept!") + } + } + if (intercept) { + X <- add_intercept_column(X) + } + # Initialisation + if (is.list(initialisation)) { + init <- initialisation + names(init) <- c("alpha", "beta") + } else { + init <- initialisation(X, Y, epsilon = epsilon, weight = weight, ...) 
+ } + # Optimisation + alpha <- graduated_optimisation( + init$alpha, + X, + Y, + epsilon = epsilon, + beta = init$beta, + lambda1 = lambda1, + lambda2 = lambda2, + weight = weight, + ... + )$par + # Output + if (normalise) { + alpha2 <- unscale_alpha(alpha, X, Y) + alpha2 <- add_constant_columns(alpha2, attr(X, "constant_columns") + 1) + alpha <- add_constant_columns(alpha, attr(X, "constant_columns") + 1) + out <- slise.object( + alpha2, + X_orig, + Y_orig, + epsilon * attr(Y, "scaled:scale"), + lambda1, + lambda2, + weight, + intercept, + normalised = alpha + ) + } else { + out <- slise.object(alpha, X_orig, Y, epsilon, lambda1, lambda2, weight, intercept) + } + options(matprod_default) # Reset options + out +} + + +#' SLISE Black Box Explainer +#' Use SLISE for explaining predictions made by a black box. +#' +#' It is highly recommended that you normalise the data, +#' either before using SLISE or by setting normalise = TRUE. +#' +#' @param X Matrix of independent variables +#' @param Y Vector of the dependent variable +#' @param epsilon Error tolerance +#' @param x The sample to be explained (or index if y is null) +#' @param y The prediction to be explained (default: NULL) +#' @param lambda1 L1 regularisation coefficient (default: 0) +#' @param lambda2 L2 regularisation coefficient (default: 0) +#' @param weight Optional weight vector (default: NULL) +#' @param normalise Preprocess X and Y by scaling, note that epsilon is not scaled (default: FALSE) +#' @param logit Logit transform Y from probabilities to real values (default: FALSE) +#' @param initialisation function that gives the initial alpha and beta, or a list containing the initial alpha and beta (default: slise_initialisation_candidates) +#' @param ... Other parameters to the optimiser and initialiser +#' @inheritDotParams graduated_optimisation max_approx beta_max max_iterations debug +#' @inheritDotParams slise_initialisation_candidates num_init beta_max_init pca_treshold +#' +#' @return slise object (coefficients, subset, value, X, Y, lambda1, lambda2, epsilon, scaled, alpha, x, y) +#' @export +#' +#' @examples +#' X <- matrix(rnorm(32), 8, 4) +#' Y <- runif(8, 0, 1) +#' expl <- slise.explain(X, Y, 0.1, 3, lambda1 = 0.01, logit = TRUE) +#' plot(expl, "bar", labels = c("class 1", "class 2")) +slise.explain <- function(X, + Y, + epsilon, + x, + y = NULL, + lambda1 = 0, + lambda2 = 0, + weight = NULL, + normalise = FALSE, + logit = FALSE, + initialisation = slise_initialisation_candidates, + ...) { + # Setup + matprod_default <- options(matprod = "blas") # Use faster math + X <- as.matrix(X) + Y <- c(Y) + if (logit) { + Y <- limited_logit(Y) + } + if (any(weight < 0)) { + stop("Weights must not be negative!") + } + stopifnot(epsilon > 0) + stopifnot(lambda1 >= 0) + stopifnot(lambda2 >= 0) + X_orig <- X + Y_orig <- Y + if (is.null(y)) { + # x is an index + y <- Y[[x]] + x <- X[x, ] + } else if (logit) { + y <- limited_logit(y) + } + x_orig <- x + y_orig <- y + # Preprocessing + if (normalise) { + X <- remove_constant_columns(X) + X <- scale_robust(X) + Y <- if (!logit) { + scale_robust(Y) + } else { + scale_identity(Y) + } + x <- scale_same(x, X) + y <- scale_same(y, Y) + } + # Localise + X <- sweep(X, 2, x) + Y <- Y - y + # Initialisation + if (is.list(initialisation)) { + init <- initialisation + names(init) <- c("alpha", "beta") + } else { + init <- initialisation(X, Y, epsilon = epsilon, weight = weight, ...) 
+ } + # Optimisation + alpha <- graduated_optimisation( + init$alpha, + X, + Y, + epsilon = epsilon, + beta = init$beta, + lambda1 = lambda1, + lambda2 = lambda2, + weight = weight, + ... + )$par + # Output + if (normalise) { + alpha2 <- unscale_alpha(alpha, X, Y) + alpha2 <- add_constant_columns(alpha2[-1], attr(X, "constant_columns")) + alpha2 <- c(y_orig - sum(x_orig * alpha2), alpha2) + alpha <- add_constant_columns(alpha, attr(X, "constant_columns")) + x <- add_constant_columns(x, attr(X, "constant_columns")) + alpha <- c(y - sum(x * alpha), alpha) + out <- slise.object( + alpha2, + X_orig, + Y_orig, + epsilon * attr(Y, "scaled:scale"), + lambda1, + lambda2, + weight, + TRUE, + x = x_orig, + y = y_orig, + impact = c(1, x_orig) * alpha2, + logit = logit, + normalised = alpha, + normalised_x = x, + normalised_y = y, + normalised_impact = c(1, x) * alpha + ) + } else { + alpha <- c(y_orig - sum(x_orig * alpha), alpha) + out <- slise.object( + alpha, + X_orig, + Y_orig, + epsilon, + lambda1, + lambda2, + weight, + TRUE, + x = x_orig, + y = y_orig, + impact = c(1, x_orig) * alpha, + logit = logit + ) + } + options(matprod_default) # Reset options + out +} + +#' SLISE Black Box Explainer +#' Use SLISE for explaining predictions made by a black box. +#' BUT with a binary search for sparsity! +#' +#' @param ... parameters to slise.explain +#' @inheritDotParams slise.explain -lambda1 +#' @param lambda1 the starting value of the search +#' @param variables the target number of non-zero coefficients +#' @param iters number of search iterations +#' @param treshold threshold below which a coefficient counts as zero +#' +#' @return SLISE object +#' @export +#' +slise.explain_find <- function(..., lambda1 = 5, variables = 4, iters = 10, treshold = 1e-4) { + lower <- 0 + upper <- -1 + upper_best <- NULL + lower_best <- NULL + for (j in 1:iters) { + slise <- slise.explain(lambda1 = lambda1, ...) + s <- sparsity(slise$alpha[-1], treshold) + if (s > variables) { + lower_best <- slise + lower <- lambda1 + } else { + upper <- lambda1 + upper_best <- slise + } + if (upper < 0) { + lambda1 <- lambda1 * 2 + } else { + lambda1 <- (upper + lower) * 0.5 + } + } + if (!is.null(upper_best) && sparsity(upper_best$alpha[-1], treshold) == variables) { + upper_best + } else if (is.null(lower_best)) { + slise.explain(lambda1 = lower, ...) + } else { + lower_best + } +} + +#' SLISE Black Box Explainer +#' Use SLISE for explaining predictions made by a black box. +#' BUT with sparsity from a combinatorial search rather than Lasso! +#' +#' @param X matrix of independent variables +#' @param Y vector of the dependent variable +#' @param epsilon error tolerance +#' @param x the sample to be explained (or index if y is null) +#' @param y the prediction to be explained +#' @param ... other parameters to slise.explain +#' @inheritDotParams slise.explain -X -Y -x -y -epsilon +#' @param variables the number of non-zero coefficients +#' +#' @return SLISE object +#' @export +#' +#' @importFrom utils combn +#' +slise.explain_comb <- function(X, Y, epsilon, x, y = NULL, ..., variables = 4) { + if (all(is.null(y))) { + y <- Y[[x]] + x <- X[x, ] + } + X <- as.matrix(X) + len <- ncol(X) + combs <- choose(len, variables) + if (combs >= 30) { + warning(sprintf("The combinatorial search will take a long time (requires %.0f iterations)", combs)) + } + res <- combn(1:len, variables, function(s) { + res <- slise.explain(X[, -s, drop = FALSE], Y, epsilon, x[-s], y, ...) 
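+ # Re-insert the variables dropped in `s` (as zero coefficients, via
+ # add_constant_columns) so that the refitted model lines up with the
+ # columns of the full data matrix X again.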
+ res$X <- X + alpha <- add_constant_columns(res$alpha, s + 1) + res$alpha <- alpha + res$coefficients <- alpha + res + }, simplify = FALSE) + expl <- res[[which.min(sapply(res, function(r) r$value))]] + expl$X <- X + expl +} + +#' Create a result object for SLISE that is similar to other regression method results +#' +#' @param alpha linear model +#' @param X data matrix +#' @param Y response vector +#' @param epsilon error tolerance +#' @param lambda1 L1 regularisation coefficient (default: 0) +#' @param lambda2 L2 regularisation coefficient (default: 0) +#' @param weight weight vector (default: NULL) +#' @param intercept does the model have an intercept (default: FALSE) +#' @param logit has the target been logit-transformed (default: FALSE) +#' @param x explained item x (default: NULL) +#' @param y explained item y (default: NULL) +#' @param ... other variables to add to the SLISE object +#' +#' @return list(coefficients=unscale(alpha), X, Y, scaled=data, lambda1, lambda2, alpha, subset=[r_i= 0, - log(1 + exp(-x)), x - log(1 + exp(x))) - -#' derivative of log-sigmoid function -#' -#' @param x vector -#' @return Derivative of log(sigmoid(x)). -dlog_sigmoid <- function(x) 1 - sigmoid(x) - - -#' Which min n -#' Get the indecies of the n smallest values using partial sort -#' -#' @param x vector -#' @param n the number of indecies -#' @return vector of indecies -which_min_n <- function(x, n) which(x <= sort(x, partial = n)[n])[1:n] - - -#' Sparsity -#' Count the non-zero coefficients -#' -#' @param x vector -#' @param treshold treshold for approximately zero (0) -#' @return number of non-zero values -sparsity <- function(x, treshold = 0) sum(abs(x) > treshold) - -#' Computes log(sum(exp(x))) in a numerically robust way. -#' -#' @param x vector of length n -#' @return log(sum(exp(x))). -log_sum <- function(x) { - xmax <- max(x) - xmax + log(sum(exp(x - xmax))) -} - -#' Computes the logits from probabilities -#' -#' @param p probability (vector) -#' @param stab limit p to [stab, 1-stab] for numerical stability -#' @return log(p / (1 - p)) -logit <- function(p, stab = 0.001) { - p <- pmin(1.0 - stab, pmax(stab, p)) - log(p / (1.0 - p)) -} +# This script contains some utility functions + +library(lbfgs) + +#' sigmoid function +#' +#' @param x vector of real values +#' +#' @return sigmoid(x) +#' +sigmoid <- function(x) 1 / (1 + exp(-x)) + +#' derivative of sigmoid function +#' +#' @param x vector of real values +#' +#' @return Derivative of sigmoid(x). +#' +dsigmoid <- function(x) { + s <- sigmoid(x) + s * (1 - s) +} + +#' log-sigmoid function +#' +#' @param x vector of real values +#' +#' @return log(sigmoid(x)) +#' +log_sigmoid <- function(x) ifelse(x >= 0, - log(1 + exp(-x)), x - log(1 + exp(x))) + +#' derivative of log-sigmoid function +#' +#' @param x vector of real values +#' +#' @return Derivative of log(sigmoid(x)). +#' +dlog_sigmoid <- function(x) 1 - sigmoid(x) + + +#' Which min n +#' Get the indices of the n smallest values using partial sort +#' +#' @param x vector +#' @param n the number of indices +#' +#' @return vector of indices +#' +which_min_n <- function(x, n) which(x <= sort(x, partial = n)[n])[1:n] + + +#' Sparsity +#' Count the non-zero coefficients +#' +#' @param x vector +#' @param treshold threshold for zero +#' +#' @return number of non-zero values +#' +sparsity <- function(x, treshold = .Machine$double.eps) sum(abs(x) > treshold) + +#' Computes log(sum(exp(x))) in a numerically robust way. +#' +#' @param x vector of length n +#' +#' @return log(sum(exp(x))). 
+#' +log_sum <- function(x) { + xmax <- max(x) + xmax + log(sum(exp(x - xmax))) +} + +#' Computes log(sum(exp(x) * y)), +#' or log(sum(exp(x))) if all(y == 0), +#' in a numerically robust way. +#' +#' @param x vector of length n +#' @param y vector of multipliers +#' +#' @return log(sum(exp(x) * y)), or log(sum(exp(x))) if all(y == 0). +#' +log_sum_special <- function(x, y) { + xmax <- max(x) + xexp <- exp(x - xmax) + xsum <- sum(xexp * y) + if (xsum == 0) xsum <- sum(xexp) + xmax + log(xsum) +} + +#' Computes the logits from probabilities +#' +#' @param p probability (vector) +#' @param stab limit p to [stab, 1-stab] for numerical stability +#' +#' @return log(p / (1 - p)) +#' +limited_logit <- function(p, stab = 0.001) { + p <- pmin(1.0 - stab, pmax(stab, p)) + log(p / (1.0 - p)) +} + +# Checks if the object has the attribute +hasattr <- function(object, attribute) { + !is.null(attr(object, attribute)) +} + +# A variant of `signif` that gives "" in case of zero +signif2 <- function(x, num = 5) { + ifelse(abs(x) < .Machine$double.eps, "", signif(x, num)) +} + +# Check if a package is installed +check_package <- function(pack) { + if (!requireNamespace(pack, quietly = TRUE)) { + stop(paste0("Package \"", pack, "\" needed for the function to work. Please install it."), call. = FALSE) + } +} diff --git a/README.md b/README.md index 372726a..b70aa73 100644 --- a/README.md +++ b/README.md @@ -1,86 +1,107 @@ -# SLISE - Sparse Linear Subset Explanations - -R implementation of the SLISE algorithm. The SLISE algorithm can be used for -both robust regression and to explain outcomes from black box models. -For more details see [the paper](https://rdcu.be/bVbda), alternatively for a more informal -overview see [the presentation](vignettes/presentation.pdf), or [the poster](vignettes/poster.pdf). - -> **Björklund A., Henelius A., Oikarinen E., Kallonen K., Puolamäki K.** -> *Sparse Robust Regression for Explaining Classifiers.* -> Discovery Science (DS 2019). -> Lecture Notes in Computer Science, vol 11828, Springer. -> https://doi.org/10.1007/978-3-030-33778-0_27 - - -## Other Languages - -The official Python version can be found [here](https://github.com/edahelsinki/pyslise). - - -## Installation -To install this R-package, proceed as follows. - -First install the `devtools`-package and load it in R: -```R -install.packages("devtools") -library(devtools) -``` - -Then install the `slise` package - -```R -install_github("edahelsinki/slise") -``` - -### Loading -After installation, start R and load the package using -```R -library(slise) -``` - - -## Example - -In order to use SLISE you need to have your data in a numerical matrix (or -something that can be cast to a matrix), and the response as a numerical vector. 
-Below is an example of SLISE being used for robust regression: - -```R -source("experiments/utils.R") -data <- data_pox("fpox", "all") -slise <- slise.fit(X=data$X, Y=data$Y, epsilon=0.1, lambda=0) -title <- sprintf("SLISE as Robust Regression [Intercept = %.0f, Smallpox = %.2f]", - slise$coefficients[1], slise$coefficients[2]) -plot(slise, labels=c("Smallpox", "All Deaths"), title=title) -``` -![Example Plot 1](experiments/results/ex1.jpg) - - -SLISE can also be used to explain an opaque classifiers: - -```R -source("experiments/utils.R") -set.seed(42) -emnist <- data_emnist(10000, classifier="digits2") -slise <- slise.explain(emnist$X, emnist$Y, 3, epsilon = 0.1, lambda = 2, logit = TRUE) -explain(slise, "image", class_labels=c("not 2", "is 2"), title="Using SLISE to explain a handwritten digit") -``` -![Example Plot 1](experiments/results/ex2.jpg) - - -## Dependencies - -SLISE depends on the following R-packages: - - - Rcpp - - lbfgs - - ggplot2 - -The following R-packages are optional, but needed for *some* of the built-in visualisations: - - - grid - - gridExtra - - reshape2 - - scatterplot3d - - crayon - - wordcloud +# SLISE - Sparse Linear Subset Explanations + +R implementation of the SLISE algorithm. The SLISE algorithm can be used for both robust regression and to explain outcomes from black box models. For more details see [the original paper](https://rdcu.be/bVbda) or the [robust regression paper](https://rdcu.be/cFRHD). Alternatively, for a more informal overview, see [the presentation](https://github.com/edahelsinki/slise/raw/master/vignettes/presentation.pdf) or [the poster](https://github.com/edahelsinki/slise/raw/master/vignettes/poster.pdf). + +> *Björklund A., Henelius A., Oikarinen E., Kallonen K., Puolamäki K.* (2019) +> **Sparse Robust Regression for Explaining Classifiers.** +> Discovery Science (DS 2019). +> Lecture Notes in Computer Science, vol 11828, Springer. +> https://doi.org/10.1007/978-3-030-33778-0_27 + +> *Björklund A., Henelius A., Oikarinen E., Kallonen K., Puolamäki K.* (2022). +> **Robust regression via error tolerance.** +> Data Mining and Knowledge Discovery. +> https://doi.org/10.1007/s10618-022-00819-2 + + +## The idea + +In robust regression we fit regression models that can handle data that contains outliers (see the example below for why outliers are problematic for normal regression). SLISE accomplishes this by fitting a model such that the largest possible subset of the data items has an error smaller than a given tolerance. All items with a larger error are considered potential outliers and do not affect the resulting model. + +SLISE can also be used to provide *local model-agnostic explanations* for outcomes from black box models. To do this we replace the ground truth response vector with the predictions from the complex model. Furthermore, we force the model to fit a selected item (making the explanation local). This gives us a local approximation of the complex model with a simpler linear model (this is similar to, e.g., [LIME](https://github.com/marcotcr/lime) and [SHAP](https://github.com/slundberg/shap)). In contrast to other methods, SLISE creates explanations using real data (not discretised and randomly sampled data), so we can be sure that all inputs are valid (i.e., they lie in the correct data manifold and follow the constraints used to generate the data, e.g., the laws of physics).
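+
+To make the objective concrete, here is a small illustrative sketch (the helper below is hypothetical and not part of the package API): given a linear model `alpha` and an error tolerance `epsilon`, SLISE looks for the model that makes the following count as large as possible.
+
+```R
+# Illustrative only: the size of the subset that a model `alpha` (with
+# intercept) captures within the error tolerance `epsilon`. The actual
+# optimiser maximises a smooth, sigmoid-based approximation of this
+# count, so that it can be optimised with gradient-based methods.
+subset_size <- function(alpha, X, Y, epsilon) {
+    residuals <- Y - cbind(1, X) %*% alpha
+    sum(residuals^2 <= epsilon^2)
+}
+```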
+ + +## Installation + +First install the `devtools`-package: + +```R +install.packages("devtools") +``` + +Then install the `slise` package: + +```R +devtools::install_github("edahelsinki/slise") +``` + +After installation, load the package using: + +```R +library(slise) +``` + + +## Other Languages + +The official Python version can be found [here](https://github.com/edahelsinki/pyslise). + + +## Example + +In order to use SLISE you need to have your data in a numerical matrix (or something that can be cast to a matrix), and the response as a numerical vector. Below is an example of SLISE being used for robust regression: + +```R +library(ggplot2) +source("experiments/regression/utils.R") +set.seed(42) + +x <- seq(-1, 1, length.out = 50) +y <- -x + rnorm(50, 0, 0.15) +x <- c(x, rep(seq(1.6, 1.8, 0.1), 2)) +y <- c(y, rep(c(1.8, 1.95), each = 3)) + +ols <- lm(y ~ x)$coefficients +slise <- slise.fit(x, y, epsilon = 0.5) + +plot(slise, title = "", partial = TRUE, size = 2) + + geom_abline(aes(intercept = ols[1], slope = ols[2], color = "OLS", linetype = "OLS"), size = 2) + + scale_color_manual(values = c("#1b9e77", SLISE_ORANGE), name = NULL) + + scale_linetype_manual(values = 2:1, name = NULL) + + theme(axis.title.y = element_text(angle = 0, vjust = 0.5), legend.key.size = grid::unit(2, "line")) + + guides(shape = FALSE, color = "legend", linetype = "legend") +``` +![Robust Regression Example Plot](experiments/results/ex1.png) + + +SLISE can also be used to explain predictions from black box models such as convolutional neural networks: + +```R +devtools::load_all() +source("experiments/regression/data.R") +set.seed(42) + +emnist <- data_emnist(10000, digit=2, th = -1) +slise <- slise.explain(emnist$X, emnist$Y, 0.5, emnist$X[17,], emnist$Y[17], 3, 6) + +plot(slise, "image", "", c("not 2", "is 2"), plots = 1) +``` +![Explanation Example Plot](experiments/results/ex2.png) + + +## Dependencies + +SLISE depends on the following R-packages: + +- Rcpp +- RcppArmadillo +- lbfgs + +The following R-packages are optional, but needed for *some* of the built-in visualisations: + +- ggplot2 +- grid +- gridExtra +- reshape2 +- crayon +- wordcloud diff --git a/experiments/README.md b/experiments/README.md index fd4c0d7..a9ae94f 100644 --- a/experiments/README.md +++ b/experiments/README.md @@ -1,51 +1,9 @@ -# Experiments - -This directory contains the experiments used in the paper. - - -## Files - - - Helper functions - - utils.R - - lime.R - - collect_results.R - - Data downloading and preparation - - data/retrieve_*.R - - Experiments (some are designed to run on a cluster) - - exp_*.R - - Plotting results for the paper - - plot_*.R - - -## Dependencies - -For running all the experiments and gathering all the data the following -packages are needed, note that not all packages are needed individual -experiments: - - - MASS - - MTE - - robustHD - - sparseLTSEigen - - robustbase - - glmnet - - R.utils - - ggplot2 - - dplyr - - scales - - abind - - magick - - lime - - xtable - - plyr - - keras - - R.matlab - - tm - - qdap - - Matrix - - SnowballC - - e1071 - - randomForest - - elmNNRcpp - - latex2exp - - pense +# Experiments + +This directory contains experiments used in the papers. + +- data: Contains scripts for pre-processing data (and training classifiers). +- conference: Contains out-of-date experiments for the conference paper. +- regression: Contains experiments for the robust regression paper. +- explanations: Contains experiments for the explanation paper. 
+ diff --git a/experiments/conference/README.md b/experiments/conference/README.md new file mode 100644 index 0000000..2335fe5 --- /dev/null +++ b/experiments/conference/README.md @@ -0,0 +1,52 @@ +# Experiments + +This directory contains the experiments used in the conference paper. + +> These files are out-of-date due to a large refactoring of the SLISE code. + +## Files + + - Helper functions + - utils.R + - lime.R + - collect_results.R + - Data downloading and preparation + - data/retrieve_*.R + - Experiments (some are designed to run on a cluster) + - exp_*.R + - Plotting results for the paper + - plot_*.R + + +## Dependencies + +For running all the experiments and gathering all the data the following +packages are needed; note that not all packages are needed for individual +experiments: + + - MASS + - MTE + - robustHD + - sparseLTSEigen + - robustbase + - glmnet + - R.utils + - ggplot2 + - dplyr + - scales + - abind + - magick + - lime + - xtable + - plyr + - keras + - R.matlab + - tm + - qdap + - Matrix + - SnowballC + - e1071 + - randomForest + - elmNNRcpp + - latex2exp + - pense diff --git a/experiments/collect_results.R b/experiments/conference/collect_results.R similarity index 100% rename from experiments/collect_results.R rename to experiments/conference/collect_results.R diff --git a/experiments/exp_optimality.R b/experiments/conference/exp_optimality.R similarity index 100% rename from experiments/exp_optimality.R rename to experiments/conference/exp_optimality.R diff --git a/experiments/exp_robustness.R b/experiments/conference/exp_robustness.R similarity index 100% rename from experiments/exp_robustness.R rename to experiments/conference/exp_robustness.R diff --git a/experiments/exp_scalability.R b/experiments/conference/exp_scalability.R similarity index 100% rename from experiments/exp_scalability.R rename to experiments/conference/exp_scalability.R diff --git a/experiments/lime.R b/experiments/conference/lime.R similarity index 100% rename from experiments/lime.R rename to experiments/conference/lime.R diff --git a/experiments/plot_explanation.R b/experiments/conference/plot_explanation.R similarity index 100% rename from experiments/plot_explanation.R rename to experiments/conference/plot_explanation.R diff --git a/experiments/plot_introduction.R b/experiments/conference/plot_introduction.R similarity index 100% rename from experiments/plot_introduction.R rename to experiments/conference/plot_introduction.R diff --git a/experiments/plot_regression.R b/experiments/conference/plot_regression.R similarity index 100% rename from experiments/plot_regression.R rename to experiments/conference/plot_regression.R diff --git a/experiments/utils.R b/experiments/conference/utils.R similarity index 100% rename from experiments/utils.R rename to experiments/conference/utils.R diff --git a/experiments/data/RootParser.cpp b/experiments/data/RootParser.cpp index d0aa2c1..7b592c7 100644 --- a/experiments/data/RootParser.cpp +++ b/experiments/data/RootParser.cpp @@ -1,64 +1,93 @@ -// This script is used to extract tabular information about the jets to a csv -// Run this script with: -// root -l -b -q RootParser.cpp -// From the folder with the root data file downloaded from https://hot.hip.fi/index.php/2018/11/14/hip-cms-opendata-jet-tuples-8-tev-mc/ -// Note that this requires that you have root installed (http://root.cern.ch/) - -#define RootParser_cxx -#include "RootParser.h" -#include -#include -#include - - -void RootParser::Loop() -{ -// In a ROOT session, you can do: -// root> .L 
RootParser.C -// root> RootParser t -// root> t.GetEntry(12); // Fill t data members with entry number 12 -// root> t.Show(); // Show values of entry 12 -// root> t.Show(16); // Read and show values of entry 16 -// root> t.Loop(); // Loop on all entries -// -// This is the loop skeleton where: -// jentry is the global entry number in the chain -// ientry is the entry number in the current Tree -// Note that the argument to GetEntry must be: -// jentry for TChain::GetEntry -// ientry for TTree::GetEntry and TBranch::GetEntry -// - fChain->SetBranchStatus("*",0); // disable all branches - fChain->SetBranchStatus("jetPt",1); // activate branchname - fChain->SetBranchStatus("jetEta",1); // activate branchname - fChain->SetBranchStatus("jetGirth",1); // activate branchname - fChain->SetBranchStatus("jetTightID",1); // activate branchname - fChain->SetBranchStatus("isPhysUDS",1); // activate branchname - fChain->SetBranchStatus("isPhysG",1); // activate branchname - fChain->SetBranchStatus("isPhysOther",1); // activate branchname - fChain->SetBranchStatus("QG_ptD",1); // activate branchname - fChain->SetBranchStatus("QG_axis2",1); // activate branchname - fChain->SetBranchStatus("QG_mult",1); // activate branchname - fChain->SetBranchStatus("jetGenMatch",1); // activate branchname - if (fChain == 0) return; - - std::ofstream csvfile; - csvfile.open ("jets.csv"); - csvfile << "isPhysUDS, jetPt, jetGirth, QG_ptD, QG_axis2, QG_mult" << std::endl; - - - Long64_t nentries = fChain->GetEntriesFast(); - Long64_t nbytes = 0, nb = 0; - for (Long64_t jentry=0; jentryGetEntry(jentry); nbytes += nb; - if (jetEta > 2 || jetEta < -2) continue; - if (jetTightID != 1) continue; - if (jetGenMatch != 1) continue; - if (isPhysOther) continue; - if (isnan(QG_ptD)) continue; - csvfile << isPhysUDS << "," << jetPt << "," << jetGirth << "," << QG_ptD << "," << QG_axis2 << "," << QG_mult << std::endl; - } - csvfile.close(); -} +// This script is used to extract tabular and image information about the jets to csvs +// Run this script with (install ROOT from https://root.cern.ch first): +// root -l -b -q RootParser.cpp +// This assumes the file CMSOpenDataJets_MC_8TeV_500K.root is in the same directory. 
+// The data file in root format can be downloaded from https://hot.hip.fi/index.php/2018/11/14/hip-cms-opendata-jet-tuples-8-tev-mc/ + +#define RootParser_cxx +#include "RootParser.h" +#include <fstream> +#include <iostream> +#include <cmath> + +#define IMAGE_SIZE 18 + +void RootParser::Loop() +{ + fChain->SetBranchStatus("*", 0); // disable all branches + fChain->SetBranchStatus("jetPt", 1); + fChain->SetBranchStatus("jetEta", 1); + fChain->SetBranchStatus("jetTightID", 1); + fChain->SetBranchStatus("isPhysUDS", 1); + fChain->SetBranchStatus("isPhysG", 1); + fChain->SetBranchStatus("isPhysOther", 1); + fChain->SetBranchStatus("QG_ptD", 1); + fChain->SetBranchStatus("QG_axis2", 1); + fChain->SetBranchStatus("QG_mult", 1); + fChain->SetBranchStatus("jetGenMatch", 1); + fChain->SetBranchStatus("nPF", 1); + fChain->SetBranchStatus("PF_pT", 1); + fChain->SetBranchStatus("PF_dR", 1); + fChain->SetBranchStatus("PF_dTheta", 1); + + if (fChain == 0) + return; + + std::ofstream csvfile; + csvfile.open("jets.csv"); + csvfile << "isPhysUDS, jetPt, QG_ptD, QG_axis2, QG_mult" << std::endl; + + std::ofstream imgfile; + imgfile.open("jets_img.csv"); + imgfile << "isPhysUDS"; + for (size_t i = 0; i < IMAGE_SIZE; i++) + for (size_t j = 0; j < IMAGE_SIZE; j++) + imgfile << ", " + << "pixel_" << i << "_" << j; + imgfile << std::endl; + + Long64_t nentries = fChain->GetEntriesFast(); + Long64_t nbytes = 0, nb = 0; + for (Long64_t jentry = 0; jentry < nentries; jentry++) + { + Long64_t ientry = LoadTree(jentry); + if (ientry < 0) + break; + nb = fChain->GetEntry(jentry); + nbytes += nb; + + // Some basic filtering of the jets: + if (jetEta > 2 || jetEta < -2) + continue; + if (jetTightID != 1) + continue; + if (jetGenMatch != 1) + continue; + if (isPhysOther) + continue; + if (isnan(QG_ptD)) + continue; + + csvfile << isPhysUDS << "," << jetPt << "," << QG_ptD << "," << QG_axis2 << "," << QG_mult << std::endl; + imgfile << isPhysUDS; + Float_t image[IMAGE_SIZE][IMAGE_SIZE] = {}; + for (size_t i = 0; i < nPF; i++) + { + Float_t r = PF_dR[i]; + r = -r * r * 0.666666 + r * 1.666666; + r = r / 0.9; + if (r >= 1) + continue; + Float_t theta = PF_dTheta[i]; + int x = (int)((std::sin(theta) * r + 1) * 0.5 * IMAGE_SIZE); + int y = (int)((std::cos(theta) * r + 1) * 0.5 * IMAGE_SIZE); + image[x][y] += PF_pT[i]; + } + for (size_t i = 0; i < IMAGE_SIZE; i++) + for (size_t j = 0; j < IMAGE_SIZE; j++) + imgfile << "," << image[i][j]; + imgfile << std::endl; + } + csvfile.close(); + imgfile.close(); +} diff --git a/experiments/data/retrieve_aclimdb.R b/experiments/data/retrieve_aclimdb.R index 2d936bb..250a8ba 100644 --- a/experiments/data/retrieve_aclimdb.R +++ b/experiments/data/retrieve_aclimdb.R @@ -1,235 +1,247 @@ -## -------------------------------------------------- -## Retrieve and preprocess the -## "Large Movie Review Dataset" -## -## http://ai.stanford.edu/~amaas/data/sentiment/ -## -------------------------------------------------- -## -## Usage: -## -## Rscript --vanilla retrieve_aclimdb.R destdir -## -## This will download the data, preprocess it -## and create an rds-file containing training -## and testing data. 
-## -## The argument 'destdir' specifies the destination directory -## -------------------------------------------------- - -## -------------------------------------------------- -## Libraries -## -------------------------------------------------- -library(tm) -library(qdap) -library(Matrix) -library(SnowballC) - -library(e1071) -library(randomForest) -library(elmNNRcpp) - -source("utils.R") - -aclimdb_clean <- function(source) { - data <- VCorpus(source, readerControl = list(language = "en")) - data <- tm_map(data, content_transformer(function(x) gsub("
", " ", x))) - data <- tm_map(data, stripWhitespace) - data <- tm_map(data, content_transformer(tolower)) - data <- tm_map(data, content_transformer(removePunctuation)) - data <- tm_map(data, content_transformer(removeNumbers)) - data <- tm_map(data, content_transformer(qdap::bracketX)) - data <- tm_map(data, removeWords, stopwords("english")) - data <- tm_map(data, stemDocument) - data -} - -aclimdb_get_review <- function(data, index, set="test") { - id <- rownames(data$X)[[index]] - cls <- if (data$R[[index]] == 0) "neg" else "pos" - path <- file.path("experiments/data/aclImdb", set, cls, id) - file <- file(path, encoding = "UTF-8") - review <- paste(readLines(file, warn = FALSE)) - close(file) - review <- stringr::str_replace_all(review, "

", " ") - aclimbd_vectorise(review, colnames(data$X)) -} - -aclimbd_vectorise <- function(review, cols) { - data <- aclimdb_clean(VectorSource(stringr::str_split(review, " ")[[1]])) - vec <- c(rep(0, length(cols))) - tokens <- sapply(data$content, function(d) d$content) - for (d in tokens) { - sel <- which(d == cols) - vec[sel] <- vec[sel] + 1 - } - vec <- vec / max(vec) - list(vec = vec, tokens = tokens, text = review) -} - - -# This is only run when called from Rscript -if (sys.nframe() == 0L) { - - ## -------------------------------------------------- - ## Command-line arguments - ## -------------------------------------------------- - args <- commandArgs(trailingOnly = TRUE) - if (length(args) < 1) - destdir <- "experiments/data" - else - destdir <- args[1] - - ## -------------------------------------------------- - ## Retrieve the data - ## -------------------------------------------------- - cat("Downloading Data\n") - dataURL <- "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz" - download.file(dataURL, destfile = file.path(destdir, basename(dataURL))) - untar(file.path(destdir, basename(dataURL)), exdir = destdir) - - - ## -------------------------------------------------- - ## Helper functions - ## -------------------------------------------------- - preprocess_data <- function(dpath) { - aclimdb_clean(DirSource(dpath, encoding = "UTF-8")) - } - - make_sparse_mat <- function(dtm) { - Matrix::sparseMatrix(i = dtm$i, j = dtm$j, x = dtm$v) - } - - make_dtm <- function(data, N = NULL) { - - dtm_data <- DocumentTermMatrix(data) - - ## Make a sparse matrix - M <- make_sparse_mat(dtm_data) - - ## Find N most frequent terms - cs <- colSums(M) - ind <- order(cs, decreasing = TRUE) - if (! is.null(N)) - colnames(dtm_data)[ind][1:N] - else - colnames(dtm_data)[ind] - } - - make_dictionary <- function(data_pos, data_neg, N) { - list("common" = make_dtm(c(data_pos, data_neg), N)) - } - - discretize_documents <- function(data, dic) { - DocumentTermMatrix(data, control = list(dictionary = dic)) - } - - make_dataset <- function(data_pos = NULL, data_neg = NULL, dir_pos = NULL, dir_neg = NULL, dic) { - if (! 
(is.null(dir_pos) & is.null(dir_neg))) { - data_pos <- preprocess_data(dir_pos) - data_neg <- preprocess_data(dir_neg) - } - - dataset <- rbind( - as.matrix(discretize_documents(data_pos, dic = unlist(dic))), - as.matrix(discretize_documents(data_neg, dic = unlist(dic)))) - - dataset <- dataset / apply(dataset, 1, max) - dataset[is.na(dataset)] <- 0 - - dataset <- as.data.frame(dataset) - - list("data" = dataset, "class" = as.factor(c(rep(c("pos", "neg"), each=length(data_neg))))) - } - - get_acc <- function(model, dataset_test) { - res <- predict(model, newdata = dataset_test) - sum(as.character(dataset_test$class) == as.character(res)) / nrow(dataset_test) - } - - - ## -------------------------------------------------- - ## Define directories with the data - ## -------------------------------------------------- - dir_train_pos <- file.path(destdir, "aclImdb/train/pos/") - dir_train_neg <- file.path(destdir, "aclImdb/train/neg/") - - dir_test_pos <- file.path(destdir, "aclImdb/test/pos/") - dir_test_neg <- file.path(destdir, "aclImdb/test/neg/") - - data_train_pos <- preprocess_data(dir_train_pos) - data_train_neg <- preprocess_data(dir_train_neg) - - - ## -------------------------------------------------- - ## (1) Create a dictionary with N words - ## (2) Make datasets - ## (3) Create classifiers - ## (4) Save - ## -------------------------------------------------- - - cat("Processing Data\n") - N_words <- 1000 - dict <- make_dictionary(data_pos = data_train_pos, data_neg = data_train_neg, N = N_words) - dataset_train <- make_dataset(data_pos = data_train_pos, data_neg = data_train_neg, dic = dict) - dataset_test <- make_dataset(dir_pos = dir_test_pos, dir_neg = dir_test_neg, dic = dict) - - ## Create models - cat("Training SVM\n") - model_svm <- svm(x = dataset_train$data, y = dataset_train$class, kernel = "radial", probability = TRUE) - saveRDS(model_svm, file.path(destdir, "aclimdb_model_svm.rds"), compress = "xz") - - cat("Training ELM\n") - model_elm <- elm_train(as.matrix(dataset_train$data), onehot_encode(as.numeric(dataset_train$class) - 1), - nhid = 1000, actfun = "sig", init_weights = "uniform_negative", bias = TRUE, verbose = TRUE) - saveRDS(model_elm, file.path(destdir, "aclimdb_model_elm.rds"), compress = "xz") - - cat("Training RF\n") - model_rf <- randomForest(x = dataset_train$data, y = dataset_train$class) - saveRDS(model_rf, file.path(destdir, "aclimdb_model_rf.rds"), compress = "xz") - - cat("Training LogReg\n") - model_lr <- glm(dataset_train$class ~ ., dataset_train$data, family="binomial", model=FALSE) - model_lr[c("data", "y", "model", "residuals", "weights", "fitted.values", - "prior.weights", "na.action", "linear.predictors", "effects", "R")] <- NULL - model_lr$qr$qr <- NULL - saveRDS(model_lr, file.path(destdir, "aclimdb_model_lr.rds"), compress = "xz") - - ## Functions for making predictions - pred_svm <- function(model, data) { - p_svm <- predict(model, newdata = data, probability = TRUE) - unname(attr(p_svm, "probabilities")[, 1]) - } - - pred_elm <- function(model, data) { - p_elm <- elm_predict(model, newdata = as.matrix(data), normalize = TRUE) - p_elm[, 2] - } - - pred_rf <- function(model, data) { - p_rf <- predict(model, newdata = data, type = "prob") - unname(p_rf[, 2]) - } - - pred_lr <- function(model, data) { - p_lr <- predict(model, newdata = data) - sigmoid(unname(p_lr)) - } - - - ## Probability of item being positive - cat("Predicting Training Data\n") - dataset_train$prob_svm <- pred_svm(model_svm, dataset_train$data) - dataset_train$prob_elm <- 
pred_elm(model_elm, dataset_train$data) - dataset_train$prob_rf <- pred_rf(model_rf, dataset_train$data) - dataset_train$prob_lr <- pred_lr(model_lr, dataset_train$data) - saveRDS(dataset_train, file.path(destdir, "aclimdb_data_train.rds"), compress = "xz") - - cat("Predicting Test Data\n") - dataset_test$prob_svm <- pred_svm(model_svm, dataset_test$data) - dataset_test$prob_elm <- pred_elm(model_elm, dataset_test$data) - dataset_test$prob_rf <- pred_rf(model_rf, dataset_test$data) - dataset_test$prob_lr <- pred_lr(model_lr, dataset_test$data) - saveRDS(dataset_test, file.path(destdir, "aclimdb_data_test.rds"), compress = "xz") - - ## -------------------------------------------------- -} +## -------------------------------------------------- +## Retrieve and preprocess the +## "Large Movie Review Dataset" +## +## http://ai.stanford.edu/~amaas/data/sentiment/ +## -------------------------------------------------- +## +## Usage: +## +## Rscript --vanilla retrieve_aclimdb.R [destdir] +## +## This will download the data, preprocess it and create an +## rds-file containing training and testing data. +## +## The argument 'destdir' specifies where the data and models +## should be saved (default: "experiments/data"). +## +## Alternatively it can be sourced from a script to only get +## access to the get_data and get_review functions. +## +## -------------------------------------------------- + + +library(stringr) + + +read_text_file <- function(path) { + file <- file(path, encoding = "UTF-8") + str <- readChar(path, file.info(path)$size) + close(file) + str_replace_all(str, "((
<br />)|(\\n)|(\\\\n))", "\n") +} + +aclimdb_get_data <- function(set = "test", datadir = "experiments/data") { + data <- readRDS(file.path(datadir, paste0("aclimdb_data_", set, ".rds"))) + pred <- readRDS(file.path(datadir, paste0("aclimdb_pred_", set, ".rds"))) + data$prediction <- pred + data +} + +aclimdb_get_review <- function(index, data = NULL, set = "test", datadir = "experiments/data") { + if (is.null(data)) { + data <- aclimdb_get_data(set, datadir = datadir) + } + str <- readRDS(file.path(datadir, paste0("aclimdb_str_", set, ".rds"))) + + list( + x = data$data[index, ], + y = data$class[index], + svm = data$prediction$svm[index], + elm = data$prediction$elm[index], + rf = data$prediction$rf[index], + str = str[[index]] + ) +} + + +# This is only run when called from Rscript +if (sys.nframe() == 0L) { + ## -------------------------------------------------- + ## Libraries + ## -------------------------------------------------- + library(tm) + library(Matrix) + library(e1071) + library(randomForest) + library(elmNNRcpp) + + ## -------------------------------------------------- + ## Helper functions + ## -------------------------------------------------- + aclimdb_clean <- function(source) { + data <- VCorpus(source, readerControl = list(language = "en")) + data <- tm_map(data, content_transformer(function(x) gsub("((
<br />)|(\\\\n)|(\\n))", " ", x))) + data <- tm_map(data, content_transformer(function(x) gsub("(<|>|\\(|\\)|\\[|\\]|\\{|\\})", " ", x))) + data <- tm_map(data, stripWhitespace) + data <- tm_map(data, content_transformer(tolower)) + data <- tm_map(data, content_transformer(removePunctuation)) + data <- tm_map(data, content_transformer(removeNumbers)) + data <- tm_map(data, removeWords, stopwords("english")) + data <- tm_map(data, stemDocument) + data + } + + aclimdb_select_words <- function(data, num_words = NULL) { + dtm_data <- DocumentTermMatrix(data) + + ## Find N most frequent terms + mat <- sparseMatrix(i = dtm_data$i, j = dtm_data$j, x = dtm_data$v) + cs <- colSums(mat) + ind <- order(cs, decreasing = TRUE) + if (!is.null(num_words)) { + colnames(dtm_data)[ind[1:num_words]] + } else { + colnames(dtm_data)[ind] + } + } + + aclimdb_make_dataset <- function(data_pos, data_neg, words) { + dataset <- rbind( + as.matrix(DocumentTermMatrix(data_pos, control = list(dictionary = words))), + as.matrix(DocumentTermMatrix(data_neg, control = list(dictionary = words))) + ) + dataset <- sweep(dataset, 1, apply(dataset, 1, max) + 0.1, `/`) + classes <- factor(c("pos", "neg")) + + list( + "data" = as.data.frame(dataset), + "class" = classes[c(rep(1, length(data_pos)), rep(2, length(data_neg)))] + ) + } + + ## -------------------------------------------------- + ## Command-line arguments + ## -------------------------------------------------- + args <- commandArgs(trailingOnly = TRUE) + if (length(args) < 1) { + destdir <- "experiments/data" + } else { + destdir <- args[1] + } + dir.create(file.path(destdir), showWarnings = FALSE) + + + ## -------------------------------------------------- + ## Paths + ## -------------------------------------------------- + data_url <- "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz" + path_data <- file.path(destdir, basename(data_url)) + path_str_test <- file.path(destdir, "aclimdb_str_test.rds") + path_str_train <- file.path(destdir, "aclimdb_str_train.rds") + path_data_test <- file.path(destdir, "aclimdb_data_test.rds") + path_data_train <- file.path(destdir, "aclimdb_data_train.rds") + path_models <- file.path(destdir, "aclimdb_models.rds") + path_pred_test <- file.path(destdir, "aclimdb_pred_test.rds") + path_pred_train <- file.path(destdir, "aclimdb_pred_train.rds") + + + ## -------------------------------------------------- + ## Retrieve the data + ## -------------------------------------------------- + if (!file.exists(path_data)) { + cat("Downloading Data\n") + download.file(data_url, destfile = path_data) + } + if (!file.exists(path_str_test) || !file.exists(path_str_train)) { + cat("Extracting Data\n") + dir <- tempdir() + untar(path_data, "*.txt", exdir = dir) + test_str <- list( + pos = sapply(list.files(file.path(dir, "aclImdb/test/pos"), full.names = TRUE), read_text_file), + neg = sapply(list.files(file.path(dir, "aclImdb/test/neg"), full.names = TRUE), read_text_file) + ) + saveRDS(test_str, path_str_test, compress = "xz") + train_str <- list( + pos = sapply(list.files(file.path(dir, "aclImdb/train/pos"), full.names = TRUE), read_text_file), + neg = sapply(list.files(file.path(dir, "aclImdb/train/neg"), full.names = TRUE), read_text_file) + ) + saveRDS(train_str, path_str_train, compress = "xz") + unlink(file.path(dir, "aclImdb"), TRUE) + } else { + cat("Loading Data\n") + test_str <- readRDS(path_str_test) + train_str <- readRDS(path_str_train) + } + + + ## -------------------------------------------------- + ## Preprocess the data + ## 
-------------------------------------------------- + if (!file.exists(path_data_test) || !file.exists(path_data_train)) { + cat("Processing Data\n") + test_pos <- aclimdb_clean(VectorSource(test_str$pos)) + test_neg <- aclimdb_clean(VectorSource(test_str$neg)) + train_pos <- aclimdb_clean(VectorSource(train_str$pos)) + train_neg <- aclimdb_clean(VectorSource(train_str$neg)) + + num_words <- 1000 + words <- aclimdb_select_words(c(train_pos, train_neg), num_words) + dataset_train <- aclimdb_make_dataset(train_pos, train_neg, words) + saveRDS(dataset_train, path_data_train, compress = "xz") + dataset_test <- aclimdb_make_dataset(test_pos, test_neg, words) + saveRDS(dataset_test, path_data_test, compress = "xz") + } else { + cat("Loading Processed Data\n") + dataset_train <- readRDS(path_data_train) + dataset_test <- readRDS(path_data_test) + } + + + ## -------------------------------------------------- + ## Train models + ## -------------------------------------------------- + if (!file.exists(path_models)) { + cat("Training SVM\n") + model_svm <- svm(x = dataset_train$data, y = dataset_train$class, kernel = "radial", probability = TRUE) + cat("Training ELM\n") + model_elm <- elm_train(as.matrix(dataset_train$data), onehot_encode(as.numeric(dataset_train$class) - 1), + nhid = 1000, actfun = "sig", init_weights = "uniform_negative", bias = TRUE, verbose = TRUE + ) + cat("Training RF\n") + model_rf <- randomForest(x = dataset_train$data, y = dataset_train$class) + + saveRDS(list(svm = model_svm, elm = model_elm, rf = model_rf), path_models, compress = "xz") + } else { + cat("Loading Models\n") + models <- readRDS(path_models) + model_svm <- models$svm + model_elm <- models$elm + model_rf <- models$rf + } + + + ## -------------------------------------------------- + ## Making predictions + ## -------------------------------------------------- + pred_svm <- function(model, data) { + p_svm <- predict(model, newdata = data, probability = TRUE) + unname(attr(p_svm, "probabilities")[, 1]) + } + + pred_elm <- function(model, data) { + p_elm <- elm_predict(model, newdata = as.matrix(data), normalize = TRUE) + p_elm[, 2] + } + + pred_rf <- function(model, data) { + p_rf <- predict(model, newdata = data, type = "prob") + unname(p_rf[, 2]) + } + if (!file.exists(path_pred_test)) { + cat("Predicting Test Data\n") + saveRDS(data.frame( + svm = pred_svm(model_svm, dataset_test$data), + elm = pred_elm(model_elm, dataset_test$data), + rf = pred_rf(model_rf, dataset_test$data) + ), path_pred_test, compress = "xz") + } + if (!file.exists(path_pred_train)) { + cat("Predicting Train Data\n") + saveRDS(data.frame( + svm = pred_svm(model_svm, dataset_train$data), + elm = pred_elm(model_elm, dataset_train$data), + rf = pred_rf(model_rf, dataset_train$data) + ), path_pred_train, compress = "xz") + } +} diff --git a/experiments/data/retrieve_census.R b/experiments/data/retrieve_census.R deleted file mode 100644 index fbbdbbb..0000000 --- a/experiments/data/retrieve_census.R +++ /dev/null @@ -1,35 +0,0 @@ -## -------------------------------------------------- -## Retrieve the UCI "adult" data -## -------------------------------------------------- -## -## Usage: -## -## Rscript --vanilla retrieve_cencus.R -## -## -------------------------------------------------- - -# This is only run when called from Rscript -if (sys.nframe() == 0L) { - destdir <- "experiments/data" - - ## -------------------------------------------------- - ## Define Paths - ## -------------------------------------------------- - data_url <- 
"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data" - csv_path <- file.path(destdir, "census.csv") - data_path <- file.path(destdir, "census.rds") - - ## -------------------------------------------------- - ## Retrieve the data - ## -------------------------------------------------- - download.file(data_url, destfile=csv_path) - data <- read.csv(csv_path, header = FALSE, strip.white = TRUE) - colnames(data) <- c("age", "workclass", "fnlwgt", "education", - "education_num", "marital_status", "occupation", - "relationship", "race", "sex", "capital_gain", - "capital_loss", "hours_per_week", "native_country", - "income") - data <- data[data$workclass != "?" & data$occupation != "?" & data$native_country != "?",] - data <- droplevels(data) - saveRDS(data, data_path, compress = "xz") -} diff --git a/experiments/data/retrieve_emnist.R b/experiments/data/retrieve_emnist.R index c9ca1fe..baa6484 100644 --- a/experiments/data/retrieve_emnist.R +++ b/experiments/data/retrieve_emnist.R @@ -1,120 +1,200 @@ -## -------------------------------------------------- -## Retrieve and preprocess EMNIST -## -## https://www.westernsydney.edu.au/bens/home/reproducible_research/emnist -## -------------------------------------------------- -## -## Usage: -## -## Rscript --vanilla retrieve_emnist.R destdir -## -## This will download the data, preprocess it -## and create an rds-file containing an image -## matrix (40 000 X 784) and a label vector -## (40 000 X 1). -## -## The argument "destdir" specifies the destination directory -## -------------------------------------------------- - -library(keras) - -train_model <- function(X, Y, smoothing=0.1, epochs=1) { - shuff <- sample.int(length(Y)) - X <- X[shuff, ] - Y <- Y[shuff] - # Label Smoothing - Y <- to_categorical(Y) - Y <- Y * (1 - smoothing) + smoothing / ncol(Y) - n_class <- ncol(Y) - # Neural Network - model <- keras_model_sequential() - model %>% - layer_reshape(list(28, 28, 1), list(784)) %>% - #Conv 1 - layer_conv_2d(filters = 8, kernel_size=3, activation = "relu", padding="same") %>% - layer_batch_normalization() %>% - layer_max_pooling_2d() %>% - # Conv 2 - layer_conv_2d(filters = 16, kernel_size=3, activation = "relu", padding="same") %>% - # layer_batch_normalization() %>% - layer_max_pooling_2d() %>% - # Dense - layer_flatten(input_shape = c(7, 7)) %>% - layer_dense(units = 32, activation = "relu") %>% - # layer_dropout(rate = 0.3) %>% - # Output - layer_dense(units = n_class, activation = "softmax") - model %>% compile( - optimizer = "adam", - loss = "categorical_crossentropy", - metrics = c("accuracy") - ) - # Train - model %>% fit(X, Y, epochs = epochs, batch_size = 512, validation_split = 0.2) - invisible(model) -} - - -# This is only run when called from Rscript -if (sys.nframe() == 0L) { - - ## -------------------------------------------------- - ## Command-line arguments - ## -------------------------------------------------- - args <- commandArgs(trailingOnly = TRUE) - - if (length(args) < 1) - destdir <- "experiments/data" - else - destdir <- args[1] - dir.create(file.path(destdir), showWarnings = FALSE) - - ## -------------------------------------------------- - ## Define Paths - ## -------------------------------------------------- - dataURL <- "https://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/matlab.zip" - zipPath <- file.path(destdir, "emnist.zip") - matPath <- file.path(destdir, "emnist-digits.mat") - rdsPath <- file.path(destdir, "emnist.rds") - highPath <- file.path(destdir, "emnist_high") - evenPath <- 
file.path(destdir, "emnist_even") - digiPath <- file.path(destdir, "emnist_digits") - - ## -------------------------------------------------- - ## Retrieve the data - ## -------------------------------------------------- - download.file(dataURL, destfile=zipPath) - unzip(zipPath, "matlab/emnist-digits.mat", exdir=destdir, junkpaths=TRUE) - - ## -------------------------------------------------- - ## Save the data - ## -------------------------------------------------- - library(R.matlab) - emnist <- readMat(matPath) - train <- emnist$dataset[[1]] - train_image <- train[[1]] / 127.5 - 1 - train_label <- train[[2]] - test <- emnist$dataset[[2]] - image <- test[[1]] / 127.5 - 1 # 40 000 X 784 (28^2) - label <- test[[2]] # 40 000 X 1 - - saveRDS(list(image=image, label=label), rdsPath, compress = "xz") - - ## -------------------------------------------------- - ## Train Models - ## -------------------------------------------------- - install_keras() - - print("EMNIST High-Low") - model <- train_model(train_image, train_label > 4) - save_model_hdf5(model, paste0(highPath, ".hdf5")) - saveRDS(predict(model, image)[, 2], paste0(highPath, ".rds"), compress="xz") - print("EMNIST Even-Odd") - model <- train_model(train_image, (train_label %% 2) == 0) - save_model_hdf5(model, paste0(evenPath, ".hdf5")) - saveRDS(predict(model, image)[, 2], paste0(evenPath, ".rds"), compress="xz") - print("EMNIST Digits") - model <- train_model(train_image, train_label) - save_model_hdf5(model, paste0(digiPath, ".hdf5")) - saveRDS(predict(model, image), paste0(digiPath, ".rds"), compress="xz") -} +## -------------------------------------------------- +## Retrieve and preprocess EMNIST +## +## https://www.westernsydney.edu.au/bens/home/reproducible_research/emnist +## -------------------------------------------------- +## +## Usage: +## +## Rscript --vanilla experiments/data/retrieve_emnist.R [destdir] +## +## This will download the data, preprocess it and create an rds-file containing +## a data matrix (40 000 X 784) and a label vector (40 000 X 1). +## +## The argument 'destdir' specifies where the data and models +## should be saved (default: "experiments/data"). +## +## Alternatively it can be sourced from a script to only access the functions. +## +## -------------------------------------------------- + +library(keras) + +emnist_model <- function() { + model <- keras_model_sequential() + # Average pooling to avoid noise when doing activation maximisation. 
+ model %>% + layer_reshape(c(28, 28, 1), c(784)) %>% + layer_conv_2d(filters = 16, kernel_size = 3, activation = "relu", padding = "same") %>% + layer_average_pooling_2d() %>% + layer_conv_2d(filters = 16, kernel_size = 3, activation = "relu", padding = "valid") %>% + layer_average_pooling_2d() %>% + layer_batch_normalization() %>% + layer_conv_2d(filters = 8, kernel_size = 3, padding = "valid") %>% + layer_average_pooling_2d() %>% + layer_flatten(input_shape = c(2, 2), name = "flat") %>% + layer_activation("relu") %>% + layer_dense(units = 32, activation = "relu") %>% + layer_dropout(0.5) %>% + layer_dense(units = 10, activation = "softmax") + model %>% compile( + optimizer = "adam", + loss = "categorical_crossentropy", + metrics = c("accuracy") + ) + model +} + +emnist_train <- function(model, X, Y, smoothing = 0.2, epochs = 2) { + Y <- to_categorical(Y) + if (smoothing > 0) { + Y <- Y * (1 - smoothing) + smoothing / ncol(Y) + } + model %>% fit(X, Y, epochs = epochs, batch_size = 512, validation_split = 0.2) +} + +emnist_get_data <- function(datadir = "experiments/data") { + data <- readRDS(file.path(datadir, "emnist.rds")) + data$prediction <- readRDS(file.path(datadir, "emnist_preds.rds")) + data +} + +emnist_get_internal <- function(datadir = "experiments/data") { + readRDS(file.path(datadir, "emnist_internal.rds")) +} + +# Visualise internal workings of a neural network +# - Activation Maximation +# - Visualising internal nodes (at the flattening layer) +# - loosely based on: https://blog.keras.io/how-convolutional-neural-networks-see-the-world.html +activation_maximisation <- function(model, X, node, layer = "flat", iterations = 20, step_size = 0.05, l2 = 100, smooth = TRUE, verbose = TRUE) { + output_node <- get_layer(model, "flat")$output[, node] + + # Loss that maximises a specific node with regularisation to "dim" irrelevant pixels + loss <- k_mean(output_node) - l2 * k_mean((model$input - 0.5)^2) + # Normalised gradients + grads <- k_gradients(loss, model$input) + grads <- grads / (k_sqrt(k_mean(k_square(grads), 0, TRUE)) + 1e-5) + # Gradient ascent + image <- model$input + grads * step_size # - 0.5 + # Enforce [0, 1] constraints for the images + # scale <- k_sqrt(k_maximum(k_max(k_abs(image), 0, TRUE) * 2.0, 1.0)) + # image <- image / scale + 0.5 + image <- k_clip(image, 0.0, 1.0) + # Create a tensorflow function that can be called iteratively + image <- k_reshape(image, k_shape(model$input)) + diff <- k_mean(k_abs(image - model$input)) + iterate <- k_function(c(model$input), c(image, diff)) + + # Push the images closer to the neutral 0.5 + X <- 0.2 + X * 0.6 # + runif(length(X), -0.1, 0.1) + diff <- 0 + # Iteratively update the images + for (i in 1:iterations) { + out <- iterate(X) + X <- out[[1]] + diff <- diff + out[[2]] + } + + # Optional Gaussian blur (3x3) to make interpretation easier + if (smooth) { + kernel_index <- c(-29, -28, -27, -1, 0, 1, 27, 28, 29) + kernel_weight <- c(1, 2, 1, 2, 4, 2, 1, 2, 1) + X2 <- X + for (i in 1:784) { + index <- i + kernel_index + mask <- index > 0 & index < 785 & abs(((index - 1) %% 28) - ((i - 1) %% 28)) < 2 + X[, i] <- apply(X2[, index[mask]], 1, weighted.mean, kernel_weight[mask]) + } + } + + if (verbose) { + cat(sprintf("Activation Maximisations for node %2d trained! 
(md: %g)\n", node, diff)) + } + X +} + + + +# This is only run when called from Rscript +if (sys.nframe() == 0L) { + suppressPackageStartupMessages(library(R.matlab)) + + # This is the easiest way to get k_gradient to work + suppressPackageStartupMessages(library(tensorflow)) + tf$compat$v1$disable_eager_execution() + + ## -------------------------------------------------- + ## Command-line arguments + ## -------------------------------------------------- + args <- commandArgs(trailingOnly = TRUE) + if (length(args) < 1) { + destdir <- "experiments/data" + } else { + destdir <- args[1] + } + dir.create(file.path(destdir), showWarnings = FALSE) + + ## -------------------------------------------------- + ## Define Paths + ## -------------------------------------------------- + url_data <- "https://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/matlab.zip" + path_zip <- file.path(destdir, "emnist.zip") + path_test <- file.path(destdir, "emnist.rds") + path_model <- file.path(destdir, "emnist_model.hdf5") + path_pred <- file.path(destdir, "emnist_preds.rds") + path_internal <- file.path(destdir, "emnist_internal.rds") + + ## -------------------------------------------------- + ## Retrieve the data + ## -------------------------------------------------- + if (!file.exists(path_zip)) { + cat("Downloading Data...\n") + download.file(url_data, destfile = path_zip) + } + + cat("Reading Data...\n") + emnist <- readMat(unz(path_zip, "matlab/emnist-digits.mat")) + train <- emnist$dataset[[1]] + train_image <- train[[1]] / 255 + train_label <- c(train[[2]]) + test <- emnist$dataset[[2]] + test_image <- test[[1]] / 255 # 40 000 X 784 (28^2) + test_label <- c(test[[2]]) # 40 000 X 1 + rm(emnist, train, test) + + if (!file.exists(path_test)) { + saveRDS(list(image = test_image, label = test_label), path_test, compress = "xz") + } + + ## -------------------------------------------------- + ## Train Model + ## -------------------------------------------------- + if (!file.exists(path_model)) { + print("Training Model...\n") + model <- emnist_model() + emnist_train(model, train_image, train_label) + save_model_hdf5(model, path_model) + } else { + model <- load_model_hdf5(path_model) + } + if (!file.exists(path_pred)) { + predictions <- predict(model, test_image) + saveRDS(predictions, path_pred, compress = "xz") + } + + ## -------------------------------------------------- + ## Activation Maximisations + ## -------------------------------------------------- + if (!file.exists(path_internal)) { + cat("Training Activation Maximisations...\n") + selected <- seq(100, nrow(test_image), 100) # 30990 + model2 <- keras_model(inputs = model$input, outputs = get_layer(model, "flat")$output) + internal <- predict(model2, test_image) + colnames(internal) <- paste("Neuron", 1:ncol(internal)) + nodes <- lapply(1:ncol(internal), function(i) { + activation_maximisation(model, test_image[selected, , drop = FALSE], i) + }) + saveRDS(list(internal = internal, nodes = nodes, selected = selected), path_internal, compress = "xz") + } +} \ No newline at end of file diff --git a/experiments/data/retrieve_jets.R b/experiments/data/retrieve_jets.R index 7a50b93..2c81261 100644 --- a/experiments/data/retrieve_jets.R +++ b/experiments/data/retrieve_jets.R @@ -1,88 +1,156 @@ -## -------------------------------------------------- -## Train models for jets -## -------------------------------------------------- -## -## Usage: -## -## Rscript --vanilla retrieve_jets.R -## -## This will train the jet-classifiers. 
-## -## -------------------------------------------------- - -library(keras) - -train_model_tab <- function(X, Y, smoothing = 0.1, epochs = 5) { - shuff1 <- which(Y == 1) - shuff2 <- which(Y == 0) - len <- min(length(shuff1), length(shuff2)) - shuff <- sample(c(sample(shuff1, len), sample(shuff2, len))) - X <- X[shuff, ] - Y <- Y[shuff] - # Label Smoothing - Y <- to_categorical(Y) - Y <- Y * (1 - smoothing) + smoothing / ncol(Y) - # Neural Network - model <- keras_model_sequential() - model %>% - layer_batch_normalization(input_shape = 5) %>% - layer_dense(units = 16, activation = "relu") %>% - layer_batch_normalization() %>% - layer_dense(units = 32, activation = "relu") %>% - layer_batch_normalization() %>% - layer_dense(units = 16, activation = "relu") %>% - layer_batch_normalization() %>% - layer_dense(units = 2, activation = "softmax") - model %>% compile( - optimizer = "adam", - loss = "categorical_crossentropy", - metrics = c("accuracy") - ) - # Train - model %>% fit(X, Y, epochs = epochs, batch_size = 128, validation_split = 0.2) - invisible(model) -} - - -# This is only run when called from Rscript -if (sys.nframe() == 0L) { - - ## -------------------------------------------------- - ## Define Paths - ## -------------------------------------------------- - data_tab_in <- "experiments/data/jets.csv" - data_tab_out <- "experiments/data/jets.rds" - model_tab_out <- "experiments/data/jets.hdf5" - - - ## -------------------------------------------------- - ## Read the data - ## -------------------------------------------------- - if (file.exists(data_tab_in)) { - cat("Reading data\n") - data_tab <- as.matrix(read.csv(data_tab_in)) - label_tab <- data_tab[, 1] - data_tab <- data_tab[, -1] - } else if (file.exists(data_tab_out)) { - cat("Reading data\n") - data_tab <- readRDS(data_tab_out) - label_tab <- data_tab$R - data_tab <- data_tab$X - } else { - stop("Jets need to be extracted from a root file first! (see RootParser.cpp)") - } - - ## -------------------------------------------------- - ## Train Models - ## -------------------------------------------------- - install_keras() - - cat("Training tabular model\n") - model_tab <- train_model_tab(data_tab, label_tab) - cat("Saving tabular model\n") - save_model_hdf5(model_tab, model_tab_out) - cat("Predicting jets\n") - predicted_tab <- predict(model_tab, data_tab) - cat("Saving tabular data\n") - saveRDS(list(X = data_tab, Y = predicted_tab[, 2], R = label_tab), data_tab_out, compress = "xz") -} +## -------------------------------------------------- +## Train models for jets +## -------------------------------------------------- +## +## Usage: +## +## Rscript --vanilla retrieve_jets.R +## +## This will train the jet-classifiers. 
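+## Both classifiers are trained with label smoothing: a one-hot target y is
+## softened to y * (1 - s) + s / k for k classes (with s = 0.2 and k = 2 the
+## hard 0/1 targets become 0.1/0.9), discouraging over-confident predictions.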
+##
+## --------------------------------------------------
+
+library(keras)
+
+jets_train_model_img <- function(X, Y, smoothing = 0.2, epochs = 5) {
+    # Label Smoothing
+    Yc <- to_categorical(Y)
+    Yc <- Yc * (1 - smoothing) + smoothing / ncol(Yc)
+    # Neural Network
+    model <- keras_model_sequential()
+    model %>% # 18 > 14 > 7 > 5 > 3 > Dense
+        layer_reshape(list(18, 18, 1), list(18 * 18)) %>%
+        layer_locally_connected_2d(filters = 16, kernel_size = 5, activation = "relu", padding = "valid") %>%
+        layer_batch_normalization() %>%
+        layer_max_pooling_2d() %>%
+        layer_locally_connected_2d(filters = 32, kernel_size = 3, activation = "relu", padding = "valid") %>%
+        layer_batch_normalization() %>%
+        layer_locally_connected_2d(filters = 16, kernel_size = 3, activation = "relu", padding = "valid") %>%
+        layer_flatten(input_shape = c(3, 3)) %>%
+        layer_dense(units = 32, activation = "relu") %>%
+        layer_dense(units = 2, activation = "sigmoid")
+    model %>% compile(
+        optimizer = "adam",
+        loss = "categorical_crossentropy",
+        metrics = c("accuracy")
+    )
+    # Train
+    model %>% fit(
+        X,
+        Yc,
+        epochs = epochs,
+        batch_size = 128,
+        validation_split = 0.2,
+        class_weight = as.list(length(Y) / table(Y))
+    )
+    model
+}
+
+jets_train_model_tab <- function(X, Y, smoothing = 0.2, epochs = 5) {
+    # Label Smoothing
+    Yc <- to_categorical(Y)
+    Yc <- Yc * (1 - smoothing) + smoothing / ncol(Yc)
+    # Neural Network
+    model <- keras_model_sequential()
+    model %>%
+        layer_batch_normalization(input_shape = 4) %>%
+        layer_dense(units = 16, activation = "relu") %>%
+        layer_batch_normalization() %>%
+        layer_dense(units = 32, activation = "relu") %>%
+        layer_batch_normalization() %>%
+        layer_dense(units = 16, activation = "relu") %>%
+        layer_batch_normalization() %>%
+        layer_dense(units = 2, activation = "sigmoid")
+    model %>% compile(
+        optimizer = "adam",
+        loss = "categorical_crossentropy",
+        metrics = c("accuracy")
+    )
+    # Train
+    model %>% fit(
+        X,
+        Yc,
+        epochs = epochs,
+        batch_size = 128,
+        validation_split = 0.2,
+        class_weight = as.list(length(Y) / table(Y))
+    )
+    model
+}
+
+jets_get_data_img <- function() {
+    readRDS("experiments/data/jets_img.rds")
+}
+
+jets_get_data_tab <- function() {
+    readRDS("experiments/data/jets.rds")
+}
+
+
+# This is only run when called from Rscript
+if (sys.nframe() == 0L) {
+
+    ## --------------------------------------------------
+    ## Define Paths
+    ## --------------------------------------------------
+    url_data <- "https://hot.hip.fi/tuples/CMSOpenDataJets_QCD_MC_8TeV_500K.root"
+    path_root <- "experiments/data/CMSOpenDataJets_MC_8TeV_500K.root"
+    path_img_csv <- "experiments/data/jets_img.csv"
+    path_img_rds <- "experiments/data/jets_img.rds"
+    path_img_model <- "experiments/data/jets_img.hdf5"
+    path_tab_csv <- "experiments/data/jets.csv"
+    path_tab_rds <- "experiments/data/jets.rds"
+    path_tab_model <- "experiments/data/jets.hdf5"
+
+
+    ## --------------------------------------------------
+    ## Retrieve the data
+    ## --------------------------------------------------
+    if (!file.exists(path_root)) {
+        cat("Downloading Data\n")
+        download.file(url_data, destfile = path_root)
+    }
+
+    if (!file.exists(path_img_rds) || !file.exists(path_tab_rds)) {
+        if (!file.exists(path_img_csv) || !file.exists(path_tab_csv)) {
+            cat("Extracting data\n")
+            # This requires ROOT to be installed: https://root.cern.ch
+            system("root -l -b -q experiments/data/RootParser.cpp")
+        }
+        cat("Reading data\n")
+        data_tab <- as.matrix(read.csv(path_tab_csv))
+        label_tab <- data_tab[, 1]
+        data_tab <- data_tab[, -1]
+        data_img <- as.matrix(read.csv(path_img_csv))
+        label_img <- data_img[, 1]
+        data_img <- data_img[, -1]
+    } else {
+        data_tab <- readRDS(path_tab_rds)
+        label_tab <- data_tab$R
+        data_tab <- data_tab$X
+        data_img <- readRDS(path_img_rds)
+        label_img <- data_img$R
+        data_img <- data_img$X
+    }
+
+    ## --------------------------------------------------
+    ## Train Models
+    ## --------------------------------------------------
+    cat("Training image model\n")
+    model_img <- jets_train_model_img(data_img, label_img)
+    cat("Saving image model\n")
+    save_model_hdf5(model_img, path_img_model)
+    cat("Predicting images\n")
+    predicted_img <- predict(model_img, data_img)
+    cat("Saving image data\n")
+    saveRDS(list(X = data_img, Y = predicted_img[, 2], R = label_img), path_img_rds, compress = "xz")
+
+    cat("Training tabular model\n")
+    model_tab <- jets_train_model_tab(data_tab, label_tab)
+    cat("Saving tabular model\n")
+    save_model_hdf5(model_tab, path_tab_model)
+    cat("Predicting jets\n")
+    predicted_tab <- predict(model_tab, data_tab)
+    cat("Saving tabular data\n")
+    saveRDS(list(X = data_tab, Y = predicted_tab[, 2], R = label_tab), path_tab_rds, compress = "xz")
+}
diff --git a/experiments/explanations/README.md b/experiments/explanations/README.md
new file mode 100644
index 0000000..32d5d70
--- /dev/null
+++ b/experiments/explanations/README.md
@@ -0,0 +1,31 @@
+# Experiments
+
+This directory contains explanation experiments.
+
+## Dependencies
+
+To run all the experiments and gather all the data, the following packages
+are needed. Note that not all of the packages are needed for the individual
+experiments:
+
+- ggplot2
+- xtable
+- R.matlab
+- tm
+- Matrix
+- e1071
+- randomForest
+- elmNNRcpp
+- datasets
+- reticulate
+- keras
+- tensorflow
+
+### Reticulate dependencies
+
+These are Python packages that are accessed through reticulate:
+
+- `keras::install_keras()`
+- `tensorflow::install_tensorflow()`
+- `reticulate::py_install("lime")`
+- `reticulate::py_install("shap")`
diff --git a/experiments/explanations/data.R b/experiments/explanations/data.R
new file mode 100644
index 0000000..de23f20
--- /dev/null
+++ b/experiments/explanations/data.R
@@ -0,0 +1,146 @@
+
+library(datasets)
+
+DATA_DIR <- "experiments/data"
+
+
+#' Get EMNIST data
+#'
+#' @param digit the digit used as the positive class
+#' @param n number of images (default = -1 == all images)
+#' @param d number of dimensions (default = -1 == all dimensions)
+#' @param th discard dimensions with variance less than th
+#' @param balance should the number of samples in each class be balanced
+#' @param index item whose true class replaces digit (default = -1 == use digit as given)
+#'
+#' @return list(X = data, Y = prediction, R = real_class, mask = selected image indices)
+#'
+data_emnist <- function(digit = 2, n = -1, d = -1, th = -1, balance = TRUE, index = -1) {
+    emnist <- readRDS(file.path(DATA_DIR, "emnist.rds"))
+    X <- emnist$image
+    R <- c(emnist$label)
+    Y <- readRDS(file.path(DATA_DIR, "emnist_preds.rds"))
+    if (index > 0) {
+        digit <- R[index]
+    }
+    mask <- NULL
+    if (balance) {
+        mask <- which(R == digit)
+        if (n > 0 && length(mask) > n / 2) {
+            mask <- sample(mask, n / 2)
+        }
+        mask <- c(mask, sample(which(R != digit), length(mask)))
+        mask <- sample(mask)
+        X <- X[mask, , drop = FALSE]
+        Y <- Y[mask, digit + 1]
+        R <- R[mask]
+    } else if (n > 0 && length(R) > n) {
+        mask <- sample.int(length(R), n)
+        X <- X[mask, , drop = FALSE]
+        Y <- Y[mask, digit + 1]
+        R <- R[mask]
+    } else {
+        Y <- Y[, digit + 1]
+    }
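+    # Name the 784 pixel columns by their image coordinates; the first index
+    # cycles fastest, matching how the flattened 28x28 images are stored.
+    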
colnames(X) <- sprintf("Pixel[%02d,%02d]", rep(1:28, 28), rep(1:28, each = 28)) + if (th >= 0) { + X <- X[, apply(X, 2, var, na.rm = TRUE) > th, drop = FALSE] + } + if (d > 0 && ncol(X) > d) { + X <- X[, seq(1, ncol(X), length.out = d)] + } + list(X = X, Y = Y, R = R, mask = mask, name = paste("emnist", digit)) +} + + +#' Get EMNIST internal data +#' +#' @param index index of item used as a start for the activation maximisation +#' @param data output from data_emnist() +#' +#' @return list(X = data, X2 = internal_states, Y = prediction, R = real_class, mask = selected image indices, nodes = activation maximisation, selected = the item used for the AM) +#' +data_emnist_internal <- function(index = 2400, data = data_emnist(index = index)) { + internal <- readRDS(file.path(DATA_DIR, "emnist_internal.rds")) + if (is.null(data$mask)) { + data$X2 <- internal$internal + data$selected <- index + } else { + data$X2 <- internal$internal[data$mask, , drop = FALSE] + data$selected <- which(data$mask == index) + } + sel <- which(internal$selected == index) + if (length(sel) != 1) { + stop(paste("Index ", index, "has no prepared activation maximisation")) + } + data$nodes <- lapply(internal$nodes, function(n) n[sel, ]) + data$name <- "emnist internal" + data +} + +#' Get Physics data +#' +#' @param n number of jets +#' @param img image or tabular (default) format +#' +#' @return list(X = data matrix, Y = prediction, R = real_class) +#' +data_jets <- function(img = FALSE, n = -1) { + data <- readRDS(file.path(DATA_DIR, if (img) "jets_img.rds" else "jets.rds")) + if (n > 0) { + mask <- sample(c(sample(which(data$R == 1), n / 2), sample(which(data$R == 0), n / 2))) + data$R <- data$R[mask] + data$Y <- data$Y[mask] + data$X <- data$X[mask, ] + } + data$name <- if (img) "jet images" else "jets" + data +} + +#' Get imdb review data +#' +#' @param n number of reviews +#' @param set test/train dataset +#' @param model classifier (svm, rf, elm) +#' +#' @return list(X = data matrix, Y = prediction, R = real_class) +#' +data_imdb <- function(n = -1, set = "test", model = "svm") { + tmp <- readRDS(file.path(DATA_DIR, paste0("aclimdb_data_", set, ".rds"))) + X <- as.matrix(tmp$data) + R <- as.numeric(tmp$class == "pos") + tmp <- readRDS(file.path(DATA_DIR, paste0("aclimdb_pred_", set, ".rds"))) + Y <- as.numeric(tmp[[model]]) + if (n > 0 && n < length(Y)) { + mask <- sample.int(length(Y), n) + X <- X[mask, ] + Y <- Y[mask] + R <- R[mask] + } + list(X = X, Y = Y, R = R, name = "imdb") +} + +#' Get mtcars data +#' +#' @param model type of model to train (rf, lm, or svm) +#' +#' @return list(X = data matrix, Y = predictions, R = real_value) +#' +data_mtcars <- function(model = "rf") { + if (model == "rf") { + mod <- randomForest::randomForest(mpg ~ ., mtcars) + } else if (model == "lm") { + mod <- lm(mpg ~ ., mtcars) + } else if (model == "svm") { + mod <- e1071::svm(mpg ~ ., mtcars) + } else { + stop("Unknown model type") + } + list( + X = as.matrix(mtcars[-1]), + Y = predict(mod, mtcars), + R = mtcars$mpg, + model = mod, + name = "mtcars" + ) +} \ No newline at end of file diff --git a/experiments/explanations/exp_comparison.R b/experiments/explanations/exp_comparison.R new file mode 100644 index 0000000..22a214e --- /dev/null +++ b/experiments/explanations/exp_comparison.R @@ -0,0 +1,391 @@ +## -------------------------------------------------- +## Compare the explanations from different model-agnostic post-hoc explainers +## -------------------------------------------------- +## +## Usage: +## +## Rscript --vanilla 
experiments/exp_comparison.R index [python] +## +## Parameters: +## +## index : Specify the job index (1-90) +## python : Optional python executable +## +## Notes: +## This is designed to be run in parallel on a cluster. +## The number of jobs should be a multiple of 90. +## Run the script without an index to produce plots. +## +## Running Manually: +## Rscript --vanilla experiments/explanations/exp_comparison.R 1:90 +## Rscript --vanilla experiments/explanations/exp_comparison.R +## -------------------------------------------------- + +library(reticulate) +library(keras) +suppressMessages(suppressWarnings(library(dplyr))) + +suppressMessages(source("experiments/explanations/data.R")) +suppressMessages(source("experiments/explanations/methods.R")) + + +evaluate_expl <- function(expl_fn, predict_fn, data, index, destroy_fn, treshold = 0.1, relevance_k = 0.1, coverage_k = 100, coherence_k = 20, stability_k = 20, stability_noise = 0.2) { + dist <- apply(data$X, 1, data$distance_fn, data$X[index, ]) + knn <- order(dist)[2:(coverage_k + 1)] + expl <- expl_fn(data$X, data$Y, index, predict_fn) + approx_Y <- expl$approx_fn(data$X) + # Fidelity (can the explanation be used for prediction) + fidelity_a <- abs(approx_Y[index] - data$Y[index]) + fidelity_b <- mean(abs(expl$approx_fn(expl$neighbourhood$X) - expl$neighbourhood$Y), na.rm = TRUE) + # Coverage (how general is the explanation) + coverage_a <- mean(abs(approx_Y - data$Y) < treshold, na.rm = TRUE) + coverage_b <- mean(abs(approx_Y[knn] - data$Y[knn]) < treshold, na.rm = TRUE) + medianerr <- median(abs(approx_Y - data$Y), na.rm = TRUE) + # Relevance (how quickly can the explanation destroy the prediction) + if (relevance_k < 1) { + relevance_k <- relevance_k * ncol(data$X) + } + corruption <- seq(1, relevance_k, length.out = 20) + important <- order(expl$impact, decreasing = TRUE) + inverse <- destroy_fn(data$X[index, ]) + impinv <- do.call(rbind, lapply(corruption, function(i) { + x <- data$X[index, ] + x[important[1:i]] <- inverse[important[1:i]] + x + })) + relevance <- c(1, predict_fn(impinv) / data$Y[index]) + gc_rpy(TRUE) + # Coherence (do the knn have the same explanation) + x_orig <- data$X[index, , drop = FALSE] + y_orig <- data$Y[index] + x_norm_a <- NULL + hm_norm_a <- NULL + fx_a <- NULL + fix_a <- NULL + fxi_a <- NULL + fixi_a <- NULL + for (i in order(dist)[2:(coherence_k + 1)]) { + if (i != index) { + expl2 <- expl_fn(data$X, data$Y, i, predict_fn) + x_new <- data$X[i, , drop = FALSE] + # Adding the (implicit) intercept not present in the saliency maps + x_norm_a <- c(x_norm_a, euclidean_distance(c(1, x_new), c(1, x_orig))) + hm_norm_a <- c(hm_norm_a, euclidean_distance( + c(expl$alpha[1], expl$heatmap), + c(expl2$alpha[1], expl2$heatmap) + )) + fx_a <- c(fx_a, c(expl$approx_fn(x_orig))) + fix_a <- c(fix_a, c(expl2$approx_fn(x_orig))) + fxi_a <- c(fxi_a, c(expl$approx_fn(x_new))) + fixi_a <- c(fixi_a, c(expl2$approx_fn(x_new))) + expl2 <- NULL + gc_rpy(TRUE) + } + } + stability_a <- max(hm_norm_a / x_norm_a) + coherence_a1 <- max(sqrt((fx_a - fxi_a)^2 + (fix_a - fixi_a)^2) / x_norm_a) + coherence_a2 <- max(abs(fix_a - fixi_a - fx_a + fxi_a) / x_norm_a^2) + # Stability (does stability_noise alter the explanation) + x_norm_b <- NULL + hm_norm_b <- NULL + fx_b <- NULL + fix_b <- NULL + fxi_b <- NULL + fixi_b <- NULL + minX <- apply(data$X, 2, min) + maxX <- apply(data$X, 2, max) + for (i in rep(index, stability_k)) { + mask <- sample.int(ncol(data$X), ncol(data$X) * stability_noise) + data$X[i, mask] <- runif(length(mask), 
minX[mask], maxX[mask]) + data$Y[i] <- predict_fn(data$X[i, , drop = FALSE]) + expl2 <- expl_fn(data$X, data$Y, i, predict_fn) + x_new <- data$X[i, , drop = FALSE] + x_norm_b <- c(x_norm_b, euclidean_distance(c(1, x_new), c(1, x_orig))) + hm_norm_b <- c(hm_norm_b, euclidean_distance( + c(expl$alpha[1], expl$heatmap), + c(expl2$alpha[1], expl2$heatmap) + )) + fx_b <- c(fx_b, c(expl$approx_fn(x_orig))) + fix_b <- c(fix_b, c(expl2$approx_fn(x_orig))) + fxi_b <- c(fxi_b, c(expl$approx_fn(x_new))) + fixi_b <- c(fixi_b, c(expl2$approx_fn(x_new))) + expl2 <- NULL + gc_rpy(TRUE) + } + stability_b <- max(hm_norm_b / x_norm_b) + coherence_b1 <- max(sqrt((fx_b - fxi_b)^2 + (fix_b - fixi_b)^2) / x_norm_b) + coherence_b2 <- max(abs(fix_b - fixi_b - fx_b + fxi_b) / x_norm_b^2) + + data$X[index, ] <- x_orig + data$Y[index] <- y_orig + + list( + df1 = data.frame( + method = expl$name, + data = data$name, + fidelity_a = fidelity_a, + fidelity_b = fidelity_b, + coverage_a = coverage_a, + coverage_b = coverage_b, + medianerr = medianerr, + stability_a = stability_a, + stability_b = stability_b, + coherence_a1 = coherence_a1, + coherence_b1 = coherence_b1, + coherence_a2 = coherence_a2, + coherence_b2 = coherence_b2, + relevance = median(relevance), + heatmap_std = sd(expl$heatmap), + heatmap_max = max(abs(expl$heatmap)), + heatmap_mean = mean(abs(expl$heatmap)), + intercept = expl$alpha[1], + alpha_tot = sum(abs(expl$alpha)) + ), + df2 = data.frame( + method = expl$name, + data = data$name, + corruption = c(0, corruption / ncol(data$X)), + relevance = relevance + ), + df3 = data.frame( + x_norm_a = x_norm_a, + hm_norm_a = hm_norm_a, + fx_a = fx_a, + fix_a = fix_a, + fxi_a = fxi_a, + fixi_a = fixi_a, + x_norm_b = x_norm_b, + hm_norm_b = hm_norm_b, + fx_b = fx_b, + fix_b = fix_b, + fxi_b = fxi_b, + fixi_b = fixi_b + ) + ) +} + +get_config <- function(index) { + index <- index - 1 + method <- index %% 10 + index <- floor(index / 10) + dataset <- index %% 10 + set.seed(42 + index) + reticulate::py_set_seed(as.integer(42 + index)) + if (dataset < 10) { + data <- data_emnist(dataset) + data$distance_fn <- cosine_distance + model <- keras::load_model_hdf5("experiments/data/emnist_model.hdf5") + pred_fn <- function(x) { + dim(x) <- c(length(x) / 784, 784) + out <- predict(model, x) + out[, dataset + 1, drop = FALSE] + } + index <- which(data$Y > 0.5 & data$R == dataset)[[1]] + expl_fn <- if (method == 0) { + slise_emnist + } else if (method == 1) { + distslise_emnist + } else if (method == 2) { + limeslise_emnist + } else if (method == 3) { + function(...) lime_emnist(..., segmentation = "original") + } else if (method == 4) { + function(...) lime_emnist(..., segmentation = "small") + } else if (method == 5) { + function(...) lime_emnist(..., segmentation = "pixel") + } else if (method == 6) { + function(...) shap_emnist(..., deletion = "gray", mid = 0.5) + } else if (method == 7) { + function(...) shap_emnist(..., deletion = "invert", low = 0, mid = 0.5, high = 1) + } else if (method == 8) { + function(...) 
shap_emnist(..., deletion = "sample") + } else if (method == 9) { + rndexpl_emnist + } else { + stop("Unknown explanation method") + } + destroy_fn <- function(x) 1 - x + } else { + stop("Other datasets are not implemented yet") + } + list( + data = data, + pred_fn = pred_fn, + index = index, + expl_fn = expl_fn, + destroy_fn = destroy_fn + ) +} + +gc_rpy <- function(full = FALSE) { + gc(verbose = FALSE, full = full) + reticulate::import("gc")$collect(as.integer(ifelse(full, 2, 1))) +} + +plot_lineup <- function(dir = "experiments/results") { + cat("Plotting a lineup of explanations\n") + contour <- NULL + heatmap <- NULL + labels <- NULL + offset <- 2 * 10 + for (i in offset + 1:9) { + params <- get_config(i) + expl <- params$expl_fn(params$data$X, params$data$Y, params$index, params$pred_fn) + contour <- rbind(contour, params$data$X[params$index, ]) + heatmap <- rbind(heatmap, expl$heatmap) + labels <- c(labels, stringr::str_replace(expl$name, "Weighted-SLISE", "SLISE (weighted)")) + cat(" ", expl$name, "\n") + } + cairo_pdf(file.path(dir, "comparison_lineup.pdf"), width = 0.8 * 9, height = 0.8 * 9) + plot(plot_mnist( + array(t(apply(heatmap, 1, function(x) x / max(abs(x)))), c(length(labels), 28, 28)), + array(contour, c(length(labels), 28, 28)), + c("not 2", "2"), + enhance_colours = FALSE + ) + facet_wrap( + vars(Var1), + nrow = 3, + labeller = function(x) data.frame(Var1 = labels) + ) + theme_image(legend.position = "None")) + dev.off() +} + +plot_relevance <- function(df2, max_cor = 0.05, dir = "experiments/results") { + df <- df2 %>% + mutate(method = factord(stringr::str_replace(method, "Weighted-SLISE", "SLISE (weighted)"))) %>% + select(-data) %>% + # filter(corruption <= max_cor) %>% + group_by(method, corruption) %>% + summarise_all(mean) + cairo_pdf(file.path(dir, "comparison_relevance.pdf"), width = 0.6 * 9, height = 0.35 * 9) + plot(ggplot(df) + + geom_smooth(aes(corruption, relevance, group = method, linetype = method, size = method, color = method), se = FALSE) + + scale_size_manual(values = c(rep(1.5, 3), rep(1, 3), rep(0.7, 3), 1.4)) + + scale_linetype_manual(values = c(rep(c(1, 4, 2), 3), 9)) + + scale_color_manual(values = c(rep(SLISE_DARKPURPLE, 3), rep(SLISE_ORANGE, 3), rep("#1b9e77", 3), "black")) + + xlab("Fraction of Corruption") + + ylab("Relative Prediction") + + coord_cartesian(xlim = c(0, max_cor)) + + theme_paper(legend.key.width = unit(3, "line"))) + dev.off() +} + +table_results <- function(res) { + format <- function(means, sds, best = which.max, decimals = 3) { + means <- round(means, decimals) + sds <- round(sds, decimals) + mask <- best(means) + mask <- (means <= means[mask] + sds[mask] & means >= means[mask] - sds[mask]) + 1 + fmt <- sprintf("%%.%df", decimals) + fmt <- paste0(c("$", "$\\bf{")[mask], fmt, " \\pm ", fmt, c("$", "}$")[mask]) + sprintf(fmt, means, sds) + } + sanitise_names <- function(x) { + x <- stringr::str_replace(x, "Distance-SLISE", "SLISE (distance)") + x <- stringr::str_replace(x, "Euclidean-SLISE", "SLISE (distance)") + x <- stringr::str_replace(x, "Weighted-SLISE", "SLISE (weighted)") + x <- stringr::str_replace(x, "LIME", "\\\\lime") + x <- stringr::str_replace(x, "SHAP", "\\\\shap") + x <- stringr::str_replace(x, "SLISE", "\\\\slise") + x <- stringr::str_replace(x, "gray", "grey") + x + } + df <- res %>% + mutate( + method = factord(method), + stability2_a = stability_a / heatmap_max, + stability2_b = stability_b / heatmap_max + ) %>% + filter(stringr::str_starts(data, "emnist")) %>% + select(-data, -relevance) %>% + 
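+        # Note: summarise_all(list(mean = mean, sd = sd)) suffixes every column
+        # with "_mean"/"_sd"; the transmute below relies on those generated names.
+        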
group_by(method) %>% + summarise_all(list(mean = mean, sd = sd)) %>% + transmute( + Method = sanitise_names(method), + `Fidelity (Item)` = format(fidelity_a_mean, fidelity_a_sd, which.min), + `Fidelity (Subset)` = format(fidelity_b_mean, fidelity_b_sd, which.min), + `Coverage (Dataset)` = format(coverage_a_mean, coverage_a_sd, which.max), + `Coverage (100 NN)` = format(coverage_b_mean, coverage_b_sd, which.max), + `Median Error` = format(medianerr_mean, medianerr_sd, which.min), + `Stability (20 NN)` = format(stability_a_mean, stability_a_sd, which.min), + `Stability (Noise)` = format(stability_b_mean, stability_b_sd, which.min), + `Coherence (20 NN)` = format(coherence_a2_mean, coherence_a2_sd, which.min), + `Coherence (Noise)` = format(coherence_b2_mean, coherence_b2_sd, which.min), + `Stability2 (20 NN)` = format(stability2_a_mean, stability2_a_sd, which.min), + `Stability2 (Noise)` = format(stability2_b_mean, stability2_b_sd, which.min), + `Coherence2 (20 NN)` = format(coherence_a1_mean, coherence_a1_sd, which.min), + `Coherence2 (Noise)` = format(coherence_b1_mean, coherence_b1_sd, which.min), + `sd(hm)` = format(heatmap_std_mean, heatmap_std_sd, which.max), + `mean(|hm|)` = format(heatmap_mean_mean, heatmap_mean_sd, which.max), + `max(|hm|)` = format(heatmap_max_mean, heatmap_max_sd, which.max), + intercept = format(intercept_mean, intercept_sd, which.min), + `sum(|alpha|)` = format(alpha_tot_mean, alpha_tot_sd, which.max) + ) + table <- function(selection, label, caption = "TODO") { + tab <- print( + xtable::xtable( + as.data.frame(df %>% select(c("Method", selection))), + align = c("l", "l", rep("r", length(selection))), + label = paste0("tab:comp:", label), + caption = caption + ), + sanitize.text.function = identity, + sanitize.colnames.function = function(x) paste0("\\textbf{", x, "}"), + include.rownames = FALSE, + comment = FALSE, + booktabs = TRUE, + print.results = FALSE, + table.placement = "t", + caption.placement = "top" + ) + tab <- stringr::str_replace(tab, "centering", "centering\n\\\\rowcolors{2}{white}{gray!25}") + paste0("\n", tab) + } + cat(table(c("Fidelity (Item)", "Fidelity (Subset)"), "fidelity")) + cat(table(c("Coverage (Dataset)", "Coverage (100 NN)", "Median Error"), "coverage")) + cat(table(c("Stability (20 NN)", "Stability (Noise)", "Coherence (20 NN)", "Coherence (Noise)"), "stability")) + # cat(table(c("Stability2 (20 NN)", "Stability2 (Noise)", "Coherence2 (20 NN)", "Coherence2 (Noise)"), "stability2")) + # cat(table(c("mean(|hm|)", "sd(hm)", "max(|hm|)", "sum(|alpha|)", "intercept"), "stats")) +} + +## -------------------------------------------------- +## If run from Rscript +## -------------------------------------------------- +if (sys.nframe() == 0L) { + args <- commandArgs(trailingOnly = TRUE) + + if (length(args) > 0) { + if (length(args) > 1) { + reticulate::use_python(args[2]) + } + dir.create("experiments/results/comparison", showWarnings = FALSE, recursive = TRUE) + for (index in eval(parse(text = args[1]))) { + file <- sprintf("experiments/results/comparison/comparison_%03d.rds", index) + if (!file.exists(file)) { + time <- Sys.time() + suppressMessages(params <- get_config(index)) + cat("[comparison] Init:", index, "\n") + res <- evaluate_expl(params$expl_fn, params$pred_fn, params$data, params$index, params$destroy_fn) + saveRDS(res, file) + time <- Sys.time() - time + cat("[comparison] Done:", index, "in", as.integer(time), "seconds\n") + } else { + cat("[comparison] Exists:", index, "\n") + } + } + } else { + df1 <- NULL + df2 <- NULL + 
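+        # Gather the per-job result files into combined data frames; tryCatch
+        # skips any file whose contents cannot be rbind:ed with the rest.
+        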
for (f in list.files("experiments/results/comparison", full.names = TRUE)) { + res <- readRDS(f) + # res$df1$coherence_a1 = max(abs(res$df3$fx_a - res$df3$fxi_a - res$df3$fix_a + res$df3$fixi_a) / res$df3$x_norm_a) + # res$df1$coherence_b1 = max(abs(res$df3$fx_b - res$df3$fxi_b - res$df3$fix_b + res$df3$fixi_b) / res$df3$x_norm_b) + tryCatch(df1 <- rbind(df1, res$df1), error = function(e) cat("Could not join df1:", f, "\n")) + tryCatch(df2 <- rbind(df2, res$df2), error = function(e) cat("Could not join df2:", f, "\n")) + res <- NULL + } + plot_lineup() + plot_relevance(df2) + table_results(df1) + } +} \ No newline at end of file diff --git a/experiments/explanations/exp_emnist.R b/experiments/explanations/exp_emnist.R new file mode 100644 index 0000000..7321e6d --- /dev/null +++ b/experiments/explanations/exp_emnist.R @@ -0,0 +1,265 @@ +## -------------------------------------------------- +## Experiments that use SLISE for explaining EMNIST +## +## +## Usage: +## +## Rscript --vanilla experiments/explanations/exp_emnist.R +## +## -------------------------------------------------- + +source("experiments/explanations/utils.R") +source("experiments/explanations/data.R") +library(grid) +library(keras) + +# The main EMNIST explanation +exp_primary <- function(dir = "experiments/results", lambda = 2) { + set.seed(42) + cat("__EMNIST primary__\n") + data <- data_emnist(2) + selected <- 9 # which(data$R == 2)[8] + expl <- slise.explain(data$X, data$Y, epsilon = 0.5, x = selected, lambda1 = lambda, logit = TRUE) + cairo_pdf(file.path(dir, "explanation_emnist_digit.pdf"), 0.9 * 9, 0.35 * 9) + grid.draw(plot(expl, type = "image", plots = 3, labels = c("not 2", "2"))) + dev.off() + cairo_pdf(file.path(dir, "explanation_emnist_dist.pdf"), 0.6 * 9, 0.35 * 9) + plot(plot(expl, "pred", approximation = FALSE, labels = c("Class Probability", "Number of Items"), title = "", partial = TRUE) + theme_paper()) + dev.off() + # Lineup + cat("__EMNIST Lineup__\n") + data2 <- data_emnist(balance = FALSE) + Y2 <- data2$X %*% expl$alpha[-1] + expl$alpha[1] + sub2 <- abs(Y2 - limited_logit(data2$Y)) < expl$epsilon + lineup <- get_lineup(data2$Y, data2$R, sub2 & data2$R != 1, 6, 0.73) + # lineup <- c(1009, 32989, 25883, 20452, 7710, 35924) + cairo_pdf(file.path(dir, "explanation_emnist_lineup.pdf"), 1.0 * 9, 0.2 * 9) + grid.draw(plot_mnist( + array(t(expl$alpha[-1])[rep(1, length(lineup)), ], c(length(lineup), 28, 28)), + array(data2$X[lineup, ], c(length(lineup), 28, 28)), + c("not 2", "2") + ) + facet_wrap( + vars(Var1), + nrow = 1, + labeller = function(x) data.frame(Var1 = sprintf("p = %.3f", data2$Y[lineup])) + ) + theme_image()) + dev.off() + # Scatter + cat("__EMNIST Scatter__\n") + cairo_pdf(file.path(dir, "explanation_emnist_scatter.pdf"), 0.6 * 9, 0.45 * 9) + plot(plot_scatter(expl, data2, lineup, scatter_size = 0.25)) + dev.off() +} + +# With less sparsity +exp_dense <- function(dir = "experiments/results") { + set.seed(42) + cat("__EMNIST with L2__\n") + data <- data_emnist(2) + selected <- 9 # which(data$R == 2)[8] + expl <- slise.explain(data$X, data$Y, epsilon = 0.5, x = selected, lambda1 = 3, lambda2 = 6, logit = TRUE) + cairo_pdf(file.path(dir, "explanation_emnist_digit2.pdf"), 0.9 * 9, 0.35 * 9) + grid.draw(plot(expl, type = "image", plots = 3, labels = c("not 2", "2")) + theme_image()) + dev.off() +} + +# Try different parameter values in a grid +exp_parameters <- function(dir = "experiments/results") { + set.seed(42) + cat("__EMNIST Grid__\n") + pars <- expand.grid( + epsilon = c(0.25, 0.5, 0.75, 
1.0), + lambda = c(0.0, 1.0, 2.0, 5.0, 10.0, 20.0) + ) + data <- data_emnist(2) + selected <- 9 # which(data$R == 2)[8] + expls <- mapply(function(e, l) { + cat(sprintf(" epsilon = %.2f lambda = %.1f\n", e, l)) + slise.explain(data$X, data$Y, e, selected, lambda1 = l, logit = TRUE) + }, pars$epsilon, pars$lambda, SIMPLIFY = FALSE) + models <- do.call(rbind, lapply(expls, function(e) e$alpha[-1])) + models <- sweep(models, 1, apply(abs(models), 1, max), `/`) + images <- do.call(rbind, lapply(expls, function(e) e$x)) + sizes <- sapply(expls, function(e) mean(e$subset)) + plt <- plot_mnist( + array(models, c(nrow(models), 28, 28)), + array(images, c(nrow(images), 28, 28)), + c("not 2", "2") + ) + facet_grid( + vars(pars$epsilon), + vars(pars$lambda), + switch = "y", + labeller = function(l) { + if (names(l) == "pars$lambda") { + list(paste("\u03BB =", l[[1]])) + } else { + list(paste("\u03B5 =", l[[1]])) + } + } + ) + theme_image() + + geom_text(aes(14.5, 29.2, label = sprintf("|S| = %.2f", sizes)), data = pars) + + theme(panel.spacing.y = unit(0.4, "cm")) + + coord_cartesian(clip = "off") + cairo_pdf(file.path(dir, "explanation_emnist_parameters.pdf"), 0.9 * 9, 0.6 * 9) + plot(plt) + dev.off() +} + +exp_investigate <- function(dir = "experiments/results", lambda = 5) { + set.seed(42) + cat("__EMNIST 2-3__\n") + data <- data_emnist(2, balance = FALSE) + data$mask <- sample(which(data$R == 2 | data$R == 3)) + data$R <- data$R[data$mask] + data$Y <- data$Y[data$mask] + data$X <- data$X[data$mask, ] + selected <- which(data$mask == 27673) # Same as the 9:th from above + expl <- slise.explain(data$X, data$Y, epsilon = 0.5, x = selected, lambda1 = lambda, logit = TRUE) + cairo_pdf(file.path(dir, "explanation_emnist_other.pdf"), 0.9 * 9, 0.35 * 9) + plot(expl, type = "image", plots = 3, labels = c("3", "2")) + theme_image() + dev.off() +} + +# Create the subset lineup +get_lineup <- function(Y, R, subset, num_examples = 6, blend = 0.5) { + inter <- 1 / num_examples + ys <- seq_len(num_examples) * inter - inter / 2 + ys <- (ys * (max(Y) - min(Y)) + min(Y)) * blend + + quantile(Y[subset], ys) * (1 - blend) + selected <- c() + for (y in rev(ys)) { + sel <- which.min(abs(Y - y) - subset * 1000) + subset <- subset & R != R[sel] + selected <- c(sel, selected) + } + selected +} + +# Randomly select samples for the scatter, but avoid overlapping +# x,y: vector +# r: radius +# num: amount +select_nonoverlap <- function(x, y, r, num = 50) { + r <- r * r + sel <- c() + for (i in seq_along(x)) { + add <- TRUE + for (j in sel) { + if ((x[i] - x[j])^2 + (y[i] - y[j])^2 < r) { + add <- FALSE + break() + } + } + if (add) { + sel <- c(sel, i) + if (length(sel) >= num) { + break() + } + } + } + sel +} + +# Plot a scatterplot of images +plot_scatter <- function(slise, data = NULL, lineup = NULL, width = 28, height = 28, scatter_size = 0.2, num_scatter = 100, logits = FALSE) { + x <- limited_logit(predict(slise, slise$X)) + y <- slise$Y + images <- slise$X + selected <- select_nonoverlap(x, y, scatter_size, num_scatter) + + plt <- ggplot() + + geom_point(aes(x[selected], y[selected])) + + lapply(selected, function(i) { + im <- matrix(images[i, ], width, height) + g <- rasterGrob(1 - im, name = i) + annotation_custom(g, + xmin = x[[i]] - scatter_size, xmax = x[[i]] + scatter_size, + ymin = y[[i]] - scatter_size, ymax = y[[i]] + scatter_size + ) + }) + + geom_abline(aes(intercept = 0, slope = 1, col = "Subset"), size = 1) + + geom_abline(aes(intercept = slise$epsilon, slope = 1, col = "Subset"), linetype = "dashed", 
size = 1) + + geom_abline(aes(intercept = -slise$epsilon, slope = 1, col = "Subset"), linetype = "dashed", size = 1) + + theme_paper() + + theme(aspect.ratio = 1) + + scale_color_manual( + guide = guide_legend(title = NULL), + breaks = c("Subset", "Explained", "In Lineup"), + values = c("Subset" = SLISE_PURPLE, "Explained" = SLISE_ORANGE, "In Lineup" = "#1b9e77") + ) + + if (!is.null(lineup)) { + x2 <- limited_logit(predict(slise, data$X)) + y2 <- limited_logit(data$Y) + images2 <- data$X + sels <- lineup[lineup != -1] + plt <- plt + + lapply(sels, function(i) { + im <- matrix(images2[i, ], width, height) + g <- rasterGrob(1 - im, name = -i) + annotation_custom(g, + xmin = x2[[i]] - scatter_size, xmax = x2[[i]] + scatter_size, + ymin = y2[[i]] - scatter_size, ymax = y2[[i]] + scatter_size + ) + }) + + geom_tile(aes(x2[sels], y2[sels], width = scatter_size * 2, height = scatter_size * 2, col = "In Lineup"), fill = NA, size = 1) + } + + plt <- plt + lapply(1, function(i) { + im <- matrix(slise$x, width, height) + g <- rasterGrob(1 - im, name = 0) + annotation_custom(g, + xmin = slise$y - scatter_size, xmax = slise$y + scatter_size, + ymin = slise$y - scatter_size, ymax = slise$y + scatter_size + ) + }) + + geom_tile(aes(slise$y, slise$y, width = scatter_size * 2, height = scatter_size * 2, col = "Explained"), fill = NA, size = 1) + + lims <- quantile(c(x, y), c(0.01, 0.99)) + plt + xlab("SLISE Approximation") + ylab("Classifier Prediction") + + coord_cartesian(xlim = lims, ylim = lims) + + scale_x_continuous(labels = function(x) round(sigmoid(x), 2)) + + scale_y_continuous(labels = function(x) round(sigmoid(x), 2)) +} + +exp_new_image <- function(dir = "experiments/results") { + set.seed(42) + digit <- 2 + # new_image <- matrix(1 - png::readPNG("experiments/data/new2.png")[, , 1], 1, 784) + new_image <- matrix(c( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ), 1, 784) + model <- keras::load_model_hdf5("experiments/data/emnist_model.hdf5") + pred <- predict(model, new_image) + data <- data_emnist(digit) + expl <- slise.explain( + data$X, + data$Y, + epsilon = 0.5, + x = new_image, + y = pred[digit + 1], + lambda1 = 2, + lambda2 = 0.1, + logit = TRUE + ) + names(pred) <- paste(0:9) + cat("Predictions:\n") + print(pred) + cairo_pdf(file.path(dir, "explanation_emnist_new.pdf"), 0.9 * 9, 0.35 * 9) + grid.draw(plot( + expl, + "mnist", + plots = 3, + title = sprintf("Predicted %.1f%% likely to be a %s", pred[digit + 1] * 100, digit), + labels = c("not 2", "2") + )) + dev.off() +} + + +if (sys.nframe() == 0L) { # Only run with Rscript + exp_primary() + exp_parameters() + exp_investigate() + exp_new_image() +} \ No newline at end of file diff --git a/experiments/explanations/exp_gradopt.R b/experiments/explanations/exp_gradopt.R new file mode 100644 index 0000000..b44db0b --- /dev/null +++ b/experiments/explanations/exp_gradopt.R @@ -0,0 +1,55 @@ +## -------------------------------------------------- +## Example for use to describe Graduated Optimisation +## +## +## Usage: +## +## Rscript --vanilla experiments/explanations/exp_intro.R +## +## -------------------------------------------------- + +library(ggplot2) +source("experiments/explanations/utils.R") + +plot_gradopt <- function(filename) { + set.seed(42) + size <- 40 + + x <- matrix(runif(size, -0.5, 0.5), size, 1) + y <- -x + rnorm(size, 0, 0.15) + + a <- seq(-1.6, -0.2, 0.002) + l1 <- sapply(a, loss_smooth, x, y, 0.1, 10) + l2 <- sapply(a, loss_smooth, x, y, 0.1, 400) + l3 <- sapply(a, loss_smooth, x, y, 0.1, 1500) + l4 <- sapply(a, loss_sharp, x, y, 0.1) + + df <- data.frame( + x = c(rep(a, 4)), + y = c(l1, l2, l3, l4), + b = c(rep(1:4, each = length(a))) + ) + lblr <- as_labeller(function(b) { + expression( + beta == 0, + beta > 0, + paste(beta, "\u226B", 0), + beta %->% infinity + ) + }, default = label_parsed) + gg <- ggplot(df) + + xlab(expression(alpha)) + + ylab("Loss") + + geom_line(aes(x = x, y = y)) + + facet_wrap(~b, ncol = 4, scale = "free_y", labeller = lblr) + + theme_paper() + + theme(axis.ticks = element_blank(), axis.text = element_blank()) + + cairo_pdf(filename, width = 1.0 * 9, height = 0.22 * 9) + plot(gg) + dev.off() +} + +if (sys.nframe() == 0L) { # Only run with Rscript + plot_gradopt("experiments/results/gradopt_example.pdf") +} \ No newline at end of file diff --git a/experiments/explanations/exp_imdb.R b/experiments/explanations/exp_imdb.R new file mode 100644 index 0000000..fbe5793 --- /dev/null +++ b/experiments/explanations/exp_imdb.R @@ -0,0 +1,105 @@ +## -------------------------------------------------- +## Experiments that use SLISE for explaining IMDB +## +## +## Usage: +## +## Rscript --vanilla experiments/explanations/exp_imdb.R +## +## -------------------------------------------------- + 
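+# Note: this script assumes that the AclImdb bag-of-words data, predictions,
+# and raw review strings have already been generated by the AclImdb retrieval
+# script under experiments/data (the aclimdb_*.rds files read below).
+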
+source("experiments/explanations/utils.R") +source("experiments/explanations/data.R") +library(stringr) +library(tm) + +exp_imdb <- function(dir = "experiments/results") { + set.seed(42) + data <- data_imdb() + + # Find a review with "not bad" (wasn't, isn't, aren't, wouldn't, and couldn't) + not_words <- c("wasnt", "isnt", "arent", "couldnt", "wouldnt") + items <- which(data$R == 1 & data$Y < 0.5 & rowSums(data$X[, not_words]) > 0 & data$X[, "bad"] > 0) + for (selected in items) { + review <- get_imdb_review(selected) + if (length(review$split) > 250 || !str_detect(review$text, "((was)|(is)|(are)|(could)|(would))n't[^.!?:,;\\n]*bad")) { + next() + } + mask <- data$X[selected, ] != 0 + expl <- slise.explain(data$X[, mask], data$Y, 1.0, selected, lambda1 = 10, logit = TRUE) + if (expl$alpha["bad"] >= 0 || min(expl$alpha[not_words], na.rm = TRUE) / expl$alpha["bad"] < 1 / 8) { + next() + } + print(selected) + print_latex(review$split, review$tokens, expl) + } + + # Find a review to compare to lime + # set.seed(42) + # imdb3 <- data_aclimdb(10000, model="lr") + # imdb4 <- data_aclimdb(model="lr") + # selected <- 10356 + # review <- aclimdb_get_review(imdb4, selected) + # lime_expl <- lime_explain_aclimdb(imdb4, selected, 10) + # mask <- imdb3$X[, colnames(imdb3$X) == "street"] == 0 + # slise_expl1 <- slise.explain(imdb3$X, imdb3$Y, 0.1, imdb4$X[selected, ], imdb4$Y[selected], lambda1=0.75, logit=TRUE) + # slise_expl2 <- slise.explain(imdb3$X[mask, ], 0.1, imdb3$Y[mask], imdb4$X[selected, ], imdb4$Y[selected], lambda1=0.75, logit=TRUE) + # print_latex(review$text, review$tokens, slise_expl1) + # print_latex(review$text, review$tokens, slise_expl2) +} + +get_imdb_review <- function(index, set = "test", datadir = "experiments/data") { + strs <- readRDS(file.path(datadir, paste0("aclimdb_str_", set, ".rds"))) + text <- if (index <= length(strs$pos)) strs$pos[[index]] else strs$neg[[index - length(strs$pos)]] + text <- str_replace_all(text, "((
)|(\\n)|\n)", " \n ") + text <- str_replace_all(text, " ", " ") + split <- str_split(text, " ")[[1]] + corpus <- aclimdb_clean(VectorSource(split)) + tokens <- sapply(corpus$content, function(d) d$content) + list(text = text, tokens = tokens, split = split) +} + +aclimdb_clean <- function(source) { + data <- VCorpus(source, readerControl = list(language = "en")) + data <- tm_map(data, content_transformer(function(x) gsub("((
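+  # remove any brackets left over from HTML tags before further cleaning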
+  data <- tm_map(data, content_transformer(function(x) gsub("(<|>|\\(|\\)|\\[|\\]|\\{|\\})", " ", x)))
+  data <- tm_map(data, stripWhitespace)
+  data <- tm_map(data, content_transformer(tolower))
+  data <- tm_map(data, content_transformer(removePunctuation))
+  data <- tm_map(data, content_transformer(removeNumbers))
+  data <- tm_map(data, removeWords, stopwords("english"))
+  data <- tm_map(data, stemDocument)
+  data
+}
+
+print_latex <- function(text_split, tokens, expl) {
+  alpha <- expl$alpha[-1]
+  names(alpha) <- names(expl$alpha)[-1]
+  lim <- max(abs(alpha)) / 2
+  text_split <- str_replace_all(text_split, "&", "\\\\&")
+  text_split <- str_replace_all(text_split, "\n", "\\\\\\\\\n")
+  for (i in seq_along(text_split)) {
+    if (tokens[[i]] == "") next()
+    v <- alpha[tokens[[i]]]
+    if (is.na(v)) next()
+    if (v > lim) {
+      text_split[i] <- paste0("\\colorbox[HTML]{7FBC41}{", text_split[i], "}")
+    } else if (v > lim / 3) {
+      text_split[i] <- paste0("\\colorbox[HTML]{E6F5D0}{", text_split[i], "}")
+    } else if (v < -lim) {
+      text_split[i] <- paste0("\\colorbox[HTML]{DE77AE}{", text_split[i], "}")
+    } else if (v < -lim / 3) {
+      text_split[i] <- paste0("\\colorbox[HTML]{FDE0EF}{", text_split[i], "}")
+    }
+  }
+  nt <- do.call(paste, c(
+    list("\\noindent\\fbox{\\parbox{0.98\\textwidth}{\\small\n"),
+    text_split,
+    list("\n}}\n")
+  ))
+  cat(nt)
+}
+
+if (sys.nframe() == 0L) { # Only run with Rscript
+  exp_imdb()
+}
\ No newline at end of file
diff --git a/experiments/explanations/exp_internal.R b/experiments/explanations/exp_internal.R
new file mode 100644
index 0000000..2a2bdb6
--- /dev/null
+++ b/experiments/explanations/exp_internal.R
@@ -0,0 +1,100 @@
+## --------------------------------------------------
+## Experiments that use SLISE combined with Activation Maximisation
+##
+##
+## Usage:
+##
+## Rscript --vanilla experiments/explanations/exp_internal.R
+##
+## --------------------------------------------------
+
+source("experiments/explanations/utils.R")
+source("experiments/explanations/data.R")
+library(grid)
+library(gridExtra)
+
+
+emnist_get_internal <- function(datadir = "experiments/data") {
+  data <- readRDS(file.path(datadir, "emnist.rds"))
+  pred <- readRDS(file.path(datadir, "emnist_preds.rds"))
+  inter <- readRDS(file.path(datadir, "emnist_internal.rds"))
+  inter$image <- data$image[inter$selected, ]
+  inter$label <- data$label[inter$selected]
+  inter$pred <- sapply(seq_along(inter$label), function(i) pred[inter$selected[i], inter$label[i] + 1])
+  inter
+}
+
+exp_internal <- function(dir = "experiments/results") {
+  set.seed(42)
+  int <- emnist_get_internal()
+  emnist <- data_emnist(3)
+  # plot_mnist(array(int$image[which(int$label == 3),], c(40, 28, 28)), colours = c("white", "black")) + facet_wrap(vars(Var1), ncol=7)
+  selected <- which(int$label == 3)[19]
+  # plot_mnist(array(am, c(32, 28, 28)), int$image[selected, ], colours = c("white", "black"), enhance_colours = FALSE) + facet_wrap(vars(Var1), ncol=7)
+  expl <- slise.explain(
+    int$internal[emnist$mask, ],
+    emnist$Y,
+    epsilon = 0.5,
+    x = int$internal[int$selected[selected], ],
+    y = int$pred[selected],
+    logit = TRUE,
+    lambda1 = 100
+  )
+  # plot(expl, "pred")
+  # print(expl)
+  nodes <- lapply(int$nodes, function(n) n[selected, ])
+  cairo_pdf(file.path(dir, "internal_emnist_colour.pdf"), 1.0 * 9, 0.35 * 9)
+  grid.draw(plot_internal(expl, nodes, int$image[selected, ], 6, bw = FALSE))
+  dev.off()
+  cairo_pdf(file.path(dir, "internal_emnist_gray.pdf"), 1.0 * 9, 0.35 * 9)
+  grid.draw(plot_internal(expl, nodes, int$image[selected, ], 6, bw = TRUE))
+  dev.off()
+}
+
+plot_internal <- function(slise, nodes, image = NULL, num_nodes = 6, impact = TRUE, bw = FALSE) {
+  selected <- order(-abs(if (impact) slise$impact[-1] else slise$alpha[-1]))
+  if (num_nodes < length(nodes)) {
+    selected <- selected[1:num_nodes]
+  } else {
+    num_nodes <- length(nodes)
+  }
+  nodes <- array(t(simplify2array(nodes[selected])), c(num_nodes, 28, 28))
+  labeller <- function(x) list(paste("Neuron", selected))
+  node_plot <- if (bw) {
+    plot_mnist(nodes - 0.5, image, enhance_colours = FALSE, colours = c("white", "black")) +
+      facet_wrap(vars(Var1), labeller = labeller, nrow = 1) +
+      theme_image(legend.position = "none", axis.title.y = element_text()) +
+      labs(y = "\nActivation\nMaximisations")
+  } else {
+    plot_mnist(nodes - 0.5, image, c("White", "Black"), enhance_colours = FALSE) +
+      facet_wrap(vars(Var1), labeller = labeller, nrow = 1) +
+      theme_image(
+        legend.position = "left",
+        legend.box.margin = margin(c(0, -10, 0, 0))
+      ) + # , axis.title.y = element_text())
+      labs(y = "Activation\nMaximisations") +
+      guides(fill = guide_legend(title.hjust = 0.5, label.position = "bottom"))
+  }
+  df <- data.frame(
+    x = slise$x[selected],
+    alpha = slise$alpha[-1][selected],
+    impact = slise$impact[-1][selected],
+    name = factord(paste("Neuron", selected))
+  )
+  labels <- factord(c("Activation", "Coefficient", "Impact"))
+  cols_plot <- ggplot(df) +
+    geom_col(aes(labels[1], x)) +
+    geom_col(aes(labels[2], alpha)) +
+    geom_col(aes(labels[3], impact)) +
+    coord_flip() +
+    scale_x_discrete(limits = rev) +
+    facet_wrap(vars(name), nrow = 1) +
+    labs(x = NULL, y = NULL, title = NULL) +
+    theme_paper() +
+    theme(strip.text = element_blank())
+  grid.arrange(node_plot, cols_plot, nrow = 2, heights = c(0.6, 0.4))
+}
+
+
+if (sys.nframe() == 0L) { # Only run with Rscript
+  exp_internal()
+}
\ No newline at end of file
diff --git a/experiments/explanations/exp_intro.R b/experiments/explanations/exp_intro.R
new file mode 100644
index 0000000..02896c3
--- /dev/null
+++ b/experiments/explanations/exp_intro.R
@@ -0,0 +1,194 @@
+## --------------------------------------------------
+## Plots for the introduction
+##
+##
+## Usage:
+##
+## Rscript --vanilla experiments/explanations/exp_intro.R
+##
+## --------------------------------------------------
+
+source("experiments/explanations/utils.R")
+
+## --------------------------------------------------
+## If run from Rscript
+## --------------------------------------------------
+if (sys.nframe() == 0L) {
+
+  # Local Linear Approximation
+  set.seed(42)
+  x <- c(-2, runif(80, -3.3, 3.3))
+  y <- cos(x)
+  expl <- slise.explain(x, y, 0.25, 1)
+  cairo_pdf("experiments/results/intro_approximation.pdf", width = 0.55 * 9, height = 0.3 * 9)
+  plot(plot(expl, "2d", labels = c("x", "cos(x)"), title = "", partial = TRUE, size = 1.5) + coord_fixed() + theme_paper())
+  dev.off()
+
+  # Constrained Data Manifold
+  set.seed(42)
+  x1 <- runif(50, -3, 2)
+  x2 <- x1 * 0.55 + rnorm(50, 1, 0.15)
+  x3 <- runif(50, -2, 3)
+  x4 <- x3 * 0.55 - rnorm(50, 1, 0.15)
+  X <- cbind(c(x1, x3), c(x2, x4))
+  Y <- c(x1 < -0.75, x3 > 0.75)
+  gg <- ggplot() +
+    geom_point(aes(X[, 1], X[, 2], shape = Y, color = Y)) +
+    geom_text(aes(c(-1.5, 1.5), c(1.5, -1.5), label = "?")) +
+    scale_shape_manual(
+      limits = c(FALSE, TRUE),
+      values = c(0, 4),
+      labels = c("Class A", "Class B"),
+      name = NULL
+    ) +
+    scale_color_manual(
+      limits = c(FALSE, TRUE),
+      values = c(SLISE_ORANGE, SLISE_PURPLE),
+      labels = c("Class A", "Class B"),
+      name =
NULL + ) + + labs(x = "x1", y = "x2") + + theme_paper() + cairo_pdf("experiments/results/intro_manifold.pdf", width = 0.45 * 9, height = 0.3 * 9) + plot(gg) + dev.off() + + # Constrained Data Manifold 2 + set.seed(42) + num <- 30 + x1 <- seq(length.out = num, -2.5, 2.4) + x2 <- x1 * 0.15 + 1 + rnorm(num, 0, 0.1) + x3 <- seq(length.out = num, -2.4, 2.5) + x4 <- x3 * 0.15 - 1 + rnorm(num, 0, 0.1) + df <- data.frame(x = c(x1, x3), y = c(x2, x4), z = c(x1 < -0.4, x3 > 0.4)) + gd <- 50 + grid <- expand.grid(x = seq(min(df$x), max(df$x), length.out = gd), y = seq(min(df$y), max(df$y), length.out = gd)) + grid2 <- grid + grid$z <- apply(grid, 1, function(x) { + dist <- (df$x - x[1])^2 + (df$y - x[2])^2 / (df$z * 5 + 1) + as.numeric(df$z[which.min(dist)]) + }) + grid$f <- "Decision Boundary 1" + grid2$z <- apply(grid2, 1, function(x) { + dist <- (df$x - x[1])^2 + (df$y - x[2])^2 / (4 - df$z * 3) + as.numeric(df$z[which.min(dist)]) + }) + grid2$f <- "Decision Boundary 2" + grid <- rbind(grid, grid2) + gg <- ggplot(grid, aes(x, y, shape = z, color = z, z = z)) + + geom_point(data = df) + + geom_contour(bins = 1, color = "black", linetype=2) + + facet_grid(cols = vars(f)) + + scale_shape_manual( + limits = c(FALSE, TRUE), + values = c(4, 0), + labels = c("Class A", "Class B"), + name = NULL + ) + + scale_color_manual( + limits = c(FALSE, TRUE), + values = c(SLISE_ORANGE, SLISE_PURPLE), + labels = c("Class A", "Class B"), + name = NULL + ) + + labs(x = "x1", y = "x2") + + coord_fixed() + + theme_paper() + cairo_pdf("experiments/results/intro_manifold2.pdf", width = 0.85 * 9, height = 0.3 * 9) + plot(gg) + dev.off() + + # Constrained Data Manifold 3 + set.seed(42) + num <- 30 + x1 <- seq(length.out = num, -2.5, 2.4) + x2 <- x1 * 0.15 + 1 + rnorm(num, 0, 0.1) + x3 <- seq(length.out = num, -2.4, 2.5) + x4 <- x3 * 0.15 - 1 + rnorm(num, 0, 0.1) + df <- data.frame(x = c(x1, x3), y = c(x2, x4), z = c(x1 < -0.4, x3 > 0.4)) + dl <- data.frame( + x=c(-0.37, -0.43, -2.5, 0.37, 0.43, 2.5, -0.43, -0.37, 2.5, 0.43, 0.37, -2.5), + y=c(1.5, 0.75, -1.0, -1.5, -0.75, 1.0, 1.5, 0.9, -0.2, -1.5, -0.9, 0.2), + z=rep(rep(factor(1:2), each=3), 2), + f=rep(sprintf("Decision Boundary %d", 1:2), each=6) + ) + gg <- ggplot(dl, aes(x, y, shape = z, color = z, group=z)) + + geom_point(data = df) + + geom_path(color="black", linetype=2) + + facet_grid(cols = vars(f)) + + scale_shape_manual( + limits = c(FALSE, TRUE), + values = c(4, 0), + labels = c("Class A", "Class B"), + name = NULL + ) + + scale_color_manual( + limits = c(FALSE, TRUE), + values = c(SLISE_ORANGE, SLISE_PURPLE), + labels = c("Class A", "Class B"), + name = NULL + ) + + labs(x = "x1", y = "x2") + + coord_fixed() + + theme_paper() + cairo_pdf("experiments/results/intro_manifold3.pdf", width = 0.85 * 9, height = 0.3 * 9) + plot(gg) + dev.off() + + # Constrained Data Manifold 4 + set.seed(42) + num <- 30 + x1 <- seq(length.out = num, -2.5, 2.4) + x2 <- x1 * 0.15 + 1.3 + rnorm(num, 0, 0.1) + x3 <- seq(length.out = num, -2.4, 2.5) + x4 <- x3 * 0.15 - 1.3 + rnorm(num, 0, 0.1) + df <- data.frame(x = c(x1, x3), y = c(x2, x4), z = c("A", "B")[1+c(x1 < -0.4, x3 > 0.4)]) + dl <- data.frame( + x=c(-0.39, -0.41, -2.5, 0.39, 0.41, 2.5, -0.63, -0.37, 2.5, 0.63, 0.37, -2.5), + y=c(1.7, 0.95, -1.0, -1.7, -0.95, 1.0, 1.7, 1.0, 0.2, -1.7, -1.0, -0.2), + z=rep(rep(factor(1:2), each=3), 2), + f=rep(sprintf("Decision Boundary %d", 2:1), each=6) + ) + da1 <- data.frame( + x = c(df$x[18]-0.06, df$x[18] - 0.6, df$x[18]-0.02, df$x[18] - 0.2), + y=c(df$y[18]+0.01, df$y[18] + 0.05, 
df$y[18]-0.055, df$y[18] - 0.55), + z="D", + f=rep(sprintf("Decision Boundary %d", 2:1), each=2) + ) + da2 <- data.frame( + x = c(df$x[53]-0.06, df$x[53] - 0.6, df$x[53]-0.035, df$x[53] - 0.35), + y = c(df$y[53]-0.01, df$y[53] - 0.1, df$y[53]+0.045, df$y[53] + 0.45), + z = "D", + f = rep(sprintf("Decision Boundary %d", 1:2), each=2) + ) + gg <- ggplot(dl, aes(x, y, group=z)) + + geom_point(aes(shape = z, color = z), data = df) + + geom_path(color="black", linetype=2) + + geom_path(aes(linetype=z), color="#1b9e77", data=da1, arrow=arrow(length=unit(0.08, "inch"))) + + geom_path(aes(linetype=z), color="#1b9e77", data=da2, arrow=arrow(length=unit(0.08, "inch"))) + + facet_grid(cols = vars(f)) + + scale_shape_manual( + limits = c("A", "B"), + values = c(4, 0), + labels = c("Class A", "Class B"), + name = NULL + ) + + scale_color_manual( + limits = c("A", "B"), + values = c(SLISE_ORANGE, SLISE_PURPLE), + labels = c("Class A", "Class B"), + name = NULL + ) + + scale_linetype_manual( + limits = c("D"), + values = c(1), + labels = c("Gradient"), + name = NULL + ) + + labs(x = "x1", y = "x2") + + coord_fixed() + + theme_paper() + cairo_pdf("experiments/results/intro_manifold4.pdf", width = 0.9 * 9, height = 0.35 * 9) + plot(gg) + dev.off() +} \ No newline at end of file diff --git a/experiments/explanations/exp_jets.R b/experiments/explanations/exp_jets.R new file mode 100644 index 0000000..63617fd --- /dev/null +++ b/experiments/explanations/exp_jets.R @@ -0,0 +1,61 @@ +## -------------------------------------------------- +## Experiments that use SLISE for explaining jets +## +## +## Usage: +## +## Rscript --vanilla experiments/explanations/exp_jets.R +## +## -------------------------------------------------- + +source("experiments/explanations/utils.R") +source("experiments/explanations/data.R") +library(xtable) +library(stringr) +library(grid) + + +exp_jet_tab <- function(dir = "experiments/results") { + set.seed(42) + jets <- data_jets(n = 100000) + x <- c(1196, 0.935, 0.002, 16) # old example + selected <- which.min(rowSums(abs(scale(sweep(jets$X, 2, x), center = FALSE)))) + expl <- slise.explain(jets$X, jets$Y, 0.3, selected, logit = TRUE, normalise = TRUE, lambda1 = 10) + print(expl) + + df <- data.frame( + "Unnormalised Jet" = expl$x, + "Unnormalised Model" = expl$coefficients[-1], + "Unnormalised Impact" = expl$impact[-1], + "_" = NA, + "Normalised Jet" = expl$normalised_x, + "Normalised Model" = expl$normalised[-1], + "Normalised Impact" = expl$normalised_impact[-1] + ) + rownames(df) <- names(expl$coefficients[-1]) + caption <- paste("Using \\slise to explain why this jet from the \\physics dataset is a", c("Gluon.", "Quark.")[jets$R[selected] + 1]) + table <- xtable(df, caption = caption, label = "tab:exp:jets1", digits = rep(3, 8), align = "lrrrcrrr") + header1 <- "% \\textbf{Variables} & \\multicolumn{3}{c}{\\textbf{Unnormalised}} & & \\multicolumn{3}{c}{\\textbf{Normalised}} \\\\\n" + header2 <- "& Jet & Model & Impact & & Jet & Model & Impact \\\\\n" + print(table, + booktabs = TRUE, comment = FALSE, include.colnames = FALSE, caption.placement = "top", + add.to.row = list(pos = list(0, 0), command = c(header1, header2)) + ) +} + +exp_jet_img <- function(dir = "experiments/results") { + set.seed(42) + imgs <- data_jets(TRUE, 100000) + selected <- 32 + expl <- slise.explain(imgs$X, imgs$Y, 0.5, selected, lambda1 = 200, logit = TRUE) + print(expl) + plot(expl, "pred") + cairo_pdf(file.path(dir, "explanation_jet_img.pdf"), 0.7 * 9, 0.35 * 9) + grid.draw(plot(expl, type = "image", plots 
= 2, labels = c("Gluon", "Quark"), title = "Physics Jet Image"))
+  dev.off()
+}
+
+if (sys.nframe() == 0L) { # Only run with Rscript
+  exp_jet_tab()
+  exp_jet_img()
+}
\ No newline at end of file
diff --git a/experiments/explanations/exp_tabular.R b/experiments/explanations/exp_tabular.R
new file mode 100644
index 0000000..e4e407c
--- /dev/null
+++ b/experiments/explanations/exp_tabular.R
@@ -0,0 +1,37 @@
+## --------------------------------------------------
+## Experiments that use SLISE for tabular explanations
+##
+##
+## Usage:
+##
+## Rscript --vanilla experiments/explanations/exp_tabular.R
+##
+## --------------------------------------------------
+
+source("experiments/explanations/utils.R")
+source("experiments/explanations/data.R")
+
+exp_mtcars <- function() {
+  # This dataset is so small that the explanations are relatively unstable
+  cat("mtcars\n")
+  set.seed(42)
+  data <- data_mtcars("lm")
+  cat("Linear Model MSE: ", mean((data$Y - data$R)^2), "\n")
+  data <- data_mtcars("rf")
+  cat("Random Forest MSE:", mean((data$Y - data$R)^2), "\n")
+  expl <- slise.explain(data$X, data$Y, 0.15, 1, lambda1 = 0.1, normalise = TRUE)
+  print(expl, 20)
+  cat("Random Forest Importance\n")
+  imp <- randomForest::importance(data$model)
+  print(imp[order(-abs(expl$normalised[-1])), ])
+  # plot(expl, "dist")
+}
+
+exp_tabbincl <- function() {
+  # TODO: Find some nice tabular dataset with a binary classification
+}
+
+if (sys.nframe() == 0L) { # Only run with Rscript
+  exp_mtcars()
+}
diff --git a/experiments/explanations/methods.R b/experiments/explanations/methods.R
new file mode 100644
index 0000000..888e0c0
--- /dev/null
+++ b/experiments/explanations/methods.R
@@ -0,0 +1,265 @@
+# This script contains wrapper functions for explanations with LIME, SHAP, and SLISE
+
+require(keras)
+library(reticulate)
+library(tensorflow)
+
+source("experiments/explanations/utils.R")
+
+# This version uses superpixels from LIME but with SLISE instead of LASSO
+limeslise_emnist <- function(X,
+                             Y,
+                             index,
+                             predict_fn,
+                             epsilon = 0.5,
+                             lambda = 5,
+                             samples = 10000,
+                             compactness = 0.5,
+                             removed = 0.25,
+                             similarity = 0.2,
+                             ...) {
+  x <- c(X[index, ])
+  y <- Y[index]
+  lime <- reticulate::import("lime")
+  segmenter <- lime$wrappers$scikit_image$SegmentationAlgorithm("slic", compactness = compactness, convert2lab = FALSE, start_label = as.integer(1))
+  segments <- c(segmenter(matrix(x, 28, 28)))
+  segmenter <- NULL
+  max_seg <- max(segments)
+  dimred <- function(hx) {
+    diff <- abs(hx - x)
+    vapply(1:max_seg, function(i) mean(diff[segments == i]) < similarity, logical(1))
+  }
+  dimexp <- function(lx) {
+    ifelse(segments %in% which(as.logical(lx)), x, x * 0 + 0.5)
+  }
+  X <- matrix(as.numeric(runif(samples * max_seg) > removed), samples, max_seg)
+  X <- rbind(rep(1, ncol(X)), X)
+  HX <- t(apply(X, 1, dimexp))
+  Y <- predict_fn(HX)
+  weight <- exp(-apply(X, 1, cosine_distance, X[1, ]))
+  weight <- weight * length(Y) / sum(weight) # uniformly scale so that the same lambdas can be used
+  expl <- slise.explain(X, Y, epsilon, 1, lambda1 = lambda, lambda2 = lambda, logit = TRUE, weight = weight)
+  nh <- list(X = HX[expl$subset, ], Y = Y[expl$subset])
+  approx_fn <- function(X) {
+    if (length(X) == length(x)) {
+      predict(expl, dimred(X))
+    } else {
+      predict(expl, t(apply(X, 1, dimred)))
+    }
+  }
+  coef <- expl$alpha[-1][segments]
+  list(heatmap = coef, impact = coef, alpha = expl$alpha, approx_fn = approx_fn, neighbourhood = nh, name = "LIME-SLISE")
+}
+
+# Pixelwise KernelSHAP with various options for "off" pixels
+shap_emnist <- function(X,
+                        Y,
+                        index,
+                        predict_fn,
+                        deletion = "invert",
+                        samples = 10000,
+                        similarity = 0.2,
+                        ...) {
+  shap <- reticulate::import("shap")
+  shap2 <- reticulate::import("shap", convert = FALSE)
+  nhX <- NULL
+  nhY <- NULL
+  pred <- function(x) {
+    out <- predict_fn(x)
+    nhX <<- rbind(nhX, x)
+    nhY <<- c(nhY, out)
+    out
+  }
+  py_pred <- reticulate::py_func(pred)
+  item <- unname(X[index, , drop = FALSE])
+  if (deletion == "invert") {
+    explainer <- shap$KernelExplainer(py_pred, 1 - item, "logit")
+  } else if (deletion == "background") {
+    explainer <- shap$KernelExplainer(py_pred, 0 * item, "logit")
+  } else if (deletion == "gray") {
+    explainer <- shap$KernelExplainer(py_pred, 0 * item + 0.5, "logit")
+  } else if (deletion == "sample") {
+    # Manually do the logit-link function
+    py_pred <- reticulate::py_func(function(x) {
+      out <- pred(x)
+      out2 <- limited_logit(out)
+      dim(out2) <- dim(out)
+      out2
+    })
+    # Subsample the data to increase speed
+    Xsamp <- unname(X[seq(1, nrow(X), 10), ])
+    explainer <- shap$explainers$Sampling(py_pred, Xsamp)
+  } else {
+    stop("Unknown deletion method (must be one of invert, background, gray, or sample)")
+  }
+  shap_values <- explainer$shap_values(item, nsamples = as.integer(samples), l1_reg = "aic")
+  if (length(shap_values) == 1) {
+    shap_values <- shap_values[[1]]
+  }
+  hm <- shap_values[1, ]
+  intercept <- c(explainer$expected_value)
+  pred <- function(X) sigmoid(apply(X, 1, function(x) intercept + sum(hm[abs(x - item) < similarity])))
+  list(heatmap = hm, impact = hm, alpha = c(intercept, hm), approx_fn = pred, neighbourhood = list(X = nhX, Y = nhY), name = sprintf("SHAP (%s)", deletion))
+}
+
+# Give a random vector as a linear approximation
+rndexpl_emnist <- function(X,
+                           Y,
+                           index,
+                           predict_fn = NULL,
+                           mean = 0,
+                           stddv = 0.25,
+                           ...) {
+  hm <- rnorm(ncol(X) + 1, mean, stddv)
+  nh <- list(X = X, Y = Y)
+  approx_fn <- function(X) sigmoid(X %*% hm[-1] + hm[1])
+  list(heatmap = hm[-1], impact = hm[-1], alpha = hm, approx_fn = approx_fn, neighbourhood = nh, name = "Random")
+}
+
+# SLISE explanation, but with output the same as the other explainers
+slise_emnist <- function(X,
+                         Y,
+                         index,
+                         predict_fn = NULL,
+                         ...,
+                         epsilon = 0.5,
+                         lambda = 2) {
+  expl <- slise.explain(X, Y, epsilon, index, lambda1 = lambda, lambda2 = lambda, logit = TRUE)
+  nh <- list(X = X[expl$subset, ], Y = Y[expl$subset])
+  approx_fn <- function(X) predict(expl, X)
+  coef <- expl$alpha[-1]
+  list(heatmap = coef, impact = coef * (X[index, ] - 0.5), alpha = expl$alpha, approx_fn = approx_fn, neighbourhood = nh, name = "SLISE")
+}
+
+# SLISE explanation with distance weights, but with output the same as the other explainers
+distslise_emnist <- function(X,
+                             Y,
+                             index,
+                             predict_fn = NULL,
+                             epsilon = 0.5,
+                             lambda = 2,
+                             distance = cosine_distance,
+                             divide_by_median_dist = FALSE,
+                             ...) {
+  dist <- apply(X, 1, distance, X[index, ])
+  if (divide_by_median_dist) {
+    dist <- dist / median(dist)
+  }
+  weight <- exp(-dist)
+  weight <- weight * length(Y) / sum(weight) # uniformly scale so that the same lambdas can be used
+  expl <- slise.explain(X, Y, epsilon, index, lambda1 = lambda, lambda2 = lambda, weight = weight, logit = TRUE)
+  nh <- list(X = X[expl$subset, ], Y = Y[expl$subset])
+  approx_fn <- function(X) predict(expl, X)
+  coef <- expl$alpha[-1]
+  list(heatmap = coef, impact = coef * (X[index, ] - 0.5), alpha = expl$alpha, approx_fn = approx_fn, neighbourhood = nh, name = "SLISE (weighted)")
+}
+
+# LIME for explaining EMNIST, with different superpixel sizes
+lime_emnist <- function(X,
+                        Y,
+                        index,
+                        predict_fn,
+                        segmentation = "original",
+                        samples = 10000,
+                        verbose = FALSE,
+                        similarity = 0.2,
+                        ...) {
+  lime <- reticulate::import("lime")
+  explainer <- lime$lime_image$LimeImageExplainer(verbose = verbose)
+  py_img <- reticulate::r_to_py(matrix(X[index, ], 28, 28))
+  nhX <- NULL
+  nhY <- NULL
+  py_pred <- reticulate::py_func(function(x) {
+    x <- x[, , , 1]
+    dim(x) <- c(dim(x)[1], length(x) / dim(x)[1])
+    out <- predict_fn(x)
+    nhX <<- rbind(nhX, x)
+    nhY <<- c(nhY, out)
+    out
+  })
+  if (segmentation == "small") {
+    segmenter <- lime$wrappers$scikit_image$SegmentationAlgorithm("slic", compactness = 0.5, convert2lab = FALSE, start_label = as.integer(0))
+  } else if (segmentation == "original") {
+    # This is taken from the MNIST example in the LIME repository (August 2021):
+    segmenter <- lime$wrappers$scikit_image$SegmentationAlgorithm("quickshift", kernel_size = 1, max_dist = 200, ratio = 0.2, start_label = as.integer(0))
+  } else if (segmentation == "pixel") {
+    segmenter <- lime$wrappers$scikit_image$SegmentationAlgorithm("quickshift", kernel_size = 1, max_dist = 0.1, ratio = 1, convert2lab = FALSE, start_label = as.integer(0))
+  } else {
+    stop("Unknown segmentation (must be one of original, small, or pixel)")
+  }
+  expl <- explainer$explain_instance(
+    py_img,
+    classifier_fn = py_pred,
+    num_samples = as.integer(samples),
+    batch_size = as.integer(1000),
+    segmentation_fn = segmenter,
+    hide_color = 0.5
+  )
+  intercept <- c(expl$intercept[[1]])
+  local_exp <- c(expl$local_exp[[1]])
+  segments <- c(expl$segments)
+  img <- segments * 0
+  for (e in local_exp) {
+    img[segments == e[[1]]] <- e[[2]]
+  }
+  img <- c(img)
+  alpha <- c(intercept, sapply(local_exp, function(e) e[[2]]))
+  pred <- function(x) {
+    dim(x) <- c(length(x) / 784, 784)
+    apply(x, 1, function(x) {
+      out <- intercept
+      for (e in local_exp) {
+        mask <- segments == e[[1]]
+        if (mean(abs(X[index, mask] - x[mask])) < similarity) {
+          out <- out + e[[2]]
+        }
+      }
+      out
+    })
+  }
+  list(heatmap = img, impact = img, alpha = alpha, approx_fn = pred, neighbourhood = list(X = nhX, Y = nhY), name = sprintf("LIME (%s)", segmentation))
+}
+
+
+
+#' Create wrappers for the classifier so LIME can use them
+#' This can be used instead of as_classifier
+#'
+#' @param model classifier
+#' @param labels class names
+#'
+#' @return the classifier wrapped so that its predict method returns class probabilities
+#'
+lime_model_wrapper <- function(model, labels) {
+  if ("svm" %in% class(model)) {
+    as_classifier(structure(list(svm = model), class = "svm_wrapper"), labels)
+  } else if ("lr" %in% class(model)) {
+    as_classifier(structure(list(model = model), class = "lr_wrapper"), labels)
+  } else if ("keras.engine.training.Model" %in% class(model)) {
+    as_classifier(structure(list(model = model, labels = labels), class = "keras_wrapper"), labels)
+  } else {
+    as_classifier(model, labels)
+  }
+}
+
+predict.svm_wrapper <- function(model, newdata, ...) {
+  p_svm <- predict(model$svm, newdata = newdata, probability = TRUE)
+  p <- unname(attr(p_svm, "probabilities"))[, 1]
+  cbind(1 - p, p)
+}
+
+predict.lr_wrapper <- function(model, newdata, ...) {
+  p <- predict(model$model, newdata = newdata)
+  p <- sigmoid(unname(p))
+  cbind(1 - p, p)
+}
+
+predict.keras_wrapper <- function(model, newdata, type = "raw", ...) {
+  res <- predict(model$model, as.matrix(newdata))
+  if (type == "raw") {
+    data.frame(Response = res[, 1])
+  } else {
+    colnames(res) <- model$labels
+    as.data.frame(res, check.names = FALSE)
+  }
+}
\ No newline at end of file
diff --git a/experiments/explanations/utils.R b/experiments/explanations/utils.R
new file mode 100644
index 0000000..372f80e
--- /dev/null
+++ b/experiments/explanations/utils.R
@@ -0,0 +1,53 @@
+# Helper methods for the experiments
+
+library(devtools)
+devtools::load_all()
+options(matprod = "blas")
+library(ggplot2)
+
+# Use the correct number of cores if running on a Slurm cluster
+cores_from_slurm <- as.integer(Sys.getenv("SLURM_CPUS_PER_TASK"))
+if (is.finite(cores_from_slurm) && cores_from_slurm > 1) {
+  options(mc.cores = cores_from_slurm)
+}
+
+
+euclidean_distance <- function(x, y) sqrt(sum((x - y)^2))
+cosine_distance <- function(x, y) 1 - x %*% y / sqrt((x %*% x) * (y %*% y))
+
+theme_paper <- function(...) {
+  theme_bw() +
+    theme(
+      legend.title = element_blank(),
+      strip.background = element_blank(),
+      strip.text = element_text(color = "black", size = 12),
+      panel.spacing.x = unit(0.5, "cm"),
+      panel.grid.minor = element_blank(),
+      panel.border = element_blank(),
+      axis.line = element_line(colour = "black"),
+      axis.text = element_text(size = 8),
+      axis.title = element_text(size = 11),
+      ...
+    )
+}
+
+theme_image <- function(..., aspect.ratio = 1) {
+  theme_bw() +
+    theme(
+      legend.title = element_blank(),
+      strip.background = element_blank(),
+      strip.text = element_text(color = "black", size = 12),
+      panel.spacing.x = unit(0.0, "cm"),
+      panel.grid.minor = element_blank(),
+      panel.border = element_blank(),
+      # panel.border = element_rect(colour = "black"),
+      axis.title = element_blank(),
+      axis.text = element_blank(),
+      axis.ticks = element_blank(),
+      aspect.ratio = aspect.ratio,
+      ...
+    )
+}
+
+# Factor where the levels are not sorted
+factord <- function(x, ...) factor(x, unique(x), ...)
diff --git a/experiments/regression/README.md b/experiments/regression/README.md
new file mode 100644
index 0000000..ed92d0d
--- /dev/null
+++ b/experiments/regression/README.md
@@ -0,0 +1,38 @@
+# Experiments for the Robust Regression paper
+
+## Required Packages
+
+```R
+install.packages(c(
+    "devtools",
+    "roxygen2",
+    "Rcpp",
+    "RcppArmadillo",
+    "lbfgs",
+    "dplyr",
+    "tidyr",
+    "ggplot2",
+    "glmnet",
+    "robustbase",
+    "reticulate",
+    "R.utils",
+    "ggrepel",
+    "lemon",
+    "xtable",
+    "robustHD",
+    "MTE",
+    "conquer"
+), Ncpus = 4)
+## If using MRO 3.5 then a newer checkpoint of the repository is needed for `conquer`:
+# install.packages("conquer", repos='https://mran.microsoft.com/snapshot/2020-08-28', Ncpus = 4)
+## If using R 4.0 then `robustHD` and `MTE` need to be compiled from GitHub:
+# devtools::install_github("aalfons/robustHD")
+# devtools::install_git("https://github.com/shaobo-li/MTE.git")
+```
+
+Furthermore, some experiments require `scikit-learn` (for RANSAC) accessed through `reticulate`.
+
+## Datasets
+
+Most datasets are downloaded when used; run `experiments/regression/data.R` to download all datasets in advance.
+Some datasets can also be acquired and pre-processed by running the respective `experiments/data/retrieve_*.R` scripts; if those have not been run, the corresponding datasets from the conference paper supplements are downloaded instead.
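+
+## RANSAC via reticulate
+
+A minimal sketch of how the RANSAC comparison can be reached through `reticulate`
+(illustrative only; it assumes `scikit-learn` is installed in the Python
+environment that `reticulate` picks up):
+
+```R
+library(reticulate)
+sklearn <- import("sklearn.linear_model")
+
+# Toy data: a linear model plus a few gross outliers
+X <- matrix(rnorm(200), 100, 2)
+y <- as.vector(X %*% c(1, -2)) + rnorm(100, 0, 0.1)
+y[1:10] <- 10
+
+ransac <- sklearn$RANSACRegressor()
+ransac$fit(X, y)
+ransac$estimator_$coef_ # coefficients fitted on the detected inliers
+```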
diff --git a/experiments/regression/data.R b/experiments/regression/data.R
new file mode 100644
index 0000000..d995ee4
--- /dev/null
+++ b/experiments/regression/data.R
@@ -0,0 +1,556 @@
+# This script contains methods for downloading and loading datasets
+
+DATA_DIR <- "experiments/data"
+
+.data_download <- function(filename, url, path = DATA_DIR) {
+  filepath <- file.path(path, filename)
+  if (!file.exists(filepath)) {
+    dir.create(path, showWarnings = FALSE)
+    download.file(url, filepath)
+  }
+  filepath
+}
+
+.data_exists <- function(filename, path = DATA_DIR) {
+  file.exists(file.path(path, filename))
+}
+
+# Create a list for the dataset with automatically scaled lambdas
+.datacontainer <- function(X, Y, name, epsilon_lg, epsilon_md, epsilon_sm, lambda = 1e-4, ...) {
+  list(
+    X = X,
+    Y = Y,
+    name = name,
+    epsilon_lg = epsilon_lg,
+    epsilon_md = epsilon_md,
+    epsilon_sm = epsilon_sm,
+    lambda = lambda,
+    lambda_lg = lambda * nrow(X) * epsilon_lg^2,
+    lambda_md = lambda * nrow(X) * epsilon_md^2,
+    lambda_sm = lambda * nrow(X) * epsilon_sm^2,
+    ...
+  )
+}
+
+
+# Load the Forest Fires (regression) dataset.
+# For more information see: https://archive.ics.uci.edu/ml/datasets/Forest+Fires
+# This is the eleventh most popular dataset on the UCI ML repository.
+#
+# The original paper can be found here: http://www3.dsi.uminho.pt/pcortez/fires.pdf
+# In the paper they note that a linear model mostly finds the "mean", meaning that the dataset
+# is "too difficult" for a linear model.
+# One option would be to ignore most items with area=0 (but that is not the original problem!).
+# This does not fix OLS, but SLISE with small epsilons starts working.
+# One odd thing is that lower temperatures correspond to worse fires, but this can be
+# explained by worse fires during the winter (why?).
+# Another odd thing is that rain makes the fires worse, but this effect is the first to
+# disappear when regularisation is added (overfitting?).
+data_forest_fires <- function(raw = FALSE, balance = TRUE, months = TRUE, days = TRUE, coord = FALSE, ...) {
+  data <- read.csv(.data_download(
+    "forest_fires.csv",
+    "https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv"
+  ))
+  if (balance) {
+    mask <- which(data$area > 0)
+    mask <- c(sample(which(data$area == 0), length(mask) / 9), mask)
+    data <- data[sort(mask), ]
+  }
+  if (raw) {
+    data
+  } else {
+    if (months) {
+      data$month <- factor(data$month, tolower(month.abb), month.name)
+    } else {
+      data$month <- NULL
+    }
+    if (days) {
+      data$day <- factor(
+        data$day,
+        c("mon", "tue", "wed", "thu", "fri", "sat", "sun"),
+        c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
+      )
+    } else {
+      data$day <- NULL
+    }
+    if (!coord) {
+      data$X <- data$Y <- NULL
+    }
+    .datacontainer(
+      model.matrix(area ~ 1 + ., data)[, -1],
+      log(data$area + 1),
+      "Forest Fires",
+      0.1, 0.1, 0.1
+    )
+  }
+}
+
+# Load the Wine Quality (regression) dataset
+# For more information see: https://archive.ics.uci.edu/ml/datasets/Wine+Quality
+# This is the sixth most popular dataset on the UCI ML repository.
+#
+# Since the targets are (discrete) integers, make sure that epsilon is large enough to "bridge the gaps".
+# The different variables have widely different scales, so normalisation is recommended for interpretation.
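+#
+# For example (illustrative): the quality grades are whole numbers, so an
+# epsilon clearly below 0.5 cannot "bridge the gap" between two neighbouring
+# grades; something like
+#     wine <- data_wine_quality("red")
+#     slise.fit(wine$X, wine$Y, wine$epsilon_lg, wine$lambda_lg)
+# therefore uses the larger pre-chosen epsilon (0.51) rather than a smaller one.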
+#
+# The original paper can be found here: https://doi.org/10.1016/j.dss.2009.05.016
+# After a quick test, SLISE performs better than the linear regression reported
+# in the paper, but slightly worse than the SVM reported in the paper.
+# In the paper they separated red and white wine, but combining them causes no issues.
+data_wine_quality <- function(type = "all", ...) {
+  type <- tolower(type)
+  if (type == "all" || type == "red") {
+    path_red <- .data_download(
+      "wine_quality_red.csv",
+      "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
+    )
+    red <- read.csv(path_red, header = TRUE, sep = ";")
+  }
+  if (type == "all" || type == "white") {
+    path_white <- .data_download(
+      "wine_quality_white.csv",
+      "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
+    )
+    white <- read.csv(path_white, header = TRUE, sep = ";")
+  }
+  if (type == "all") {
+    red$type <- factor("red", c("red", "white"))
+    white$type <- factor("white", c("red", "white"))
+    data <- rbind(red, white)
+  } else if (type == "red") {
+    data <- red
+  } else if (type == "white") {
+    data <- white
+  } else {
+    stop("Unknown type of wine")
+  }
+  .datacontainer(
+    X = model.matrix(quality ~ 1 + ., data)[, -1],
+    Y = data$quality,
+    name = "Wine Quality",
+    epsilon_lg = 0.51,
+    epsilon_md = 0.27,
+    epsilon_sm = 0.14
+  )
+}
+
+# Load the Student Performance (regression) dataset
+# For more information see: https://archive.ics.uci.edu/ml/datasets/Student+Performance
+# This is the twelfth most popular dataset on the UCI ML repository.
+#
+# Since the grades for period 3 (the target) are strongly correlated with the grades for
+# periods 1 and 2, I have chosen to remove those.
+#
+# The original paper can be found here: http://www3.dsi.uminho.pt/pcortez/student.pdf
+# After a quick test, SLISE performs similarly to, and sometimes better than, the classifiers
+# in the paper (even though SLISE is used for regression).
+# Comparing to the regression methods is difficult, since they only report RMSE, which does
+# not take into account that SLISE ignores some items.
+data_student_performance <- function(subject = "both", ...) {
+  subject <- tolower(subject)
+  path <- .data_download(
+    "student_performance.zip",
+    "https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip"
+  )
+  if (subject == "math" || subject == "maths" || subject == "mat") {
+    data <- read.csv(unz(path, "student-mat.csv"), sep = ";")
+  } else if (subject == "portuguese" || subject == "por") {
+    data <- read.csv(unz(path, "student-por.csv"), sep = ";")
+  } else if (subject == "both") {
+    data <- rbind(
+      read.csv(unz(path, "student-mat.csv"), sep = ";"),
+      read.csv(unz(path, "student-por.csv"), sep = ";")
+    )
+  } else {
+    stop("Unknown subject")
+  }
+  .datacontainer(
+    X = model.matrix(G3 ~ 1 + . - G2 - G1, data)[, -1],
+    Y = data$G3,
+    name = "Student",
+    epsilon_lg = 0.90,
+    epsilon_md = 0.45,
+    epsilon_sm = 0.22
+  )
+}
+
+# Load the Superconductivity dataset
+# For more information see: https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data
+#
+# The original paper can be found here: https://doi.org/10.1016/j.commatsci.2018.07.052
+data_superconductivity <- function(n = 10000, ...)
{ + path <- .data_download( + "superconductivity.zip", + "https://archive.ics.uci.edu/ml/machine-learning-databases/00464/superconduct.zip" + ) + data <- read.csv(unz(path, "train.csv")) + sel <- sample.int(nrow(data), min(n, nrow(data))) + .datacontainer( + X = model.matrix(critical_temp ~ 1 + ., data)[sel, -1], + Y = data$critical_temp[sel], + name = "Superconductivity", + epsilon_lg = 0.70, + epsilon_md = 0.30, + epsilon_sm = 0.14 + ) +} + + +# Load the Air Quality dataset +# For more information see: https://archive.ics.uci.edu/ml/datasets/Air+Quality +# +# The original paper can be found here: http://dx.doi.org/10.1016/j.snb.2007.09.060 +# This dataset has also been used in the drifter paper +data_air_quality <- function(...) { + path <- .data_download( + "air_quality.zip", + "https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip" + ) + data <- read.csv2(unz(path, "AirQualityUCI.csv")) + data <- data[, -c(1, 2, 5, ncol(data), ncol(data) - 1)] + Y <- data[, 1] + X <- as.matrix(data[, -1]) + X[X == -200] <- NA + Y[Y == -200] <- NA + mask <- complete.cases(Y) & complete.cases(X) + .datacontainer( + X = X[mask, ], + Y = Y[mask], + name = "Air Quality", + epsilon_lg = 0.24, + epsilon_md = 0.12, + epsilon_sm = 0.07 + ) +} + + +#' Get Physics "jets" data +#' +#' @param n number of jets +#' +#' @return list(X, Y, R=real_class, model) +#' +data_jets <- function(n = 10000, ...) { + path <- file.path(DATA_DIR, "jets.rds") + if (!file.exists(path)) { + warning("Using the data from the conference supplements instead of running `experiments/data/retrieve_jets.R`") + supp_path <- .data_download( + "supplement.zip", + "https://github.com/edahelsinki/slise/releases/download/v1.0/supplement.zip" + ) + path <- unzip(supp_path, path) + } + data <- readRDS(path) + if (n > 0) { + mask <- sample(c(sample(which(data$R == 1), n / 2), sample(which(data$R == 0), n / 2))) + data$R <- data$R[mask] + data$Y <- data$Y[mask] + data$X <- data$X[mask, ] + } + .datacontainer( + X = data$X, + Y = limited_logit(data$Y), + R = data$R, + name = "Physics", + epsilon_lg = 0.33, + epsilon_md = 0.17, + epsilon_sm = 0.09 + ) +} + +#' Get imdb review data +#' +#' @param n number of reviews +#' @param model classifier (svm, rf, lr, elm) +#' +#' @return list(X, Y, R=real_class, model) +#' +data_imdb <- function(n = 10000, model = "svm", ...) 
{
+  path_data <- file.path(DATA_DIR, "aclimdb_data_test.rds")
+  path_model <- file.path(DATA_DIR, paste0("aclimdb_model_", model, ".rds"))
+  if (!file.exists(path_data) || !file.exists(path_model)) {
+    warning("Using the data from the conference supplements instead of running `experiments/data/retrieve_aclimdb.R`")
+    supp_path <- .data_download(
+      "supplement.zip",
+      "https://github.com/edahelsinki/slise/releases/download/v1.0/supplement.zip"
+    )
+    path_data <- unzip(supp_path, path_data)
+    path_model <- unzip(supp_path, path_model)
+  }
+  tmp <- readRDS(path_data)
+  X <- as.matrix(tmp$data)
+  R <- as.numeric(tmp$class == "pos")
+  Y <- as.numeric(tmp[[paste0("prob_", model)]])
+  m <- readRDS(path_model)
+  class(m) <- c(model, class(m))
+  if (n > 0 && n < length(Y)) {
+    mask <- sample.int(length(Y), n)
+    X <- X[mask, ]
+    Y <- Y[mask]
+    R <- R[mask]
+  }
+  .datacontainer(
+    X = X,
+    Y = limited_logit(Y),
+    R = R,
+    model = m,
+    name = "IMDB",
+    epsilon_lg = 0.29,
+    epsilon_md = 0.11,
+    epsilon_sm = 0.05
+  )
+}
+
+
+# Get EMNIST data
+# Params:
+#   n: number of images
+#   d: number of dimensions (default == -1 == all dimensions)
+#   digit: the digit for the "correct" class
+#   th: discard dimensions with variance less than th
+#   balance: should the number of samples in each class be balanced
+data_emnist <- function(n = 10000, d = -1, digit = 2, th = 0, balance = TRUE, digit_in_name = FALSE, ...) {
+  path_img <- file.path(DATA_DIR, "emnist.rds")
+  path_pred <- file.path(DATA_DIR, "emnist_digits.rds")
+  if (!file.exists(path_img) || !file.exists(path_pred)) {
+    warning("Using the data from the conference supplements instead of running the corresponding retrieval script in `experiments/data`")
+    supp_path <- .data_download(
+      "supplement.zip",
+      "https://github.com/edahelsinki/slise/releases/download/v1.0/supplement.zip"
+    )
+    path_img <- unzip(supp_path, path_img)
+    path_pred <- unzip(supp_path, path_pred)
+  }
+  emnist <- readRDS(path_img)
+  X <- emnist$image
+  R <- emnist$label
+  # Adjusting for softmax
+  Y <- readRDS(path_pred)
+  Y2 <- apply(Y[, -digit - 1], 1, max)
+  Y <- Y[, digit + 1]
+  Y <- Y / (Y + Y2)
+  # Balancing classes
+  if (balance) {
+    mask1 <- which(R == digit)
+    mask2 <- which(R != digit)
+    n <- min(length(mask1), length(mask2), if (n > 0) n / 2 else length(Y))
+    mask <- sample(c(sample(mask1, n), sample(mask2, n)))
+    Y <- Y[mask]
+    X <- X[mask, , drop = FALSE]
+    R <- R[mask]
+  } else if (n > 0 && n < length(Y)) {
+    mask <- sample.int(length(Y), n)
+    X <- X[mask, , drop = FALSE]
+    Y <- Y[mask]
+    R <- R[mask]
+  }
+  colnames(X) <- paste0("pixel_", 1:ncol(X))
+  if (th >= 0) {
+    X <- X[, apply(X, 2, var, na.rm = TRUE) > th, drop = FALSE]
+  }
+  if (d > 0 && d < ncol(X)) {
+    X <- X[, sort(sample.int(ncol(X), d)), drop = FALSE]
+  }
+  .datacontainer(
+    X = X,
+    Y = limited_logit(Y),
+    R = R,
+    name = if (digit_in_name) paste("EMNIST", digit) else "EMNIST",
+    epsilon_lg = 0.28,
+    epsilon_md = 0.13,
+    epsilon_sm = 0.05
+  )
+}
+
+#' Create Synthetic data
+#'
+#' @param n number of items
+#' @param d number of columns
+#' @param epsilon noise scale
+#' @param extra_models number of "wrong" models
+#' @param sparsity how large a fraction of the coefficients should be zero
+#'
+#' @return list(X, Y, alpha)
+#'
+data_create <- function(n = 1000, d = 50, epsilon = 1.0, extra_models = 6, sparsity = 0.0, ...) {
+  X <- do.call(rbind, lapply(1:10, function(i) {
+    t(matrix(rnorm(n * d / 10, rnorm(d) * 2, runif(d)), nrow = d))
+  }))[c(sapply(1:10, seq, n, 10)), ]
+  Y <- rep(0, n)
+  start <- 1
+  if (extra_models > 0) {
+    for (i in 1:extra_models) {
+      end <- start + n / 10 - 1
+      alpha <- runif(d + 1, -1, 1)
+      Y[start:end] <- X[start:end, ] %*% alpha[-1] + alpha[1] + runif(n / 10, 0, epsilon)
+      start <- end + 1
+    }
+  }
+  alpha <- runif(d + 1, -1, 1)
+  sparsity <- round(d * sparsity)
+  if (sparsity > 0) {
+    alpha[which_min_n(abs(alpha[-1]), sparsity) + 1] <- 0
+  }
+  Y[start:n] <- X[start:n, ] %*% alpha[-1] + alpha[1] + runif(n - start + 1, 0, epsilon)
+  .datacontainer(
+    X = X,
+    Y = Y,
+    alpha = alpha,
+    name = "Synthetic",
+    epsilon_lg = 0.79,
+    epsilon_md = 0.23,
+    epsilon_sm = 0.11
+  )
+}
+
+#' Create Synthetic data that tries to confuse lasso, ols, and zeros initialisation.
+#' However, this setup does not always achieve that goal.
+#'
+#' @param n number of items
+#' @param d number of columns
+#' @param epsilon noise scale
+#' @param clean the fraction of clean y:s
+#'
+#' @return list(X, Y, alpha)
+#'
+data_create2 <- function(n = 1000, d = 50, epsilon = 1.0, clean = 0.5, ...) {
+  X <- do.call(rbind, lapply(1:10, function(i) {
+    t(matrix(rnorm(n * d / 10, rnorm(d) * 3, runif(d)), nrow = d))
+  }))[c(sapply(1:10, seq, n, 10)), ]
+  alpha <- runif(d + 1, -1, 1)
+  Y <- X %*% alpha[-1] + alpha[1] + runif(n, 0, epsilon)
+  start <- as.integer(clean * n)
+  left <- n - start
+  if (left > 0) {
+    end <- start + as.integer(left * 0.5)
+    start <- start + 1
+    Y[start:end] <- 0
+    start <- end + 1
+    may <- max(abs(Y))
+    Xi <- add_intercept_column(X)
+    for (i in start:n) {
+      lm <- .lm.fit(Xi[1:(i - 1), ], Y[1:(i - 1)])
+      yi <- sum(Xi[i, ] * lm$coefficients)
+      if (yi > 0) {
+        Y[i] <- -runif(1, 0, may)
+      } else if (yi < 0) {
+        Y[i] <- runif(1, 0, may)
+      }
+    }
+  }
+  .datacontainer(
+    X = X,
+    Y = Y,
+    alpha = alpha,
+    name = "Synthetic",
+    epsilon_lg = 0.79,
+    epsilon_md = 0.23,
+    epsilon_sm = 0.11
+  )
+}
+
+robust_data <- function(data, scale_x = TRUE, scale_y = TRUE) {
+  data2 <- rlang::duplicate(data, shallow = TRUE)
+  if (scale_x) {
+    data2$X <- scale_robust(remove_constant_columns(data$X))
+  } else {
+    data2$X <- remove_constant_columns(data$X)
+  }
+  if (scale_y) {
+    data2$Y <- scale_robust(data$Y)
+  }
+  data2
+}
+
+outlier_data <- function(data, vertical = 0.25, leverage = 0.25) {
+  n <- length(data$Y)
+  rnd <- sample.int(n)
+  X <- data$X[rnd, ]
+  Y <- data$Y[rnd]
+  start <- 1
+  vertical <- round(n * vertical)
+  if (vertical > 0) {
+    sel <- start:(start + vertical - 1)
+    start <- start + vertical
+    Y[sel] <- rnorm(vertical, 10, 1)
+  }
+  leverage <- round(n * leverage)
+  if (leverage > 0) {
+    sel <- start:(start + leverage - 1)
+    start <- start + leverage
+    X[sel, ] <- rnorm(ncol(X) * leverage, 10, 1)
+  }
+  data2 <- rlang::duplicate(data, shallow = TRUE)
+  data2$X <- X
+  data2$Y <- Y
+  data2
+}
+
+data_at_index <- function(index, ..., seed_start = 42, normalise = TRUE) {
+  set.seed(seed_start + floor((index - 1) / 17) + 1)
+  index <- (index - 1) %% 17 + 1
+  data <- switch(index,
+    `1` = data_student_performance(...),
+    `2` = data_air_quality(...),
+    `3` = data_superconductivity(...),
+    `4` = data_jets(...),
+    `5` = data_imdb(...),
+    `6` = data_emnist(digit = 0, ...),
+    `7` = data_emnist(digit = 1, ...),
+    `8` = data_emnist(digit = 2, ...),
+    `9` = data_emnist(digit = 3, ...),
+    `10` = data_emnist(digit = 4, ...),
+    `11` = data_emnist(digit = 5, ...),
+    `12` = data_emnist(digit = 6, ...),
+    `13` = data_emnist(digit = 7, ...),
+    `14` = data_emnist(digit = 8, ...),
+    `15` = data_emnist(digit = 9, ...),
+    `16` = data_create(n = 2000, d = 50, ...),
+    `17` = data_create(n = 1000, d = 20, ...)
+  )
+  if (normalise) {
+    data <- robust_data(data, !(index %in% c(5, 7:16)), TRUE)
+  }
+  data
+}
+
+data_all <- function(indices = 1:17, ...) {
+  lapply(indices, data_at_index, ...)
+}
+
+latex_name <- function(data_name) {
+  switch(data_name,
+    "Wine Quality" = "\\winequality",
+    "Student" = "\\student",
+    "Air Quality" = "\\airquality",
+    "Superconductivity" = "\\superconductivity",
+    "Physics" = "\\physics",
+    "IMDB" = "\\imdb",
+    "EMNIST" = "\\emnist",
+    "Synthetic" = "\\synthetic",
+    data_name
+  )
+}
+
+data_name_factor <- function() {
+  names <- c(
+    "Air Quality",
+    "Student",
+    "Superconductivity",
+    "Synthetic",
+    "Physics",
+    "EMNIST",
+    "IMDB"
+  )
+  factor(names, names)
+}
+
+## --------------------------------------------------
+## If run from Rscript
+## --------------------------------------------------
+if (sys.nframe() == 0L) {
+  devtools::load_all()
+  cat("Preparing all datasets...")
+  tmpo <- data_all()
+  cat("Done!\n")
+}
diff --git a/experiments/regression/exp_beta_max.R b/experiments/regression/exp_beta_max.R
new file mode 100644
index 0000000..03371aa
--- /dev/null
+++ b/experiments/regression/exp_beta_max.R
@@ -0,0 +1,163 @@
+## --------------------------------------------------
+## Perform the experiment for selecting the best beta_max
+## --------------------------------------------------
+##
+## Usage:
+##
+## Rscript --vanilla experiments/regression/exp_beta_max.R index
+##
+## Parameters:
+##
+## index : Specify the job index (1-170)
+##
+## Notes:
+## This is designed to be run in parallel on a cluster.
+## The number of jobs should be a multiple of 17.
+## Run it again without an index to compile a plot.
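+##
+## Cluster usage (illustrative, assuming a Slurm-style job array where the
+## task id supplies the index):
+## Rscript --vanilla experiments/regression/exp_beta_max.R $SLURM_ARRAY_TASK_ID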
+## +## Running Manually: +## Rscript --vanilla experiments/regression/exp_beta_max.R 1:17 +## Rscript --vanilla experiments/regression/exp_beta_max.R +## -------------------------------------------------- + + +## -------------------------------------------------- +## Libraries +## -------------------------------------------------- +suppressMessages(require(ggplot2)) +suppressMessages(require(ggrepel)) +suppressMessages(require(tidyr)) +suppressMessages(require(lemon)) +suppressMessages(source("experiments/regression/data.R")) +suppressMessages(source("experiments/regression/utils.R")) + + +## -------------------------------------------------- +## Calculation +## -------------------------------------------------- +calc_beta_max <- function(dataset, maxes = c(0.5, 1, 5, 10, 20, 30)) { + do.call(rbind, lapply(maxes, function(b) { + time <- system.time( + slise <- slise.fit( + dataset$X, + dataset$Y, + dataset$epsilon_sm, + dataset$lambda_sm, + beta_max = b / dataset$epsilon_sm^2 + ) + )[3] + data.frame( + dataset = dataset$name, + beta = b, + time = time, + loss = slise$value, + subset = mean(slise$subset), + stringsAsFactors = TRUE + ) + })) +} + + +## -------------------------------------------------- +## Plot +## -------------------------------------------------- +plot_beta_max <- function(df) { + df <- df %>% + gather(m, value, c("loss", "time")) %>% + mutate(m = factor(m, labels = c("Loss", "Time [s]"))) %>% + group_by(dataset, m) %>% + mutate(value = value / median(abs(value))) %>% # / abs(median(value))) %>% + group_by(dataset, m, beta) %>% + mutate(value = median(value)) + ggplot(df, aes(beta, value, col = dataset, linetype = dataset)) + + theme_bw() + + theme_paper() + + geom_line() + + facet_wrap(vars(m), nrow = 1, scales = "free_y") + + scale_linetype_discrete(name = NULL) + + scale_color_brewer(type = "qual", palette = "Dark2", name = NULL) + + xlab(expression(beta[max] ~ epsilon^2)) + + ylab("Normalised Value") +} + +plot_dataset <- function(df) { + df <- df %>% + group_by(dataset, beta) %>% + summarize(Time = mean(time), Loss = mean(loss)) %>% + gather(m, value, c("Loss", "Time")) + ggplot(df, aes(beta, value)) + + theme_bw() + + theme_paper() + + geom_line() + + facet_wrap(vars(dataset, m), nrow = 4, scales = "free") + + scale_y_continuous(labels = function(x) sprintf("%.2f", x)) + + theme(legend.title = element_text()) + + theme(strip.text = element_text(margin = margin(0, 0, 0.1, 0, "cm"))) + + xlab(expression(beta[max] ~ epsilon^2)) + + ylab(NULL) +} + +plot_scatter <- function(df) { + df <- df %>% + mutate(beta = factor(sprintf("%2g", beta), sprintf("%2g", unique(beta)))) %>% + group_by(dataset, beta) %>% + summarize(time = mean(time), loss = mean(loss)) + labels <- sapply(levels(df$beta), function(l) as.expression(bquote(.(l)/epsilon^2))) + gg <- ggplot(df) + + theme_bw() + + theme_paper() + + geom_point(aes(time, loss, shape = beta, col = beta), size = 3) + + geom_text_repel( + aes(time, loss, label = beta), + # hjust = "inward", + # vjust = "inward", + force = 0.35, + min.segment.length = 1, + segment.alpha = 0.3, + box.padding = 0.1, + nudge_x = 0.01 + ) + + facet_wrap(vars(dataset), nrow = 2, scales = "free") + + scale_shape_manual( + name = expression(beta[max]), + values = c(1, 2, 0, 3, 4, 18, 20), + labels = labels + ) + + scale_color_brewer( + name = expression(beta[max]), + palette = "Dark2", + labels = labels + ) + + theme(legend.title = element_text()) + + xlab("Time") + + ylab("Loss") + + guides(col = guide_legend(ncol = 2), shape = guide_legend(ncol = 2)) + 
reposition_legend(gg, "center", panel = "panel-4-2")
+}
+
+
+## --------------------------------------------------
+## If run from Rscript
+## --------------------------------------------------
+if (sys.nframe() == 0L) {
+  args <- commandArgs(trailingOnly = TRUE)
+
+  if (length(args) > 0) {
+    for (index in eval(parse(text = args[1]))) {
+      set.seed(42 + index)
+
+      data <- data_at_index(index, seed_start = 42, normalise = TRUE)
+      cat(sprintf("[beta_max] Init: %2d %s\n", index, data$name))
+
+      df <- calc_beta_max(data)
+
+      save_results(df, "beta_max", index)
+      cat(sprintf("[beta_max] Done: %2d %s\n", index, data$name))
+    }
+  } else {
+    df <- load_results("beta_max")
+    # plot_pdf(plot_beta_max(df), "beta_max", 0.8, 0.3)
+    # plot_pdf(plot_dataset(df), "beta_max2", 1.0, 0.8)
+    plot_pdf(plot_scatter(df), "beta_max_scatter", 1.0, 0.4)
+  }
+}
\ No newline at end of file
diff --git a/experiments/regression/exp_candidates.R b/experiments/regression/exp_candidates.R
new file mode 100644
index 0000000..e3f0e3d
--- /dev/null
+++ b/experiments/regression/exp_candidates.R
@@ -0,0 +1,159 @@
+## --------------------------------------------------
+## Perform the experiment for selecting the best number of candidates
+## --------------------------------------------------
+##
+## Usage:
+##
+## Rscript --vanilla experiments/regression/exp_candidates.R index
+##
+## Parameters:
+##
+## index : Specify the job index
+##
+## Notes:
+## This is designed to be run in parallel on a cluster.
+## The number of jobs should be a multiple of 17.
+## Run it again without an index to compile a plot.
+##
+## Running Manually:
+## Rscript --vanilla experiments/regression/exp_candidates.R 1:17
+## Rscript --vanilla experiments/regression/exp_candidates.R
+##
+## --------------------------------------------------
+
+
+## --------------------------------------------------
+## Libraries
+## --------------------------------------------------
+suppressMessages(require(ggplot2))
+suppressMessages(require(ggrepel))
+suppressMessages(require(tidyr))
+suppressMessages(require(lemon))
+suppressMessages(source("experiments/regression/data.R"))
+suppressMessages(source("experiments/regression/utils.R"))
+
+
+## --------------------------------------------------
+## Calculation
+## --------------------------------------------------
+calc_num_init <- function(dataset, nums = c(50, 100, 200, 500, 1000)) {
+  do.call(rbind, lapply(nums, function(n) {
+    time <- system.time(
+      slise <- slise.fit(
+        dataset$X,
+        dataset$Y,
+        dataset$epsilon_sm,
+        dataset$lambda_sm,
+        num_init = n
+      )
+    )[3]
+    data.frame(
+      dataset = dataset$name,
+      inits = n,
+      time = time,
+      loss = slise$value,
+      subset = mean(slise$subset),
+      stringsAsFactors = TRUE
+    )
+  }))
+}
+
+
+## --------------------------------------------------
+## Plotting
+## --------------------------------------------------
+plot_num_init <- function(df) {
+  df <- df %>%
+    gather(m, value, c("loss", "time")) %>%
+    mutate(m = factor(m, labels = c("Loss", "Time"))) %>%
+    group_by(dataset, m) %>%
+    mutate(value = value / abs(median(value))) %>%
+    group_by(dataset, m, inits) %>%
+    summarise(value = median(value))
+  ggplot(df, aes(inits, value, col = dataset, linetype = dataset)) +
+    theme_bw() +
+    theme_paper() +
+    geom_line() +
+    facet_wrap(vars(m), scales = "free") +
+    scale_linetype_discrete(name = NULL) +
+    scale_color_brewer(type = "qual", palette = "Dark2", name = NULL) +
+    xlab("Number of Candidates") +
+    ylab("Normalised Value")
+}
+
+table_candidates <- function(df) {
+  sizes <-
sort(unique(df$inits)) + datasets <- levels(df$dataset) + cat("\\begin{tabular}{l ", rep("rr", length(datasets)), "}\n") + cat(" \\hline \\toprule\n") + cat(" \\textbf{Candidates}", sprintf("& \\multicolumn{2}{c}{%s}", sapply(datasets, latex_name)), "\\\\\n") + cat(" ", rep("& Loss & Time", length(datasets)), "\\\\\n") + cat(" \\midrule\n") + for (s in sizes) { + df2 <- df %>% filter(inits == s) + cat( + sprintf(" %2d", s), + sapply(datasets, function(d) { + df3 <- df2 %>% filter(dataset == d) + sprintf("& $%.2f$ & $%.2f$", mean(df3$loss), mean(df3$time)) + }), + " \\\\\n" + ) + } + cat(" \\bottomrule\n\\end{tabular}\n") +} + +plot_scatter <- function(df) { + df <- df %>% + mutate(inits = as.factor(sprintf("%4d", inits))) %>% + group_by(dataset, inits) %>% + summarize(time = mean(time), loss = median(loss)) + gg <- ggplot(df) + + theme_bw() + + theme_paper() + + geom_point(aes(time, loss, shape = inits, col = inits), size = 3) + + geom_text_repel( + aes(time, loss, label = inits), + # hjust = "inward", + # vjust = "inward", + force = 0.4, + min.segment.length = 0.75, + box.padding = 0.15, + nudge_x = 0.01 + ) + + facet_wrap(vars(dataset), nrow = 2, scales = "free") + + scale_shape_manual( + name = expression(u[init]), + values = c(1, 2, 0, 3, 4) + ) + + scale_color_brewer(name = expression(u[init]), palette = "Dark2") + + scale_y_continuous(labels = function(s) sprintf("%.2f", s)) + + theme(legend.title = element_text()) + + xlab("Time") + + ylab("Loss") + reposition_legend(gg, "center", panel = "panel-4-2") +} + + +## -------------------------------------------------- +## If run from Rscript +## -------------------------------------------------- +if (sys.nframe() == 0L) { + args <- commandArgs(trailingOnly = TRUE) + + if (length(args) > 0) { + for (index in eval(parse(text = args[1]))) { + data <- data_at_index(index, seed_start = 142, normalise = TRUE) + cat(sprintf("[candidates] Init: %2d %s\n", index, data$name)) + + df <- calc_num_init(data) + + save_results(df, "candidates", index) + cat(sprintf("[candidates] Done: %2d %s\n", index, data$name)) + } + } else { + df <- load_results("candidates") + # plot_pdf(plot_num_init(df), "candidates", 0.8, 0.4) + plot_pdf(plot_scatter(df), "candidates2", 1.0, 0.4) + } +} \ No newline at end of file diff --git a/experiments/regression/exp_epsilon.R b/experiments/regression/exp_epsilon.R new file mode 100644 index 0000000..fbf3089 --- /dev/null +++ b/experiments/regression/exp_epsilon.R @@ -0,0 +1,155 @@ +## -------------------------------------------------- +## Experiment showing the effect of epsilon +## +## +## Usage: +## +## Rscript --vanilla experiments/regression/exp_epsilon.R [index] +## +## Notes: +## This is designed to be run in parallel on a cluster. +## The number of jobs should be a multiple of 17. +## Run it again without an index to compile a plot. 
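+##
+## Reading the results (illustrative): the subset size reported for a given
+## epsilon is the fraction of items whose residual falls below it, i.e.
+##   slise <- slise.fit(data$X, data$Y, e, data$lambda * nrow(data$X) * e^2)
+##   mean(abs(data$Y - predict(slise)) < e)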
+## +## Running locally: +## +## Rscript --vanilla experiments/regression/exp_epsilon.R 1:17 +## Rscript --vanilla experiments/regression/exp_epsilon.R +## +## +## -------------------------------------------------- + +suppressMessages(require(ggplot2)) +suppressMessages(require(lemon)) +suppressMessages(source("experiments/regression/data.R")) +suppressMessages(source("experiments/regression/utils.R")) + + +fit_epsilon <- function(dataset, epsilon = c(seq(0.05, 0.5, 0.05), seq(0.6, 1.2, 0.1))) { + do.call(rbind, lapply(epsilon, function(e) { + slise <- slise.fit( + dataset$X, + dataset$Y, + e, + dataset$lambda * nrow(dataset$X) * e^2 + ) + data.frame( + epsilon = e, + dataset = dataset$name, + error = c(dataset$Y - predict(slise)), + stringsAsFactors = TRUE + ) + })) +} + + +plot_size <- function(df) { + df <- df %>% + mutate(dataset = factor(dataset, unique(dataset))) %>% + group_by(epsilon, dataset) %>% + summarize(size = mean(abs(error) < epsilon[1])) + ggplot(df) + + geom_line(aes(epsilon, size, col = dataset, linetype = dataset), size = 1) + + scale_linetype_discrete(name = NULL) + + scale_color_brewer(type = "qual", palette = "Dark2", name = NULL) + + ylim(c(0, 1)) + + theme_bw() + + theme_paper() + + xlab("\u03B5") + + ylab("Subset size") +} + + +plot_dist <- function(df, eps = c(0.2, 0.4, 0.6, 0.8, 1.0)) { + df <- df %>% + filter(epsilon %in% eps) %>% + mutate(error = error / epsilon, epsilon = sprintf("%g", epsilon)) + gg <- ggplot(df) + + geom_vline(xintercept = c(-1, 1), col = "grey") + + geom_density(aes(error, col = epsilon, linetype = epsilon), size = 1) + + scale_linetype_discrete(name = expression(epsilon)) + + scale_color_brewer(type = "qual", palette = "Dark2", name = expression(epsilon)) + + theme_bw() + + theme_paper() + + scale_x_continuous(breaks = c(-1, 0, 1), labels = c("-\u03B5", "0", "\u03B5")) + + coord_cartesian(xlim = c(-1.2, 1.2)) + + xlab("Relative Error") + + ylab("Density") + + theme(legend.title = element_text()) + + facet_wrap(vars(dataset), ncol = 4, scale = "free") + reposition_legend(gg, "center", panel = "panel-4-2") +} + +plot_opt <- function(df) { + opt <- function(ds, eps, error) { + losses <- df %>% + filter(dataset == ds) %>% + # filter(epsilon == max(epsilon)) %>% # Most LASSO-like + group_by(epsilon) %>% + summarize(loss = loss_sharp_res(0, error^2, eps^2)) + loss_sharp_res(0, error^2, eps^2) / mean(losses$loss) + } + df <- df %>% + group_by(epsilon, dataset) %>% + summarize(opt = opt(dataset[[1]], epsilon[[1]], error)) + ggplot(df) + + geom_line(aes(epsilon, opt, col = dataset, linetype = dataset)) + + scale_linetype_discrete(name = NULL) + + scale_color_brewer(type = "qual", palette = "Dark2", name = NULL) + + theme_bw() + + theme_paper() + + xlab("\u03B5") + + ylab("Loss / Mean Loss") +} + +find_size <- function(df, target_size = 0.75) { + df %>% + group_by(dataset, epsilon) %>% + summarize(size = mean(abs(error) < epsilon[1])) %>% + group_by(dataset) %>% + summarise(epsilon = inverse_interpolated(epsilon, size, target_size)) +} + +table_size <- function(df, sizes = c(0.75, 0.5, 0.3)) { + epsilons <- lapply(sizes, find_size, df = df) + cat("\\begin{tabular}{l ", rep("r", length(sizes)), "}\n", sep = "") + cat(" \\hline \\toprule\n") + cat(" \\textbf{Dataset} &\n\\multicolumn{", length(sizes), "}{c}{\\textbf{$\\varepsilon$ for subset sizes}}\\\\\n") + cat(" ", sapply(sizes * 100, sprintf, fmt = "& \\unit{%.0f}{\\%%}"), "\\\\\n") + cat(" \\midrule\n") + for (d in unique(df$dataset)) { + cat( + " ", + latex_name(d), + sapply(epsilons, 
function(e) sprintf("& $%.2f$", e$epsilon[e$dataset == d])), + " \\\\\n" + ) + } + cat(" \\bottomrule\n\\end{tabular}\n") +} + + +## -------------------------------------------------- +## If run from Rscript +## -------------------------------------------------- +if (sys.nframe() == 0L) { + args <- commandArgs(trailingOnly = TRUE) + + if (length(args) > 0) { + for (index in eval(parse(text = args[1]))) { + data <- data_at_index(index, seed_start = 242, normalise = TRUE) + cat(sprintf("[epsilon] Init: %2d %s\n", index, data$name)) + + res <- fit_epsilon(data) + + save_results(res, "epsilon", index) + cat(sprintf("[epsilon] Done: %2d %s\n", index, data$name)) + } + } else { + df <- load_results("epsilon") + plot_pdf(plot_size(df), "epsilon_size", 0.5, 0.3) + plot_pdf(plot_dist(df), "epsilon_dist", 1.0, 0.5) + cat("\n\n") + table_size(df) + } +} \ No newline at end of file diff --git a/experiments/regression/exp_initialisation.R b/experiments/regression/exp_initialisation.R new file mode 100644 index 0000000..4aa7577 --- /dev/null +++ b/experiments/regression/exp_initialisation.R @@ -0,0 +1,191 @@ +## -------------------------------------------------- +## Perform experiment for selecting the best initialisation scheme +## -------------------------------------------------- +## +## Usage: +## +## Rscript --vanilla experiments/regression/exp_initialisation.R index +## +## Parameters: +## +## index : Specify the job index (1-170) +## +## Notes: +## This is designed to be run in parallel on a cluster. +## The number of jobs should be a multiple of 17. +## Run it again without an index to compile a plot. +## +## Running Manually: +## Rscript --vanilla experiments/regression/exp_initialisation.R 1:17 +## Rscript --vanilla experiments/regression/exp_initialisation.R +## +## -------------------------------------------------- + + +## -------------------------------------------------- +## Libraries +## -------------------------------------------------- +suppressMessages(library(ggplot2)) +suppressMessages(source("experiments/regression/data.R")) +suppressMessages(source("experiments/regression/utils.R")) + +## -------------------------------------------------- +## Calculation +## -------------------------------------------------- +initialisation_calc <- function(dataset) { + methods <- c( + "lasso" = slise_initialisation_lasso, + "ols" = slise_initialisation_ols, + "zeros" = slise_initialisation_zeros, + "cand 1" = slise_initialisation_candidates, + "cand 2" = slise_initialisation_candidates2 + ) + do.call(rbind, lapply(seq_along(methods), function(b) { + time <- system.time( + slise <- slise.fit( + dataset$X, + dataset$Y, + dataset$epsilon_sm, + dataset$lambda_sm, + initialisation = methods[[b]] + ) + )[3] + data.frame( + dataset = dataset$name, + initialisation = names(methods)[[b]], + loss = slise$loss, + subset = mean(slise$subset), + time = time, + stringsAsFactors = TRUE + ) + })) +} + + +## -------------------------------------------------- +## Plotting +## -------------------------------------------------- +plot_loss <- function(df) { + ggplot(df) + + theme_bw() + + theme_paper() + + geom_boxplot(aes(initialisation, loss)) + + facet_wrap(vars(dataset), ncol = 4, scales = "free") + + scale_y_continuous(labels = function(s) sprintf("%.2f", s)) + + xlab("Initialisation Methods") + + ylab("Loss") +} + +plot_time <- function(df) { + ggplot(df) + + theme_bw() + + theme_paper() + + geom_boxplot(aes(initialisation, time)) + + facet_wrap(vars(dataset), ncol = 4, scales = "free") + + xlab("Initialisation
Methods") + + ylab("Time") +} + +table_initialisations <- function(df, narrow = FALSE, qs = 0.1) { + lower <- paste0("$\\qth{", round(qs * 100), "}$") + upper <- paste0("$\\qth{", round((1 - qs) * 100), "}$") + + if (narrow) { + cat("\\begin{tabular}{l c rrr c rrr}\n") + cat(" \\hline \\toprule\n") + cat(" \\textbf{Dataset} && \\multicolumn{3}{c}{\\textbf{Loss}} && \\multicolumn{3}{c}{\\textbf{Time [s]}} \\\\\n") + cat(" Initialisation &&", lower, "& Median &", upper, "&&", lower, "& Median &", upper, "\\\\\n") + } else { + cat("\\begin{tabular}{ll c rrr c rrr}\n") + cat(" \\hline \\toprule\n") + cat(" \\textbf{Dataset} & \\textbf{Initialisation} && \\multicolumn{3}{c}{\\textbf{Loss}} && \\multicolumn{3}{c}{\\textbf{Time [s]}} \\\\\n") + cat(" &&&", lower, "& Median &", upper, "&&", lower, "& Median &", upper, "\\\\\n") + } + start <- if (narrow) " " else " &" + for (d in unique(df$dataset)) { + df2 <- df %>% filter(dataset == d) + cat(" \\midrule", if (narrow) paste0("\\textbf{", latex_name(d), "} \\\\") else latex_name(d), "\n") + for (m in unique(df2$initialisation)) { + df3 <- df2 %>% filter(initialisation == m) + cat(start, sprintf( + "%12s && %6.2f & %6.2f & %6.2f && %6.2f & %6.2f & %6.2f \\\\\n", paste0("\\", m), + quantile(df3$loss, qs), median(df3$loss), quantile(df3$loss, 1 - qs), + quantile(df3$time, qs), median(df3$time), quantile(df3$time, 1 - qs) + )) + } + } + cat(" \\bottomrule\n\\end{tabular}\n") +} + +table_initialisations2 <- function(df, qs = 0.05) { + lower <- paste0("$\\qth{", round(qs * 100), "}$") + upper <- paste0("$\\qth{", round((1 - qs) * 100), "}$") + + cat("\\begin{tabular}{ll c rrr c r}\n") + cat(" \\hline \\toprule\n") + cat(" \\textbf{Dataset} & \\textbf{Initialisation} &&\n \\multicolumn{3}{c}{\\textbf{Loss}} && \\textbf{Time [s]} \\\\\n") + cat(" &&&", lower, "& Median &", upper, "&&", "Median \\\\\n") + for (d in unique(df$dataset)) { + df2 <- df %>% filter(dataset == d) + cat(" \\midrule", latex_name(d), "\n") + for (m in unique(df2$initialisation)) { + df3 <- df2 %>% filter(initialisation == m) + cat(" &", sprintf( + "%12s && %6.2f & %6.2f & %6.2f && %6.2f \\\\\n", paste0("\\", m), + quantile(df3$loss, qs), median(df3$loss), quantile(df3$loss, 1 - qs), + median(df3$time) + )) + } + } + cat(" \\bottomrule\n\\end{tabular}\n") +} + +table_times <- function(df) { + datasets <- levels(df$dataset) + methods <- levels(df$initialisation) + + cat("\\begin{tabular}{l ", rep("r", length(methods)), "}\n", sep = "") + cat(" \\hline \\toprule\n") + cat(" \\textbf{Dataset} &\n \\multicolumn{", length(methods), "}{c}{\\textbf{Median Time [s]}} \\\\\n", sep = "") + cat( + " ", + sprintf("& \\%s", methods), + "\\\\\n", + " \\midrule\n" + ) + for (d in datasets) { + df2 <- df %>% + filter(dataset == d) %>% + group_by(initialisation) %>% + summarise(time = median(time)) + cat(" ", latex_name(d), sprintf("& %.1f", df2$time), "\\\\\n") + } + cat(" \\bottomrule\n\\end{tabular}\n") +} + + +## -------------------------------------------------- +## If run from Rscript +## -------------------------------------------------- +if (sys.nframe() == 0L) { + args <- commandArgs(trailingOnly = TRUE) + + if (length(args) > 0) { + for (index in eval(parse(text = args[1]))) { + data <- data_at_index(index, seed_start = 342, normalise = TRUE) + cat(sprintf("[initialisation] Init: %2d %s\n", index, data$name)) + + df <- initialisation_calc(data) + + save_results(df, "initialisation", index) + cat(sprintf("[initialisation] Done: %2d %s\n", index, data$name)) + } + } else { + df <- 
load_results("initialisation") + df <- df[df$initialisation %in% c("lasso", "ols", "zeros", "cand 1"), ] + df$initialisation <- factor(df$initialisation, labels = c("lasso", "ols", "zeros", "candidates")) + plot_pdf(plot_loss(df), "initialisation_loss", 1.0, 0.4) + cat("\n\n") + table_times(df) + } +} diff --git a/experiments/regression/exp_initialisation2.R b/experiments/regression/exp_initialisation2.R new file mode 100644 index 0000000..db387ed --- /dev/null +++ b/experiments/regression/exp_initialisation2.R @@ -0,0 +1,68 @@ +## -------------------------------------------------- +## Peform experiment for showing failure modes for some initialisation schemes +## -------------------------------------------------- +## +## Usage: +## +## Rscript --vanilla experiments/dami2021/exp_initialisation2.R +## +## -------------------------------------------------- + +suppressMessages(library(ggplot2)) +suppressMessages(source("experiments/regression/utils.R")) + +set.seed(42) + +# Clean data + +X <- seq(0, 10, length.out = 25) # + rnorm(25, 0, 0.1) +Y <- 3.5 - X * 0.7 + +# Main outliers + +X <- c(X, seq(8, 10, length.out = 5)) +Y <- c(Y, rep(0, 5)) + +# OLS outliers + +X <- c(X, seq(8, 10, length.out = 3), seq(0, 2, length.out = 3)) +Y <- c(Y, seq(3, 4, length.out = 3), seq(-4, -3, length.out = 3)) + +# Models + +linear <- lm(Y ~ X) +slise <- slise.fit(X, Y, 0.2, 0.01) +slise_ols <- slise.fit(X, Y, 0.2, 0.01, initialisation = slise_initialisation_ols) +slise_lasso <- slise.fit(X, Y, 0.2, 0.01, initialisation = slise_initialisation_lasso) +slise_zero <- slise.fit(X, Y, 0.2, 0.01, initialisation = slise_initialisation_zeros) + +# Plot +names <- c( + "OLS", + "SLISE from LASSO", + "SLISE from OLS", + "SLISE from ZEROS", + "SLISE from CANDIDATES" +) +res <- as.data.frame(rbind( + coef(linear), + coef(slise_lasso), + coef(slise_ols), + coef(slise_zero), + coef(slise) +)) %>% mutate(method = factor(names, names)) +gg <- ggplot() + + theme_bw() + + theme_paper() + + geom_point(aes(X, Y), size = 3, shape = 1) + + geom_abline( + aes(intercept = res[[1]], slope = res[[2]], col = res$method, linetype = res$method), + size = 1 + ) + + scale_color_manual(values = c("#1b9e77", "#8877bb", "#8877bb", "#8877bb", "#ed9411")) + + scale_linetype_manual(values = c("dotted", "dashed", "dashed", "dashed", "solid")) + + coord_fixed() + +# Output + +plot_pdf(gg, "initialisation2", 0.6, 0.3) \ No newline at end of file diff --git a/experiments/regression/exp_intro.R b/experiments/regression/exp_intro.R new file mode 100644 index 0000000..aa55631 --- /dev/null +++ b/experiments/regression/exp_intro.R @@ -0,0 +1,49 @@ +## -------------------------------------------------- +## Experiment for use in the introduction +## +## +## Usage: +## +## Rscript --vanilla experiments/regression/exp_intro.R +## +## -------------------------------------------------- + +library(ggplot2) +source("experiments/regression/utils.R") + +## -------------------------------------------------- +## If run from Rscript +## -------------------------------------------------- +# if (sys.nframe() == 0L) { + set.seed(42) + + n <- 50 + corridor <- 0.6 + + x <- seq(-1, 1, length.out = n) + y <- -x + rnorm(n, 0, 0.15) + x <- c(x, rep(seq(1.6, 1.8, 0.1), 2)) + y <- c(y, rep(c(1.8, 1.95), each = 3)) + + a <- lm(y ~ x)$coefficients + b <- slise.fit(x, y, epsilon = corridor)$coefficients + names <- factor(c("OLS", "SLISE")) + + px <- c(-10, 10, 10, -10) + py <- c(px * b[2] + b[1] + c(-1, -1, 1, 1) * corridor) + + gg <- ggplot() + theme_bw() + xlab("x") + ylab("y") + 
+ coord_cartesian(xlim = c(min(x), max(x)), ylim = c(min(y), max(y))) + + theme_paper() + theme( + axis.text = element_blank(), + axis.ticks = element_blank(), + axis.title.y = element_text(angle = 0, vjust = 0.5) + ) + + geom_polygon(aes(x = px, y = py), fill = "#998ec333") + + geom_point(aes(x = x, y = y), size = 2) + + geom_abline(aes(intercept = c(a[1], b[1]), slope = c(a[2], b[2]), color = names, linetype = names), size = 1) + + scale_color_manual(values = c("#1b9e77", "#ed9411")) + + scale_linetype_manual(values = c("dashed", "solid")) + + plot_pdf(gg, "rr_example", 0.45, 0.3) +# } diff --git a/experiments/regression/exp_iterations.R b/experiments/regression/exp_iterations.R new file mode 100644 index 0000000..854bfa3 --- /dev/null +++ b/experiments/regression/exp_iterations.R @@ -0,0 +1,187 @@ +## -------------------------------------------------- +## Perform experiment for selecting the best number of iterations +## -------------------------------------------------- +## +## Usage: +## +## Rscript --vanilla experiments/regression/exp_iterations.R index +## +## Parameters: +## +## index : Specify the job index (1-170) +## +## Notes: +## This is designed to be run in parallel on a cluster. +## The number of jobs should be a multiple of 17. +## Run it again without an index to compile a plot. +## +## Running Manually: +## Rscript --vanilla experiments/regression/exp_iterations.R 1:17 +## Rscript --vanilla experiments/regression/exp_iterations.R +## +## -------------------------------------------------- + + +## -------------------------------------------------- +## Libraries +## -------------------------------------------------- +suppressMessages(require(ggplot2)) +suppressMessages(require(tidyr)) +suppressMessages(require(lemon)) +suppressMessages(source("experiments/regression/data.R")) +suppressMessages(source("experiments/regression/utils.R")) + + +## -------------------------------------------------- +## Calculation +## -------------------------------------------------- +calc_iterations <- function(data, + iters = c(100, 200, 250, 300, 400), + approxes = c(1.05, 1.075, 1.10, 1.15, 1.20)) { + grid <- expand.grid(iters = iters, approx = approxes) + do.call(rbind, mapply(function(b, a) { + time <- system.time( + slise <- slise.fit( + data$X, + data$Y, + data$epsilon_sm, + data$lambda_sm, + max_iterations = b, + max_approx = a + ) + )[3] + data.frame( + dataset = data$name, + iterations = b, + approx = a, + time = time, + loss = slise$value, + subset = mean(slise$subset), + stringsAsFactors = TRUE + ) + }, grid$iters, grid$approx, SIMPLIFY = FALSE)) +} + + +## -------------------------------------------------- +## Plotting +## -------------------------------------------------- +plot_iterations <- function(df) { + df <- df %>% + mutate(approx = as.factor(sprintf("%.2f", approx))) %>% + gather(m, value, c("loss", "time")) %>% + mutate(m = factor(m, labels = c("Loss", "Time"))) %>% + group_by(dataset, m) %>% + mutate(value = value / abs(median(value))) %>% + group_by(dataset, m, iterations, approx) %>% + summarize(value = median(value)) + levels(df$approx) <- sapply(levels(df$approx), function(v) as.expression(bquote(r[max] == .(v)))) + ggplot(df, aes(iterations, value, col = dataset, linetype = dataset)) + + theme_bw() + + theme_paper() + + geom_line() + + facet_grid(rows = vars(m), cols = vars(approx), scales = "free", labeller = label_parsed) + + scale_linetype_discrete(name = NULL) + + scale_color_brewer(type = "qual", palette = "Dark2", name = NULL) + + xlab(expression(Iterations ~ ~
(i[max]))) + + ylab("Normalised Value") + + geom_hline(aes(yintercept = -Inf), size = 0.75) + + coord_cartesian(clip = "off") +} + +plot_dataset <- function(df) { + df <- df %>% + mutate(approx = as.factor(sprintf("%.2f", approx))) %>% + group_by(dataset, iterations, approx) %>% + summarize(Time = mean(time), Loss = mean(loss)) %>% + gather(m, value, c("Loss", "Time")) + gg <- ggplot(df, aes(iterations, value, col = approx, linetype = approx, shape = approx)) + + theme_bw() + + theme_paper() + + geom_line(size = 1) + + geom_point(size = 3) + + facet_wrap(vars(dataset, m), scales = "free", dir = "v") + + scale_linetype_discrete(name = "K") + + scale_color_brewer(name = "K", type = "qual", palette = "Dark2") + + scale_shape_manual(name = "K", values = c(1, 2, 0, 3, 4)) + + # scale_y_continuous(labels = function(x) sprintf("%.2f", x)) + + xlab(expression(Iterations ~ ~ (i[max]))) + + ylab(NULL) + + theme( + strip.text = element_text(margin = margin(0, 0, 0.1, 0, "cm")), + legend.title = element_text(), + legend.key.width = grid::unit(2, "lines") + ) + reposition_legend(gg, "center", panel = c("panel-3-4", "panel-4-4")) +} + +plot_dataset2 <- function(df) { + df <- df %>% + mutate(iterations = as.factor(paste(iterations))) %>% + group_by(dataset, iterations, approx) %>% + summarize(Time = mean(time), Loss = mean(loss)) %>% + gather(m, value, c("Loss", "Time")) + ggplot(df, aes(approx, value, col = iterations, linetype = iterations, shape = iterations)) + + theme_bw() + + theme_paper() + + geom_line() + + geom_point() + + facet_wrap(vars(dataset, m), scales = "free") + + scale_linetype_discrete(name = expression(i[max])) + + scale_color_brewer(name = expression(i[max]), type = "qual", palette = "Dark2") + + scale_shape_manual(name = expression(i[max]), values = c(1, 2, 0, 3, 4)) + + scale_y_continuous(labels = function(x) sprintf("%.2f", x)) + + xlab("Approximation Ratio, K") + + ylab(NULL) + + theme( + strip.text = element_text(margin = margin(0, 0, 0.1, 0, "cm")), + legend.title = element_text() + ) +} + +plot_scatter <- function(df) { + df <- df %>% + mutate( + approx = as.factor(sprintf("%.2f", approx)), + iterations = as.factor(paste(iterations)) + ) %>% + group_by(dataset, iterations, approx) %>% + summarize(Time = mean(time), Loss = mean(loss)) + ggplot(df, aes(Time, Loss, shape = iterations, col = approx)) + + theme_bw() + + theme_paper() + + geom_point() + + facet_wrap(vars(dataset), nrow = 2, scales = "free") + + scale_shape_manual( + name = expression(i[max]), + values = c(1, 2, 0, 3, 4) + ) + + scale_color_brewer(name = "K", palette = "Dark2") + + scale_y_continuous(labels = function(x) sprintf("%.2f", x)) + + theme(legend.title = element_text()) +} + + +## -------------------------------------------------- +## If run from Rscript +## -------------------------------------------------- +if (sys.nframe() == 0L) { + args <- commandArgs(trailingOnly = TRUE) + + if (length(args) > 0) { + for (index in eval(parse(text = args[1]))) { + data <- data_at_index(index, seed_start = 442, normalise = TRUE) + cat(sprintf("[iterations] Init: %2d %s\n", index, data$name)) + + df <- calc_iterations(data) + + save_results(df, "iterations", index) + cat(sprintf("[iterations] Done: %2d %s\n", index, data$name)) + } + } else { + df <- load_results("iterations") + # plot_pdf(plot_iterations(df), "iterations", 1.0, 0.5) + plot_pdf(plot_dataset(df), "iterations2", 1.0, 0.8) + # plot_pdf(plot_scatter(df), "iterations3", 1.0, 0.4) + } +} \ No newline at end of file diff --git 
a/experiments/regression/exp_no_outliers.R b/experiments/regression/exp_no_outliers.R new file mode 100644 index 0000000..e020939 --- /dev/null +++ b/experiments/regression/exp_no_outliers.R @@ -0,0 +1,198 @@ +## -------------------------------------------------- +## Experiment showing how well the algorithms work on clean data +## +## +## Usage: +## +## Rscript --vanilla experiments/regression/exp_no_outliers.R [index] +## +## Notes: +## This is designed to be run in parallel on a cluster. +## The number of jobs should be a multiple of 17*(7+1). +## Run it again without an index to compile a plot. +## +## Running locally: +## +## Rscript --vanilla experiments/regression/exp_no_outliers.R 1:136 +## Rscript --vanilla experiments/regression/exp_no_outliers.R +## +## -------------------------------------------------- + + +## -------------------------------------------------- +## Libraries +## -------------------------------------------------- +source("experiments/regression/data.R") +source("experiments/regression/regression.R") +source("experiments/regression/utils.R") + + +## -------------------------------------------------- +## Experiment Calculations +## -------------------------------------------------- +calculate <- function(method, + data, + k = 10, + timelimit = 1000) { + folds <- rep(1:k, length.out = nrow(data$X)) + folds <- sample(folds) + + df <- data.frame() + for (i in 1:k) { + rr <- with_timelimit( + method$train, + timelimit, + sprintf("%s %s", data$name, method$name), + data$X[folds != i, ], + data$Y[folds != i], + data$epsilon_md, + data$lambda_md + ) + if (is.rr(rr) || is.rrr(rr)) { + res <- predict(rr, data$X[folds == i, ]) - data$Y[folds == i] + df <- rbind( + df, + data.frame( + method = method$name, + dataset = data$name, + mae = mean(abs(res)), + mse = mean(res^2), + stringsAsFactors = TRUE + ) + ) + } else { + break() + } + } + df +} + + +## -------------------------------------------------- +## Plotting +## -------------------------------------------------- +plot_mae <- function(df) { + suppressPackageStartupMessages(library(ggplot2)) + methods <- get_regression_methods(add_mean = TRUE) + df <- df %>% + filter(dataset != "Synthetic") %>% + group_by(dataset, method) %>% + summarize(mae = mean(mae)) + df$dataset <- factor(df$dataset, data_name_factor()) + df$method <- factor(df$method, get_nested(methods, "name")) + ggplot(df) + + geom_point(aes(1, 0), alpha = 0) + + theme_bw() + + theme_paper() + + facet_wrap(vars(dataset), nrow = 2, scales = "free") + + scale_color_manual(values = get_nested(methods, "color"), name = NULL) + + scale_shape_manual(values = get_nested(methods, "shape"), name = NULL) + + xlab(NULL) + + ylab("Mean Absolute Error") + + ggrepel::geom_text_repel( + aes(1.05, mae, label = method), + force = 0.2, + box.padding = 0.15, + segment.alpha = 0.3, + direction = "y", + hjust = 0, + nudge_x = 0.15 + ) + + geom_point(aes(1, mae, col = method, shape = method), size = 3) + + theme( + legend.position = "none", + axis.ticks.x = element_blank(), + axis.text.x = element_blank(), + panel.grid.major.x = element_blank(), + panel.grid.minor.x = element_blank() + ) + + scale_x_continuous(limits = c(0.9, 2)) +} + +plot_mae2 <- function(df) { + suppressPackageStartupMessages(library(ggplot2)) + methods <- get_regression_methods(add_mean = TRUE) + df <- df %>% + filter(dataset != "Synthetic") %>% + group_by(dataset, method) %>% + summarize(mae = mean(mae)) + df2 <- df %>% + filter(method == "Mean") %>% + transmute(ref = mae, dataset = dataset) + df <- df %>% 
inner_join(df2, "dataset") %>% + mutate(mae = mae / ref) + df$dataset <- factor(df$dataset, data_name_factor()) + df$method <- factor(df$method, get_nested(methods, "name")) + ggplot(df, aes(dataset, mae, group = method, col = method, shape = method, linetype = method)) + + theme_bw() + + theme_paper() + + scale_color_manual(values = get_nested(methods, "color"), name = NULL) + + scale_shape_manual(values = get_nested(methods, "shape"), name = NULL) + + scale_linetype_manual(values = get_nested(methods, "linetype"), name = NULL) + + xlab(NULL) + + ylab("Relative Mean Absolute Error") + + geom_point(size = 3) + + geom_line() + + theme(legend.key.width = grid::unit(2, "lines")) +} + +print_mae <- function(df) { + methods <- get_regression_methods(add_mean = TRUE) + df$method <- factor(df$method, get_nested(methods, "name"), get_nested(methods, "latex")) + levels(df$dataset) <- sapply(levels(df$dataset), latex_name) + df <- df %>% + filter(dataset != "Synthetic") %>% + group_by(dataset, method) %>% + summarize(mae = mean(mae)) %>% + tidyr::pivot_wider(names_from = method, values_from = mae) + tab <- print( + xtable::xtable( + df, + align = c("l", "l", rep("r", ncol(df) - 1)), + label = "tab:exp:clean", + caption = "TODO" + ), + sanitize.text.function = identity, + sanitize.colnames.function = identity, + include.rownames = FALSE, + comment = FALSE, + booktabs = TRUE, + print.results = FALSE, + caption.placement = "top", + table.placement = NULL + ) + tab <- stringr::str_replace(tab, "centering", "centering\n\\\\rowcolors{2}{white}{gray!25}") + tab <- stringr::str_replace(tab, "dataset", "") + cat(tab) +} + +## -------------------------------------------------- +## If run from Rscript +## -------------------------------------------------- +if (sys.nframe() == 0L) { + args <- commandArgs(trailingOnly = TRUE) + + if (length(args) > 0) { + if (length(args) > 1) { + reticulate::use_python(args[2]) + } + for (index in eval(parse(text = args[1]))) { + methods <- get_regression_methods(add_mean = TRUE) + method_index <- (index - 1) %% length(methods) + 1 + data_index <- floor((index - 1) / length(methods)) + 1 + method <- methods[[method_index]] + data <- data_at_index(data_index, seed_start = 842, normalise = TRUE) + cat(sprintf("[no outliers] Init: %2d %s %s\n", index, method$name, data$name)) + + df <- calculate(method, data) + save_results(df, "no_outliers", data_index, method$tag) + cat(sprintf("[no outliers] Done: %2d %s %s\n", index, method$name, data$name)) + } + } else { + df <- load_results("no_outliers") + print_mae(df) + plot_pdf(plot_mae(df), "no_outliers", 0.7, 0.5) + plot_pdf(plot_mae2(df), "no_outliers2", 0.7, 0.4) + } +} \ No newline at end of file diff --git a/experiments/regression/exp_optimality.R b/experiments/regression/exp_optimality.R new file mode 100644 index 0000000..a10150e --- /dev/null +++ b/experiments/regression/exp_optimality.R @@ -0,0 +1,127 @@ +## -------------------------------------------------- +## This experiment shows that the SLISE algorithm is optimal for solving the problem. +## -------------------------------------------------- +## +## Usage: +## +## Rscript --vanilla experiments/regression/exp_optimality.R [index] +## +## Notes: +## This is designed to be run in parallel on a cluster. +## The number of jobs should be a multiple of 17. +## Run it again without an index to compile a plot.
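+## +## Evaluation sketch (a reading of optimality_calculate below; the value +## epsilon_sm = 0.1 is hypothetical, only for illustration): every +## method's fitted model is scored with the sharp SLISE loss over a sweep +## of epsilons around the reference value, +##   epsilons <- seq(0.5, 2.5, 0.25) * data$epsilon_sm +##   # e.g. 0.050, 0.075, ..., 0.250 when epsilon_sm = 0.1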
+## +## Running locally: +## +## Rscript --vanilla experiments/regression/exp_optimality.R 1:17 +## Rscript --vanilla experiments/regression/exp_optimality.R +## +## -------------------------------------------------- + + +## -------------------------------------------------- +## Libraries +## -------------------------------------------------- +suppressPackageStartupMessages({ + require(ggplot2) + require(lemon) +}) +source("experiments/regression/data.R") +source("experiments/regression/regression.R") +source("experiments/regression/utils.R") + + +## -------------------------------------------------- +## Calculation +## -------------------------------------------------- +optimality_calculate <- function(data, + points = seq(0.5, 2.5, 0.25), + timelimit = 6000) { + methods <- get_regression_methods() + names <- get_nested(methods, "name") + models <- lapply(methods, function(m) { + with_timelimit( + m$train, + timelimit, + sprintf("[optimality] %s %s", data$name, m$name), + data$X, + data$Y, + data$epsilon_sm, + data$lambda_sm + ) + }) + epsilons <- points * data$epsilon_sm + do.call(rbind, lapply(which(sapply(models, is.rr)), function(i) { + mod <- models[[i]] + res2 <- (predict(mod, data$X) - data$Y)^2 + losses <- sapply(epsilons, function(eps) { + loss_sharp_res(mod$coef, res2, eps^2) + }) + data.frame( + dataset = data$name, + loss = losses, + epsilon = epsilons, + ref_eps = data$epsilon_sm, + method = names[[i]], + stringsAsFactors = TRUE + ) + })) +} + + +## -------------------------------------------------- +## Plotting +## -------------------------------------------------- +plot_optimality <- function(df) { + methods <- get_regression_methods() + df <- df %>% + group_by(method, dataset, epsilon, ref_eps) %>% + summarise(loss = mean(loss)) %>% + group_by(dataset, epsilon) %>% + mutate(loss = loss / median(abs(loss))) + df$dataset <- factor(df$dataset, data_name_factor()) + df$method <- factor(df$method, get_nested(methods, "name")) + gg <- ggplot(df) + + theme_bw() + + theme_paper() + + geom_vline(aes(xintercept = ref_eps), col = "black") + + geom_line(aes(epsilon, loss, col = method, linetype = method, size = method)) + + geom_point(aes(epsilon, loss, col = method, shape = method), size = 3) + + scale_linetype_manual(values = get_nested(methods, "linetype"), name = NULL) + + scale_color_manual(values = get_nested(methods, "color"), name = NULL) + + scale_shape_manual(values = get_nested(methods, "shape"), name = NULL) + + scale_size_manual(values = get_nested(methods, "size"), name = NULL) + + facet_wrap(vars(dataset), nrow = 2, scales = "free") + + xlab(expression("Loss function" ~ epsilon)) + + ylab("Relative Loss") + + geom_label(aes(ref_eps, Inf, label = "epsilon"), parse = TRUE, vjust = 0.8, hjust = 0.5) + + coord_cartesian(clip = "off") + + theme(legend.key.width = grid::unit(2, "lines")) + reposition_legend(gg, "center", panel = "panel-4-2") +} + + +## -------------------------------------------------- +## If run from Rscript +## -------------------------------------------------- +if (sys.nframe() == 0L) { + args <- commandArgs(trailingOnly = TRUE) + + if (length(args) > 0) { + if (length(args) > 1) { + reticulate::use_python(args[2]) + } + for (index in eval(parse(text = args[1]))) { + data <- data_at_index(index, seed_start = 542, normalise = TRUE) + cat("[optimality] Init:", index, "\n") + + df <- optimality_calculate(data) + + save_results(df, "optimality", index) + cat("[optimality] Done:", index, "\n") + } + } else { + df <- load_results("optimality") + 
plot_pdf(plot_optimality(df), "optimality", 1.0, 0.6) + } +} \ No newline at end of file diff --git a/experiments/regression/exp_pca_threshold.R b/experiments/regression/exp_pca_threshold.R new file mode 100644 index 0000000..3577222 --- /dev/null +++ b/experiments/regression/exp_pca_threshold.R @@ -0,0 +1,49 @@ +## -------------------------------------------------- +## Experiment for selecting the threshold for using PCA +## +## +## Usage: +## +## Rscript --vanilla experiments/regression/exp_pca_threshold.R +## +## -------------------------------------------------- + +## -------------------------------------------------- +## Libraries +## -------------------------------------------------- +suppressMessages(library(ggplot2)) +suppressMessages(library(dplyr)) +suppressMessages(source("experiments/regression/utils.R")) + + +## -------------------------------------------------- +## Plotting +## -------------------------------------------------- +plot_pca_threshold <- function(o = seq(0.3, 0.75, 0.01), d = 7:12, u = 500) { + df <- expand.grid(o = o, d = d, u = u) %>% + mutate(p = 1 - (1 - (1 - o)^d)^u, d = sprintf("%2d", d)) + df2 <- df %>% filter(o == 0.51) # %>% mutate(l = sprintf("%2d: %.2f", d, p))# d = %d p = %.2f", d, p)) + ggplot(df) + + geom_vline(xintercept = 0.5, color = "grey", size = 1) + + geom_line(aes(o, p, linetype = d, col = d), size = 1) + + labs( + x = "Fraction of outliers", + y = "Probability of at least one\ncandidate with no outliers", + linetype = "Dimensions", + color = "Dimensions" + # , title = "Probability of finding at least one subset without outliers") + + ) + + geom_text(data = df2, aes(o, p, label = d), hjust = 0.5, nudge_x = 0.02, vjust = 1) + + scale_color_brewer(type = "qual", palette = "Dark2") + + theme_bw() + + theme_paper() + + theme(legend.title = element_text()) +} + + +## -------------------------------------------------- +## If run from Rscript +## -------------------------------------------------- +if (sys.nframe() == 0L) { + plot_pdf(plot_pca_threshold(), "pca_threshold", 0.5, 0.3) +} diff --git a/experiments/regression/exp_robustness.R b/experiments/regression/exp_robustness.R new file mode 100644 index 0000000..6105207 --- /dev/null +++ b/experiments/regression/exp_robustness.R @@ -0,0 +1,233 @@ +## -------------------------------------------------- +## Experiment showing the robustness of different algorithms +## +## +## Usage: +## +## Rscript --vanilla experiments/regression/exp_robustness.R [index] +## +## Notes: +## This is designed to be run in parallel on a cluster. +## The number of jobs should be a multiple of 17. +## Run it again without an index to compile a plot.
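+## +## Contamination sketch (a reading of robustness_calculate below; the +## exact corruption is implemented by outlier_data() in data.R, so the +## parenthesised interpretations are assumptions): each noise fraction n +## in seq(0.0, 0.8, 0.1) is applied in two ways, +##   outlier_data(data, n, 0)   # "Vertical Outliers" (corrupted responses) +##   outlier_data(data, 0, n)   # "Leverage Points" (corrupted covariates) +## and the clean fits at n = 0 are kept as references for the "diff" metric.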
+## +## Running locally: +## +## Rscript --vanilla experiments/regression/exp_robustness.R 1:17 +## Rscript --vanilla experiments/regression/exp_robustness.R +## +## -------------------------------------------------- + +suppressPackageStartupMessages({ + library(ggplot2) + require(lemon) +}) +source("experiments/regression/data.R") +source("experiments/regression/regression.R") +source("experiments/regression/utils.R") + + +robustness_calculate <- function(data, + noise = seq(0.0, 0.8, 0.1), + timelimit = 4000) { + methods <- get_regression_methods() + noise_fn <- list( + list(fn = function(noise) outlier_data(data, noise, 0), name = "Vertical Outliers"), + list(fn = function(noise) outlier_data(data, 0, noise), name = "Leverage Points") + ) + names <- get_nested(methods, "name") + df <- data.frame() + mask <- rep(TRUE, length(methods)) + for (n in noise) { + for (f in noise_fn) { + data2 <- f$fn(n) + models <- mapply(function(m, s) { + if (s) { + with_timelimit( + m$train, + timelimit, + sprintf("%s %s %.1f %s", data$name, m$name, n, f$name), + data2$X, + data2$Y, + data$epsilon_sm, + data$lambda_sm + ) + } else { + # Skip methods that take too long + NA + } + }, methods, mask, SIMPLIFY = FALSE) + cat(sprintf("%15s %.1f %s\n", data$name, n, f$name)) + flush.console() + if (n == 0) { + cleans <- models + } + mask <- sapply(models, is.rr) + if (any(mask)) { + diff <- mapply(function(clean, model) mean(abs(clean$coef - model$coef)), cleans[mask], models[mask]) + mae <- sapply(models[mask], function(m) mean(abs(predict(m, data$X) - data$Y))) + mse <- sapply(models[mask], function(m) mean((predict(m, data$X) - data$Y)^2)) + slise <- sapply(models[mask], function(m) loss_sharp_res(m$coef, (predict(m, data$X) - data$Y)^2, data$epsilon_sm^2)) + df <- rbind(df, data.frame( + dataset = data$name, + noise = n, + diff = diff, + mae = mae, + mse = mse, + slise = slise, + method = names[mask], + type = f$name, + stringsAsFactors = TRUE + )) + } + } + } + df$method <- factor(df$method, names, ordered = TRUE) + df +} + +plot_robustness <- function(df, metric = "mae") { + methods <- get_regression_methods() + df$value <- df[metric][[1]] + df <- df %>% + group_by(noise, method, type) %>% + summarise(value = mean(value)) + metric <- switch(metric, + mse = "Mean Squared Error", + mae = "Mean Absolute Error", + diff = "Mean Coefficient Change", + metric + ) + gg <- ggplot(df) + + geom_line(aes(noise, value, linetype = method, col = method, size = method)) + + geom_point(aes(noise, value, shape = method, col = method), size = 3) + + facet_wrap(vars(type), nrow = 1, scales = "free") + + scale_linetype_manual(values = get_nested(methods, "linetype"), name = NULL) + + scale_color_manual(values = get_nested(methods, "color"), name = NULL) + + scale_shape_manual(values = get_nested(methods, "shape"), name = NULL) + + scale_size_manual(values = get_nested(methods, "size"), name = NULL) + + # scale_y_continuous(limits = c(0, max(df$value))) + + xlab("Fraction of outliers in the datasets") + + ylab(metric) + + theme_bw() + + theme_paper() + + theme(legend.key.width = grid::unit(2, "lines")) + gg +} + +plot_robustness2 <- function(df, metric = "mae") { + methods <- get_regression_methods() + df$value <- df[metric][[1]] + df <- df %>% + group_by(noise, method, type, dataset) %>% + summarise(value = mean(value)) + df$dataset <- factor(df$dataset, data_name_factor()) + df$method <- factor(df$method, get_nested(methods, "name")) + metric <- switch(metric, + mse = "Mean Squared Error", + mae = "Mean Absolute Error", + diff 
= "Mean Coefficient Change", + metric + ) + gg <- ggplot(df) + + geom_line(aes(noise, value, linetype = method, col = method, size = method)) + + geom_point(aes(noise, value, shape = method, col = method)) + + facet_wrap(vars(dataset, type), ncol = 4, scales = "free") + + scale_linetype_manual(values = get_nested(methods, "linetype"), name = NULL) + + scale_color_manual(values = get_nested(methods, "color"), name = NULL) + + scale_shape_manual(values = get_nested(methods, "shape"), name = NULL) + + scale_size_manual(values = get_nested(methods, "size"), name = NULL) + + xlab("Fraction of outliers in the datasets") + + ylab(metric) + + theme_bw() + + theme_paper() + gg +} + +plot_vertical <- function(df, metric = "mae") { + methods <- get_regression_methods() + df$value <- df[metric][[1]] + df <- df %>% + filter(type == "Vertical Outliers") %>% + group_by(noise, method, dataset) %>% + summarise(value = mean(value)) + df$dataset <- factor(df$dataset, data_name_factor()) + df$method <- factor(df$method, get_nested(methods, "name")) + metric <- switch(metric, + mse = "Mean Squared Error", + mae = "Mean Absolute Error", + diff = "Mean Coefficient Change", + metric + ) + gg <- ggplot(df) + + geom_line(aes(noise, value, linetype = method, col = method, size = method)) + + geom_point(aes(noise, value, shape = method, col = method)) + + facet_wrap(vars(dataset), nrow = 2, scales = "free") + + scale_linetype_manual(values = get_nested(methods, "linetype"), name = NULL) + + scale_color_manual(values = get_nested(methods, "color"), name = NULL) + + scale_shape_manual(values = get_nested(methods, "shape"), name = NULL) + + scale_size_manual(values = get_nested(methods, "size"), name = NULL) + + xlab("Fraction of vertical outliers in the datasets") + + ylab(metric) + + theme_bw() + + theme_paper() + reposition_legend(gg, "center", panel = "panel-4-2") +} + +plot_leverage <- function(df, metric = "mae") { + methods <- get_regression_methods() + df$value <- df[metric][[1]] + df <- df %>% + filter(type == "Leverage Points") %>% + group_by(noise, method, dataset) %>% + summarise(value = mean(value)) + df$dataset <- factor(df$dataset, data_name_factor()) + df$method <- factor(df$method, get_nested(methods, "name")) + metric <- switch(metric, + mse = "Mean Squared Error", + mae = "Mean Absolute Error", + diff = "Mean Coefficient Change", + metric + ) + gg <- ggplot(df) + + geom_line(aes(noise, value, linetype = method, col = method, size = method)) + + geom_point(aes(noise, value, shape = method, col = method)) + + facet_wrap(vars(dataset), nrow = 2, scales = "free") + + scale_linetype_manual(values = get_nested(methods, "linetype"), name = NULL) + + scale_color_manual(values = get_nested(methods, "color"), name = NULL) + + scale_shape_manual(values = get_nested(methods, "shape"), name = NULL) + + scale_size_manual(values = get_nested(methods, "size"), name = NULL) + + xlab("Fraction of vertical outliers in the datasets") + + ylab(metric) + + theme_bw() + + theme_paper() + gg +} + +## -------------------------------------------------- +## If run from Rscript +## -------------------------------------------------- +if (sys.nframe() == 0L) { + args <- commandArgs(trailingOnly = TRUE) + + if (length(args) > 0) { + if (length(args) > 1) { + reticulate::use_python(args[2]) + } + for (index in eval(parse(text = args[1]))) { + data <- data_at_index(index, seed_start = 642, normalise = TRUE) + cat("[robustness] Init:", index, "\n") + + df <- robustness_calculate(data) + + save_results(df, "robustness", index) + 
cat("[robustness] Done:", index, "\n") + } + } else { + df <- load_results("robustness") + # plot_pdf(plot_robustness(df, "mae"), "robustness", 1.0, 0.4) + # plot_pdf(plot_robustness2(df, "mae"), "robustness2", 1.0, 0.8) + plot_pdf(plot_vertical(df, "mae"), "vertical_outliers", 1.0, 0.7) + plot_pdf(plot_leverage(df, "mae"), "leverage_points", 1.0, 0.7) + } +} \ No newline at end of file diff --git a/experiments/regression/exp_robustness2.R b/experiments/regression/exp_robustness2.R new file mode 100644 index 0000000..3df2643 --- /dev/null +++ b/experiments/regression/exp_robustness2.R @@ -0,0 +1,79 @@ +## -------------------------------------------------- +## Experiment showing the robustness of different algorithms +## +## +## Usage: +## +## Rscript --vanilla experiments/regression/exp_robustness2.R [index] +## +## Notes: +## This is designed to be run in parallel on a cluster. +## Run it again without an index to compile a plot. +## +## Running locally: +## +## Rscript --vanilla experiments/regression/exp_robustness2.R 1:10 +## Rscript --vanilla experiments/regression/exp_robustness2.R +## +## -------------------------------------------------- + +suppressMessages(source("experiments/regression/exp_robustness.R")) + + +plot_vertical <- function(df, metric = "mae") { + methods <- get_regression_methods() + df$value <- df[metric][[1]] + df <- df %>% + filter(type == "Vertical Outliers") %>% + group_by(noise, method, dataset) %>% + summarise(value = mean(value)) + metric <- switch(metric, + mse = "Mean Squared Error", + mae = "Mean Absolute Error", + diff = "Mean Coefficient Change", + metric + ) + gg <- ggplot(df) + + geom_line(aes(noise, value, linetype = method, col = method, size = method)) + + geom_point(aes(noise, value, shape = method, col = method)) + + facet_wrap(vars(dataset), nrow = 2, scales = "free") + + scale_linetype_manual(values = get_nested(methods, "linetype"), name = NULL) + + scale_color_manual(values = get_nested(methods, "color"), name = NULL) + + scale_shape_manual(values = get_nested(methods, "shape"), name = NULL) + + scale_size_manual(values = get_nested(methods, "size"), name = NULL) + + xlab("Fraction of vertical outliers in the datasets") + + ylab(metric) + + theme_bw() + + theme_paper() + gg +} + + +## -------------------------------------------------- +## If run from Rscript +## -------------------------------------------------- +if (sys.nframe() == 0L) { + args <- commandArgs(trailingOnly = TRUE) + + if (length(args) > 0) { + if (length(args) > 1) { + reticulate::use_python(args[2]) + } + for (index in eval(parse(text = args[1]))) { + set.seed(42 + index) + + n <- 10 * as.integer(100 * sqrt(index)) + d <- as.integer(20 * sqrt(index)) + data <- robust_data(data_create(n, d, extra_models = 0)) + cat("[robust_synth] Init:", index, "\n") + + df <- robustness_calculate(data) + + save_results(df, "robust_synth", index) + cat("[robust_synth] Done:", index, "\n") + } + } else { + df <- load_results("robust_synth") + plot_pdf(plot_robustness(df, "mae"), "robustness_synth", 1.0, 0.5) + } +} \ No newline at end of file diff --git a/experiments/regression/exp_scalability.R b/experiments/regression/exp_scalability.R new file mode 100644 index 0000000..ca518f7 --- /dev/null +++ b/experiments/regression/exp_scalability.R @@ -0,0 +1,161 @@ +## -------------------------------------------------- +## Peform experiment for Robust Regression Scalability +## -------------------------------------------------- +## +## Usage: +## +## Rscript --vanilla 
experiments/regression/exp_scalability.R [index] +## +## Parameters: +## +## index : Specify a certain set to run (integer 1 - 70). +## +## Notes: +## This is designed to be run in parallel on a cluster +## since some of the configurations WILL exceed the time +## limit without honoring interrupts. If these are run +## manually (without a cluster) then you might need to +## cancel some executions (fastLTS etc.) after a couple +## of hours (all valid results are saved). +## Run it again without an index to compile a plot. +## The number of jobs should be a multiple of 7. +## +## Running Manually: +## Rscript --vanilla experiments/regression/exp_scalability.R 1:7 +## Rscript --vanilla experiments/regression/exp_scalability.R +## -------------------------------------------------- + + +## -------------------------------------------------- +## Libraries +## -------------------------------------------------- +suppressPackageStartupMessages(library(ggplot2)) +source("experiments/regression/data.R") +source("experiments/regression/regression.R") +source("experiments/regression/utils.R") + + +## -------------------------------------------------- +## Experiment Calculations +## -------------------------------------------------- +calculate <- function(method, + data_index, + timelimit = 1000, + ns = c(110, 500, 1000, 5000, 10000, 50000, 100000), + ds = c(10, 50, 100, 225, 500, 1000, 2250, 5000)) { + # Warmup (i.e. make sure any JIT has been executed) + tmp <- robust_data(data_create(30, 5)) + method$train(tmp$X, tmp$Y, tmp$epsilon_sm, tmp$lambda_sm) + + # Setup configs + config <- expand.grid(n = ns, d = ds) %>% + filter(n >= d & (d == 100 | n == 10000)) %>% + mutate(ord = d * n + d) %>% + arrange(ord) + df <- data.frame() + lns <- c() + lds <- c() + + # Run the configs + for (i in 1:nrow(config)) { + n <- config$n[[i]] + d <- config$d[[i]] + # Check if the limit has been reached previously + if (any(n >= lns & d >= lds)) { + next + } + set.seed(n * 1000 + d + data_index + 42) + data <- robust_data(data_create(n, d)) + time <- system.time(rr <- with_timelimit( + method$train, + timelimit * 5, + sprintf("%d x %d %s", n, d, method$name), + data$X, + data$Y, + data$epsilon_sm, + data$lambda_sm + ), gcFirst = TRUE) + if (is.rr(rr)) { + df <- rbind(df, data.frame( + method = method$name, + n = n, + d = d, + time = time[[3]], + eff = (time[[1]] + time[[2]]) / time[[3]], + stringsAsFactors = TRUE + )) + save_results(df, "scalability", data_index, method$tag) + } + if (time[[3]] > timelimit) { + lns <- c(lns, n) + lds <- c(lds, d) + } + } +} + + +## -------------------------------------------------- +## Plotting +## -------------------------------------------------- +plot_time <- function(df) { + methods <- get_regression_methods() + df <- df %>% + group_by(n, d, method) %>% + summarize(time = mean(time)) %>% + mutate(method = factor(method, get_nested(methods, "name"))) + labels <- c("Number of Items", "Number of Dimensions") + labels <- factor(labels, labels, ordered = TRUE) + df <- rbind( + df %>% filter(d == 100) %>% mutate(variant = labels[1], param = n), + df %>% filter(n == 10000, d < 5000, d > 10) %>% mutate(variant = labels[2], param = d) + ) # %>% filter(time < 1500 | method == "Sparse LTS") + ggplot(df) + + theme_bw() + + theme_paper() + + geom_line(aes(param, time, col = method, linetype = method, size = method)) + + geom_point(aes(param, time, col = method, shape = method), size = 3) + + facet_wrap(vars(variant), scales = "free") + + scale_linetype_manual(values = get_nested(methods, "linetype"), 
name = NULL) + + scale_color_manual(values = get_nested(methods, "color"), name = NULL) + + scale_shape_manual(values = get_nested(methods, "shape"), name = NULL) + + scale_size_manual(values = get_nested(methods, "size"), name = NULL) + + xlab(NULL) + + ylab("Time [s]") + + scale_x_log10(labels = function(x) format(x, big.mark = " ", scientific = FALSE)) + + # breaks = c(10, 50, 100, 500, 1000, 10000, 100000), + # labels = c("10", "50", "100", "500 ", " 1000", "10 000", "100 000")) + + scale_y_log10(breaks = c(0.01, 0.1, 1, 10, 100, 1000), labels = paste) + + # geom_rect(aes(xmin = 1080, xmax = 2800, ymin = 1080, ymax = 2000), fill = "white") + + # geom_rect(aes(xmin = 320, xmax = 900, ymin = 1100, ymax = 2000), fill = "white") + + geom_hline(yintercept = 1000, linetype = 2, size = 1, col = "black") + + coord_trans(ylim = c(0.02, 800)) + + theme(legend.key.width = grid::unit(2, "lines")) +} + + +## -------------------------------------------------- +## If run from Rscript +## -------------------------------------------------- +if (sys.nframe() == 0L) { + args <- commandArgs(trailingOnly = TRUE) + + if (length(args) > 0) { + if (length(args) > 1) { + reticulate::use_python(args[2]) + } + for (index in eval(parse(text = args[1]))) { + set.seed(42 + index) + + methods <- get_regression_methods() + method <- methods[[(index - 1) %% length(methods) + 1]] + data_index <- as.integer((index - 1) / length(methods)) + cat(sprintf("[scalability] Init: %2d %s\n", index, method$name)) + + calculate(method, data_index) + cat(sprintf("[scalability] Done: %2d %s\n", index, method$name)) + } + } else { + df <- load_results("scalability") + plot_pdf(plot_time(df), "scalability", 1.0, 0.4) + } +} \ No newline at end of file diff --git a/experiments/regression/exp_scalability2.R b/experiments/regression/exp_scalability2.R new file mode 100644 index 0000000..23bfccb --- /dev/null +++ b/experiments/regression/exp_scalability2.R @@ -0,0 +1,200 @@ +## -------------------------------------------------- +## Perform experiment for Robust Regression Scalability +## -------------------------------------------------- +## +## Usage: +## +## Rscript --vanilla experiments/regression/exp_scalability2.R [index] +## +## Parameters: +## +## index : Specify a certain set to run (integer 1 - 238). +## +## Notes: +## This is designed to be run in parallel on a cluster. +## Run it again without an index to compile a plot. +## The number of jobs should be a multiple of 17 * 7. +## +## Running Manually: +## Rscript --vanilla experiments/regression/exp_scalability2.R 1:119 +## Rscript --vanilla experiments/regression/exp_scalability2.R +## -------------------------------------------------- + + +## -------------------------------------------------- +## Libraries +## -------------------------------------------------- +suppressPackageStartupMessages({ + library(ggplot2) + require(lemon) + require(ggrepel) +}) +source("experiments/regression/data.R") +source("experiments/regression/regression.R") +source("experiments/regression/utils.R") + + +## -------------------------------------------------- +## Experiment Calculations +## -------------------------------------------------- +calculate <- function(method, + data, + timelimit = 15000) { + # Warmup (i.e. 
make sure any JIT has been executed) + tmp <- robust_data(data_create(30, 5)) + method$train(tmp$X, tmp$Y, tmp$epsilon_sm, tmp$lambda_sm) + + # Time on real data + time <- system.time(rr <- with_timelimit( + method$train, + timelimit, + sprintf("%s %s", data$name, method$name), + data$X, + data$Y, + data$epsilon_sm, + data$lambda_sm + ), gcFirst = TRUE) + if (is.rr(rr)) { + data.frame( + method = method$name, + dataset = data$name, + n = nrow(data$X), + d = ncol(data$X), + time = time[[3]], + eff = (time[[1]] + time[[2]]) / time[[3]], + stringsAsFactors = TRUE + ) + } else { + data.frame() + } +} + + +## -------------------------------------------------- +## Plotting +## -------------------------------------------------- +plot_time <- function(df) { + methods <- get_regression_methods() + df <- df %>% + group_by(dataset, method) %>% + summarize(time = mean(time)) + df$dataset <- factor(df$dataset, data_name_factor()) + df$method <- factor(df$method, get_nested(methods, "name")) + gg <- ggplot(df) + + theme_bw() + + theme_paper() + + facet_wrap(vars(dataset), nrow = 2, scales = "free") + + scale_color_manual(values = get_nested(methods, "color"), name = NULL) + + scale_shape_manual(values = get_nested(methods, "shape"), name = NULL) + + xlab(NULL) + + ylab("Time [s]") + + geom_text_repel( + aes(1.05, time, label = method), + force = 0.2, + box.padding = 0.2, + segment.alpha = 0.3, + direction = "y", + hjust = 0, + nudge_x = 0.1 + ) + + geom_point(aes(1, time, col = method, shape = method), size = 3) + + theme( + axis.ticks.x = element_blank(), + axis.text.x = element_blank(), + panel.grid.major.x = element_blank(), + panel.grid.minor.x = element_blank() + ) + + scale_x_continuous(limits = c(0.95, 2)) + reposition_legend(gg, "center", panel = "panel-4-2") +} + +plot_time2 <- function(df) { + methods <- get_regression_methods() + df <- df %>% + filter(dataset != "Synthetic") %>% + group_by(dataset, method) %>% + summarize(time = mean(time)) + df$dataset <- factor(df$dataset, data_name_factor()) + df$method <- factor(df$method, get_nested(methods, "name")) + ggplot(df) + + geom_point(aes(1, 0), alpha = 0) + + theme_bw() + + theme_paper() + + facet_wrap(vars(dataset), nrow = 2, scales = "free") + + scale_color_manual(values = get_nested(methods, "color"), name = NULL) + + scale_shape_manual(values = get_nested(methods, "shape"), name = NULL) + + xlab(NULL) + + ylab("Time [s]") + + geom_text_repel( + aes(1.05, time, label = method), + force = 0.2, + box.padding = 0.15, + segment.alpha = 0.3, + direction = "y", + hjust = 0, + nudge_x = 0.15 + ) + + geom_point(aes(1, time, col = method, shape = method), size = 3) + + theme( + legend.position = "none", + axis.ticks.x = element_blank(), + axis.text.x = element_blank(), + panel.grid.major.x = element_blank(), + panel.grid.minor.x = element_blank() + ) + + scale_x_continuous(limits = c(0.9, 2)) +} + +table_time <- function(df) { + methods <- get_regression_methods() + df <- df %>% + filter(dataset != "Synthetic") %>% + group_by(dataset, method) %>% + summarize(time = mean(time)) + datasets <- levels(df$dataset) + + cat("\\begin{tabular}{l ", rep("r", length(methods)), "}\n", sep = "") + cat(" \\hline \\toprule\n") + cat(" \\textbf{Dataset} &\n \\multicolumn{", length(methods), "}{c}{\\textbf{Time [s]}} \\\\\n", sep = "") + cat( + " ", + sprintf("& %s", get_nested(methods, "latex")), + "\\\\\n", + " \\midrule\n" + ) + for (d in datasets) { + df2 <- df %>% filter(dataset == d) + cat(" ", latex_name(d), sprintf("& %.1f", df2$time), "\\\\\n") + } + cat(" 
\\bottomrule\n\\end{tabular}\n") +} + + +## -------------------------------------------------- +## If run from Rscript +## -------------------------------------------------- +if (sys.nframe() == 0L) { + args <- commandArgs(trailingOnly = TRUE) + + if (length(args) > 0) { + if (length(args) > 1) { + reticulate::use_python(args[2]) + } + for (index in eval(parse(text = args[1]))) { + methods <- get_regression_methods() + method_index <- (index - 1) %% length(methods) + 1 + data_index <- floor((index - 1) / length(methods)) + 1 + method <- methods[[method_index]] + data <- data_at_index(data_index, seed_start = 742, normalise = TRUE) + cat(sprintf("[scalability2] Init: %2d %s %s\n", index, method$name, data$name)) + + df <- calculate(method, data) + save_results(df, "scalability2", data_index, method$tag) + cat(sprintf("[scalability2] Done: %2d %s %s\n", index, method$name, data$name)) + } + } else { + df <- load_results("scalability2") + plot_pdf(plot_time2(df), "scalability2", 0.7, 0.5) + # plot_pdf(plot_time2(df[df$n * df$d < 200 * 1000 | df$method != "SMDM" | df$time > 10, ]), "scalability2", 0.7, 0.5) + # plot_pdf(plot_time(df), "scalability2", 0.8, 0.5) + } +} \ No newline at end of file diff --git a/experiments/regression/regression.R b/experiments/regression/regression.R new file mode 100644 index 0000000..ecbe6ad --- /dev/null +++ b/experiments/regression/regression.R @@ -0,0 +1,258 @@ +suppressPackageStartupMessages({ + library(glmnet) + library(robustbase) + library(robustHD) + library(conquer) + library(reticulate) + library(MTE) + library(R.utils) # For timeout +}) + + +get_regression_methods <- function(add_random = FALSE, add_mean = FALSE) { + methods <- list( + slise = list( + name = "SLISE", + latex = "\\slise", + train = function(X, Y, epsilon, lambda = 0, ...) { + coef <- slise.fit(X, Y, epsilon, lambda, ...)$coefficients + rr.create(coef, X, Y, epsilon) + }, + shape = 20, + size = 2, + linetype = 1, + color = "#ed9411" + ), + # fastlts = list( + # name = "Fast LTS", + # latex = "\\fastlts", + # train = function(X, Y, epsilon, lambda = 0, ...) { + # mod <- ltsReg(X, Y, alpha = 0.5, intercept = TRUE, mcd = FALSE) + # rr.create(mod$coefficients, X, Y, epsilon) + # } + # ), + sparselts = list( + name = "Sparse LTS", + latex = "\\sparselts", + train = function(X, Y, epsilon, lambda = 0, ..., size = 0.5) { + coef <- sparseLTS( + X, Y, lambda / length(Y), + normalize = FALSE, intercept = TRUE, + alpha = size, ncores = getOption("mc.cores", NA) + )$coefficients + rr.create(coef, X, Y, epsilon) + }, + shape = 1, + size = 1, + linetype = 4, + color = "#8877bb" + ), + # mm = list( + # name = "MM-Estimator", + # latex = "\\mmest", + # train = function(X, Y, epsilon, lambda = 0, ...) { + # library(MASS) + # mod <- rlm(Y ~ 1 + X, init = "lts", psi = psi.bisquare, method = "MM") + # rr.create(mod$coefficients, X, Y, epsilon) + # } + # ), + smdm = list( + name = "SMDM", + latex = "\\smdm", + train = function(X, Y, epsilon, lambda = 0, ...) { + ng <- max(400, as.integer(ncol(X) * 1.5)) + g <- min(5, as.integer((nrow(X) - 1) / (ncol(X) * 1.5))) + mod <- lmrob(Y ~ 1 + X, control = lmrob.control(n.group = ng, groups = g, setting = "KS2014")) + rr.create(mod$coefficients, X, Y, epsilon) + }, + shape = 15, + size = 1, + linetype = 5, + color = "#66a61e" + ), + # mmlasso = list( + # name = "MM Lasso", + # latex = "\\mmlasso", + # train = function(X, Y, epsilon, lambda = 0, ...) 
{
+ # library(pense)
+ # res <- as.vector(pensem(X, Y, alpha=1, lambda = max(1e-8, lambda),
+ # ncores = getOption("mc.cores", 4L))$coefficients)
+ # rr.create(res, X, Y, epsilon)
+ # }
+ # ),
+ # quantreg = list(
+ # name = "Quantile Regression",
+ # latex = "\\quantreg",
+ # train = function(X, Y, epsilon, lambda = 0, ...) {
+ # library(quantreg)
+ # mod <- rq(Y ~ 1 + X, 0.5, ...)
+ # rr.create(mod$coefficients, X, Y, epsilon)
+ # }
+ # ),
+ conquer = list(
+ name = "Conquer",
+ latex = "\\conquer",
+ train = function(X, Y, epsilon, lambda = 0, ...) {
+ mod <- conquer(X, Y, 0.5, ...)
+ rr.create(mod$coeff, X, Y, epsilon)
+ },
+ shape = 2,
+ size = 1,
+ linetype = 3,
+ color = "#e7298a"
+ ),
+ # ladlasso = list(
+ # name = "LAD Lasso",
+ # latex = "\\ladlasso",
+ # train = function(X, Y, epsilon, lambda = 0, ...) {
+ # res <- LAD(Y, X, TRUE)
+ # if (lambda > 0) {
+ # res <- LADlasso(Y, X,
+ # beta.ini = res,
+ # lambda = lambda / nrow(X),
+ # adaptive = FALSE,
+ # intercept = TRUE
+ # )$beta
+ # }
+ # rr.create(res, X, Y, epsilon)
+ # },
+ # shape = 6,
+ # size = 1,
+ # linetype = 2,
+ # color = "#1b9e77"
+ # ),
+ mtelasso = list(
+ name = "MTE Lasso",
+ latex = "\\mtelasso",
+ train = function(X, Y, epsilon, lambda = 0, ...) {
+ res <- LAD(Y, X, TRUE)
+ res <- MTElasso(Y, X,
+ p = 1,
+ beta.ini = res,
+ lambda = lambda / nrow(X),
+ adaptive = FALSE,
+ intercept = TRUE
+ )$beta
+ rr.create(res, X, Y, epsilon)
+ },
+ shape = 6,
+ size = 1,
+ linetype = 2,
+ color = "#1b9e77"
+ ),
+ ransac = list(
+ name = "RANSAC",
+ latex = "\\ransac",
+ train = function(X, Y, epsilon, lambda = 0, ..., max_trials = 20000) {
+ ransac <- reticulate::import("sklearn")$linear_model$RANSACRegressor
+ model <- ransac(residual_threshold = epsilon, max_trials = max_trials)
+ model$fit(X, Y)
+ coef <- c(model$estimator_$intercept_, model$estimator_$coef_)
+ rr.create(coef, X, Y, epsilon)
+ },
+ shape = 10,
+ size = 1,
+ linetype = 6,
+ color = "#a6761d"
+ ),
+ lasso = list(
+ name = "Lasso",
+ latex = "\\lasso",
+ train = function(X, Y, epsilon, lambda = 0, ..., alpha = 1) {
+ coef <- as.vector(predict(glmnet(
+ X, Y,
+ lambda = lambda / nrow(X), alpha = alpha
+ ), type = "coefficients"))
+ rr.create(coef, X, Y, epsilon)
+ },
+ shape = 4,
+ size = 1,
+ linetype = 1,
+ color = "#e6ab02"
+ )
+ )
+ if (add_random) {
+ methods$random <- list(
+ name = "Random",
+ latex = "\\textsc{random}",
+ train = function(X, y, ...) structure(list(y = y), class = "rrr"),
+ shape = 15,
+ size = 2,
+ linetype = 1,
+ color = "#000000"
+ )
+ }
+ if (add_mean) {
+ methods$mean <- list(
+ name = "Mean",
+ latex = "\\textsc{mean}",
+ train = function(X, y, ...) structure(list(y = mean(y)), class = "rrr"),
+ shape = 15,
+ size = 2,
+ linetype = 1,
+ color = "#000000"
+ )
+ }
+ for (m in names(methods)) {
+ methods[[m]]$tag <- m
+ }
+ methods
+}
+
+get_nested <- function(list, prop) {
+ res <- c(sapply(list, `[[`, prop))
+ names(res) <- c(sapply(list, `[[`, "name"))
+ res
+}
+
+rr.create <- function(coef, X, Y, epsilon) {
+ coef[is.na(coef)] <- 0.0
+ res <- c(X %*% coef[-1] + coef[1] - Y)
+ structure(list(
+ coef = coef,
+ X = X,
+ loss_mse = mean(res^2),
+ loss_mae = mean(abs(res)),
+ loss_slise = loss_sharp_res(coef, res^2, epsilon^2)
+ ), class = "rr")
+}
+
+predict.rr <- function(object, newdata = NULL, ...) {
+ if (is.null(newdata)) {
+ newdata <- object$X
+ } else {
+ newdata <- as.matrix(newdata)
+ }
+ c(newdata %*% object$coef[-1] + object$coef[1])
+}
+
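+# An illustrative (hypothetical) use of the "rr" helpers above; note that
+# rr.create() needs the slise package to be loaded, since it calls loss_sharp_res():
+# X <- matrix(rnorm(200), 100, 2)
+# Y <- rnorm(100)
+# fit <- rr.create(c(0, 1, -1), X, Y, epsilon = 0.1)
+# head(predict.rr(fit, X)) # predictions from the stored coefficients
+
+with_timelimit <- function(fn, timelimit, desc, ...) {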
+ tryCatch(
+ expr = withTimeout(fn(...), timeout = timelimit),
+ TimeoutException = function(ex) {
+ cat("\n", "Timeout in:", desc, "\n")
+ NULL
+ },
+ error = function(e) {
+ cat("\n", "Error in:", desc, "\n")
+ print(e)
+ NULL
+ }
+ )
+}
+
+is.rr <- function(object) {
+ "rr" %in% class(object)
+}
+
+is.rrr <- function(object) {
+ "rrr" %in% class(object)
+}
+
+predict.rrr <- function(object, newdata = NULL, ...) {
+ if (is.null(newdata)) {
+ sample(object$y, replace = TRUE)
+ } else {
+ sample(object$y, nrow(newdata), replace = TRUE)
+ }
+}
\ No newline at end of file
diff --git a/experiments/regression/utils.R b/experiments/regression/utils.R
new file mode 100644
index 0000000..0a010e2
--- /dev/null
+++ b/experiments/regression/utils.R
@@ -0,0 +1,90 @@
+
+suppressPackageStartupMessages({
+ library(devtools)
+ devtools::load_all()
+ options(dplyr.summarise.inform = FALSE)
+ library(dplyr)
+})
+
+RESULTS_DIR <- "experiments/results"
+
+# Use the correct number of cores if running on a slurm-cluster
+cores_from_slurm <- as.integer(Sys.getenv("SLURM_CPUS_PER_TASK"))
+if (is.finite(cores_from_slurm) && cores_from_slurm > 1) {
+ options(mc.cores = cores_from_slurm)
+ try(setMKLthreads(cores_from_slurm), silent = TRUE)
+}
+
+theme_paper <- function() {
+ library(ggplot2)
+ theme(
+ legend.title = element_blank(),
+ strip.background = element_blank(),
+ strip.text = element_text(color = "black", size = 12),
+ panel.spacing.x = unit(0.5, "cm"),
+ panel.grid.minor = element_blank(),
+ panel.border = element_blank(),
+ axis.line = element_line(colour = "black"),
+ axis.text = element_text(size = 8),
+ axis.title = element_text(size = 11)
+ )
+}
+
+get_results_dir <- function(path = RESULTS_DIR) {
+ dir.create(file.path(path), showWarnings = FALSE, recursive = TRUE)
+ path
+}
+
+plot_pdf <- function(plot_obj, name, width, height, path = RESULTS_DIR, verbose = TRUE) {
+ path <- get_results_dir(RESULTS_DIR)
+ path <- file.path(path, paste0(name, ".pdf"))
+ cairo_pdf(path, width * 9, height * 9)
+ plot(plot_obj)
+ dev.off()
+ if (verbose) {
+ cat("Plotted to", path, "\n")
+ }
+}
+
+save_results <- function(data, name, index = -1, tag = NULL, path = RESULTS_DIR) {
+ if (index >= 0) {
+ path <- get_results_dir(file.path(RESULTS_DIR, name))
+ if (!is.null(tag)) {
+ path <- file.path(path, sprintf("%s_%s_%02d.rds", name, tag, index))
+ } else {
+ path <- file.path(path, sprintf("%s_%03d.rds", name, index))
+ }
+ } else {
+ path <- get_results_dir(path)
+ if (!is.null(tag)) {
+ path <- file.path(path, sprintf("%s_%s.rds", name, tag))
+ } else {
+ path <- file.path(path, paste0(name, ".rds"))
+ }
+ }
+ saveRDS(data, path, compress = "xz")
+}
+
+load_results <- function(name, path = RESULTS_DIR) {
+ file <- file.path(RESULTS_DIR, paste0(name, ".rds"))
+ dir <- file.path(RESULTS_DIR, name)
+ if (file.exists(file)) {
+ readRDS(file)
+ } else if (file.exists(dir)) {
+ do.call(rbind, lapply(list.files(dir, full.names = TRUE), readRDS))
+ } else {
+ stop("Could not find results")
+ }
+}
+
+inverse_interpolated <- function(x, y, target) {
+ if (target < min(y)) {
+ x[which.min(y)]
+ } else if (target > max(y)) {
+ x[which.max(y)]
+ } else {
+ diff <- abs(y - target)
+ closest <- which_min_n(diff, 2)
+ weighted.mean(x[closest], diff[rev(closest)])
+ }
+}
\ No newline at end of file
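A quick round trip through the result helpers above (an illustrative sketch; run from the repository root so that RESULTS_DIR resolves, and note that load_results() row-binds every file saved under a name):

save_results(data.frame(time = 1.5), "demo", index = 1, tag = "slise")
df <- load_results("demo") # reads experiments/results/demo/demo_slise_01.rds

diff --git a/experiments/results/ex1.jpg b/experiments/results/ex1.jpg
deleted file mode 100644
index 6bb5464..0000000
Binary files a/experiments/results/ex1.jpg and /dev/null differ
diff --git a/experiments/results/ex1.png b/experiments/results/ex1.png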
new file mode 100644
index 0000000..6f53957
Binary files /dev/null and b/experiments/results/ex1.png differ
diff --git a/experiments/results/ex2.jpg b/experiments/results/ex2.jpg
deleted file mode 100644
index ba5aaa6..0000000
Binary files a/experiments/results/ex2.jpg and /dev/null differ
diff --git a/experiments/results/ex2.png b/experiments/results/ex2.png
new file mode 100644
index 0000000..3fb4493
Binary files /dev/null and b/experiments/results/ex2.png differ
diff --git a/inst/CITATION b/inst/CITATION
index 2d9a36d..146c885 100644
--- a/inst/CITATION
+++ b/inst/CITATION
@@ -1,21 +1,46 @@
-citHeader("To cite SLISE in publications use:")
-
-citEntry(entry = "InProceedings",
- title = "Sparse Robust Regression for Explaining Classifiers",
- author = personList(
- as.person("Anton Björklund"),
- as.person("Andreas Henelius"),
- as.person("Emilia Oikarinen"),
- as.person("Kimmo Kallonen"),
- as.person("Kai Puolamäki")),
-
- journal = "Lecture Notes in Computer Science",
- booktitle = "Proceedings of the 22nd International Conference on Discovery Science",
- year = "2019",
-
- textVersion = paste(
- "Anton Björklund, Andreas Henelius, Emilia Oikarinen, Kimmo Kallonen, and Kai Puolamäki.",
- "Sparse robust regression for explaining classifiers.",
- "Proceedings of the 22nd International Conference on Discovery Science.",
- "2019.")
+citHeader("When citing SLISE in publications, cite one of the following:")
+
+citEntry(
+ entry = "InProceedings",
+ title = "Sparse Robust Regression for Explaining Classifiers",
+ author = personList(
+ person("Anton", "Björklund"),
+ person("Andreas", "Henelius"),
+ person("Emilia", "Oikarinen"),
+ person("Kimmo", "Kallonen"),
+ person("Kai", "Puolamäki")
+ ),
+ journal = "Lecture Notes in Computer Science",
+ booktitle = "Proceedings of the 22nd International Conference on Discovery Science",
+ year = "2019",
+ doi = "10.1007/978-3-030-33778-0_27",
+
+ textVersion = paste(
+ "Anton Björklund, Andreas Henelius, Emilia Oikarinen, Kimmo Kallonen, and Kai Puolamäki (2019).",
+ "Sparse robust regression for explaining classifiers.",
+ "Proceedings of the 22nd International Conference on Discovery Science,",
+ "DOI: 10.1007/978-3-030-33778-0_27"
+ )
+)
+
+citEntry(
+ entry = "Article",
+ title = "Robust Regression via Error Tolerance",
+ author = personList(
+ person("Anton", "Björklund"),
+ person("Andreas", "Henelius"),
+ person("Emilia", "Oikarinen"),
+ person("Kimmo", "Kallonen"),
+ person("Kai", "Puolamäki")
+ ),
+ journal = "Data Mining and Knowledge Discovery",
+ year = "2022",
+ doi = "10.1007/s10618-022-00819-2",
+
+ textVersion = paste(
+ "Anton Björklund, Andreas Henelius, Emilia Oikarinen, Kimmo Kallonen, and Kai Puolamäki (2022).",
+ "Robust Regression via Error Tolerance.",
+ "Data Mining and Knowledge Discovery,",
+ "DOI: 10.1007/s10618-022-00819-2"
+ )
)
\ No newline at end of file
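Both entries are surfaced through R's standard citation mechanism, e.g.:

citation("slise") # prints the 2019 Discovery Science and the 2022 DAMI references

diff --git a/man/create_slise.Rd b/man/create_slise.Rd
deleted file mode 100644
index 8f9438e..0000000
--- a/man/create_slise.Rd
+++ /dev/null
@@ -1,29 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/main.R
-\name{create_slise}
-\alias{create_slise}
-\title{Create a result object for SLISE that is similar to other regression method results}
-\usage{
-create_slise(alpha, X, Y, epsilon, lambda = 0, data = NULL, ...)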
-}
-\arguments{
-\item{alpha}{linear model}
-
-\item{X}{data matrix}
-
-\item{Y}{response vector}
-
-\item{epsilon}{error tolerance}
-
-\item{lambda}{L1 weight}
-
-\item{data}{data_preprocess(X, Y)}
-
-\item{...}{other variables to add to the SLISE object}
-}
-\value{
-list(coefficients=unscale(alpha), X, Y, scaled=data, lambda, alpha, subset=[r_i convex problem)}
+\item{lambda1}{L1 coefficient (default: 0)}
-\item{beta_max}{stopping sigmoid steepness (25)}
+\item{lambda2}{L2 coefficient (default: 0)}
-\item{max_approx}{approximation ratio when selecting the next beta (1.2)}
+\item{weight}{Weight vector (default: NULL == no weights)}
-\item{max_iterations}{maximum number of OWL-QN iterations (100)}
+\item{beta_max}{Stopping sigmoid steepness (default: 20 / epsilon^2)}
-\item{debug}{should debug statement be printed each iteration (FALSE)}
+\item{max_approx}{Approximation ratio when selecting the next beta (default: 1.15)}
-\item{...}{Additional parameters to OWL-QN}
+\item{max_iterations}{Maximum number of OWL-QN iterations (default: 300)}
+
+\item{beta_min_increase}{Minimum amount to increase beta (default: beta_max * 0.0005)}
-\item{beta_min_increase}{the minimum increase of beta each iteration (beta_max * 0.0005)}
+\item{debug}{Should debug statement be printed each iteration (default: FALSE)}
-\item{beta_start_max}{Ignored}
+\item{...}{Additional parameters to OWL-QN}
}
\value{
lbfgs object with beta (max) and the number of iteration steps
diff --git a/man/logit.Rd b/man/limited_logit.Rd
similarity index 81%
rename from man/logit.Rd
rename to man/limited_logit.Rd
index e315616..43579d2 100644
--- a/man/logit.Rd
+++ b/man/limited_logit.Rd
@@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
-\name{logit}
-\alias{logit}
+\name{limited_logit}
+\alias{limited_logit}
\title{Computes the logits from probabilities}
\usage{
-logit(p, stab = 0.001)
+limited_logit(p, stab = 0.001)
}
\arguments{
\item{p}{probability (vector)}
diff --git a/man/log_approximation_ratio.Rd b/man/log_approximation_ratio.Rd
index caf8b49..6896d45 100644
--- a/man/log_approximation_ratio.Rd
+++ b/man/log_approximation_ratio.Rd
@@ -6,16 +6,18 @@
(logarithms are used for numerically stable calculations)
See Theorem 3 from the paper for more details}
\usage{
-log_approximation_ratio(residuals, epsilon, beta1, beta2)
+log_approximation_ratio(residuals2, epsilon2, beta1, beta2, weight = NULL)
}
\arguments{
-\item{residuals}{squared residuals}
+\item{residuals2}{squared residuals}
-\item{epsilon}{error tolerance}
+\item{epsilon2}{squared error tolerance}
\item{beta1}{current sigmoid steepness}
\item{beta2}{next sigmoid steepness}
+
+\item{weight}{weight vector (default: NULL)}
}
\value{
log(approximation_ratio)
diff --git a/man/log_sigmoid.Rd b/man/log_sigmoid.Rd
index 8bb1dae..b00d935 100644
--- a/man/log_sigmoid.Rd
+++ b/man/log_sigmoid.Rd
@@ -7,7 +7,7 @@
log_sigmoid(x)
}
\arguments{
-\item{x}{vector}
+\item{x}{vector of real values}
}
\value{
log(sigmoid(x))
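The stable log-sigmoid matters at extreme arguments; an illustrative comparison (approximate values, assuming the package is loaded):

log(1 / (1 + exp(800))) # -Inf: exp(800) overflows, so the naive form underflows to log(0)
log_sigmoid(-800)       # approximately -800, computed without underflow

diff --git a/man/log_sum_special.Rd b/man/log_sum_special.Rd
new file mode 100644
index 0000000..6917d5d
--- /dev/null
+++ b/man/log_sum_special.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{log_sum_special}
+\alias{log_sum_special}
+\title{Computes log(sum(exp(x) * y)),
+or log(sum(exp(x))) if all(y == 0),
+in a numerically robust way.}
+\usage{
+log_sum_special(x, y)
+}
+\arguments{
+\item{x}{vector of length n}
+
+\item{y}{multiplier}
+}
+\value{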
+log(sum(exp(x))). +} +\description{ +Computes log(sum(exp(x) * y)), +or log(sum(exp(x))) if all(y == 0), +in a numerically robust way. +} diff --git a/man/loss_sharp.Rd b/man/loss_sharp.Rd index 62a0fe8..b0f0ddf 100644 --- a/man/loss_sharp.Rd +++ b/man/loss_sharp.Rd @@ -2,9 +2,10 @@ % Please edit documentation in R/optimisation.R \name{loss_sharp} \alias{loss_sharp} -\title{Sharp Loss Function} +\title{Sharp Loss Function +Exact loss function without gradients} \usage{ -loss_sharp(alpha, X, Y, epsilon = 0.1, lambda = 0) +loss_sharp(alpha, X, Y, epsilon, lambda1 = 0, lambda2 = 0, weight = NULL) } \arguments{ \item{alpha}{The vector to calculate loss for} @@ -13,13 +14,18 @@ loss_sharp(alpha, X, Y, epsilon = 0.1, lambda = 0) \item{Y}{The response vector} -\item{epsilon}{(Optional) The acceptable error} +\item{epsilon}{The acceptable error} -\item{lambda}{(Optional) The sparsity reguraliser} +\item{lambda1}{The L1 regulariser (default: 0)} + +\item{lambda2}{The L2 regulariser (default: 0)} + +\item{weight}{weight vector (default: NULL)} } \value{ The loss value } \description{ Sharp Loss Function +Exact loss function without gradients } diff --git a/man/loss_sharp_res.Rd b/man/loss_sharp_res.Rd new file mode 100644 index 0000000..5afc147 --- /dev/null +++ b/man/loss_sharp_res.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/optimisation.R +\name{loss_sharp_res} +\alias{loss_sharp_res} +\title{Sharp Loss Function +Exact loss function without gradients for when the residuals are already calculated} +\usage{ +loss_sharp_res( + alpha, + residuals2, + epsilon2, + lambda1 = 0, + lambda2 = 0, + weight = NULL +) +} +\arguments{ +\item{alpha}{The vector to calculate loss for} + +\item{residuals2}{The squared error vector: (X \%*\% alpha - Y)^2} + +\item{epsilon2}{The squared acceptable error} + +\item{lambda1}{The L1 regulariser (default: 0)} + +\item{lambda2}{The L2 regulariser (default: 0)} + +\item{weight}{weight vector (default: NULL)} +} +\value{ +The loss value +} +\description{ +Sharp Loss Function +Exact loss function without gradients for when the residuals are already calculated +} diff --git a/man/loss_smooth.Rd b/man/loss_smooth.Rd index 1eb2752..d33de62 100644 --- a/man/loss_smooth.Rd +++ b/man/loss_smooth.Rd @@ -2,9 +2,19 @@ % Please edit documentation in R/optimisation.R \name{loss_smooth} \alias{loss_smooth} -\title{Smooth Loss} +\title{Smooth Loss +A loss function for when you want gradients} \usage{ -loss_smooth(alpha, X, Y, epsilon = 0.1, lambda = 0, beta = 3) +loss_smooth( + alpha, + X, + Y, + epsilon, + beta, + lambda1 = 0, + lambda2 = 0, + weight = NULL +) } \arguments{ \item{alpha}{The vector to calculate loss for} @@ -13,15 +23,20 @@ loss_smooth(alpha, X, Y, epsilon = 0.1, lambda = 0, beta = 3) \item{Y}{The response vector} -\item{epsilon}{(Optional) The acceptable error} +\item{epsilon}{The acceptable error} -\item{lambda}{(Optional) The sparsity reguraliser} +\item{beta}{The steepness of the sigmoid} -\item{beta}{(Optional) The steepness of the sigmoid (default: 3)} +\item{lambda1}{The L1 regulariser (default: 0)} + +\item{lambda2}{The L2 regulariser (default: 0)} + +\item{weight}{weight vector (default: NULL)} } \value{ The loss value } \description{ Smooth Loss +A loss function for when you want gradients } diff --git a/man/loss_smooth_grad.Rd b/man/loss_smooth_grad.Rd index bf0d60c..f28c78d 100644 --- a/man/loss_smooth_grad.Rd +++ b/man/loss_smooth_grad.Rd @@ -2,9 +2,19 @@ % Please edit documentation in R/optimisation.R 
\name{loss_smooth_grad} \alias{loss_smooth_grad} -\title{Smooth Loss Gradient} +\title{Smooth Loss Gradient +Gradient for the smooth loss function} \usage{ -loss_smooth_grad(alpha, X, Y, epsilon = 0.1, lambda = 0, beta = 3) +loss_smooth_grad( + alpha, + X, + Y, + epsilon, + beta, + lambda1 = 0, + lambda2 = 0, + weight = NULL +) } \arguments{ \item{alpha}{The vector to calculate loss-gradient for} @@ -13,15 +23,20 @@ loss_smooth_grad(alpha, X, Y, epsilon = 0.1, lambda = 0, beta = 3) \item{Y}{The response vector} -\item{epsilon}{(Optional) The acceptable error} +\item{epsilon}{The acceptable error} -\item{lambda}{(Optional) The sparsity reguraliser} +\item{beta}{The steepness of the sigmoid} -\item{beta}{(Optional) The steepness of the sigmoid (default: 3)} +\item{lambda1}{The L1 regulariser (default: 0)} + +\item{lambda2}{The L2 regulariser (default: 0)} + +\item{weight}{weight vector (default: NULL)} } \value{ The gradients for alpha } \description{ Smooth Loss Gradient +Gradient for the smooth loss function } diff --git a/man/loss_smooth_res.Rd b/man/loss_smooth_res.Rd new file mode 100644 index 0000000..88748e5 --- /dev/null +++ b/man/loss_smooth_res.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/optimisation.R +\name{loss_smooth_res} +\alias{loss_smooth_res} +\title{Smooth Loss +A loss function for when you want gradients and the residuals are already calculated} +\usage{ +loss_smooth_res( + alpha, + residuals2, + epsilon2, + beta, + lambda1 = 0, + lambda2 = 0, + weight = NULL +) +} +\arguments{ +\item{alpha}{The vector to calculate loss for} + +\item{residuals2}{Vector of squared residuals} + +\item{epsilon2}{The squared acceptable error} + +\item{beta}{The steepness of the sigmoid} + +\item{lambda1}{The L1 regulariser (default: 0)} + +\item{lambda2}{The L2 regulariser (default: 0)} + +\item{weight}{weight vector (default: NULL)} +} +\value{ +The loss value +} +\description{ +Smooth Loss +A loss function for when you want gradients and the residuals are already calculated +} diff --git a/man/matching_epsilon.Rd b/man/matching_epsilon.Rd index 7d97f92..e6860c3 100644 --- a/man/matching_epsilon.Rd +++ b/man/matching_epsilon.Rd @@ -4,14 +4,16 @@ \alias{matching_epsilon} \title{Find the matching *epsilon} \usage{ -matching_epsilon(residuals, epsilon, beta) +matching_epsilon(residuals2, epsilon2, beta, weight = NULL) } \arguments{ -\item{residuals}{squared residuals} +\item{residuals2}{squared residuals} -\item{epsilon}{error tolerance} +\item{epsilon2}{squared error tolerance} \item{beta}{sigmoid steepness} + +\item{weight}{weight vector (default: NULL)} } \value{ *epsilon diff --git a/man/next_beta.Rd b/man/next_beta.Rd index 4ddc20a..269c6ef 100644 --- a/man/next_beta.Rd +++ b/man/next_beta.Rd @@ -4,25 +4,31 @@ \alias{next_beta} \title{Find the next beta according to: ¤ approximation_ratio(alpha, beta_old, beta_new) == max_approx - ¤ beta_new >= beta_old + min_increase} + ¤ beta_new >= beta_old + min_increase + ¤ beta_new <= beta_max} \usage{ -next_beta(alpha, X, Y, epsilon = 0.1, beta = 0, beta_max = 25, - max_approx = 1.2, beta_min_increase = beta_max * 5e-04) +next_beta( + residuals2, + epsilon2, + beta = 0, + weight = NULL, + beta_max = 20/epsilon2, + log_max_approx = log(1.15), + beta_min_increase = (beta_max + beta) * 5e-04 +) } \arguments{ -\item{alpha}{linear model} - -\item{X}{data matrix} +\item{residuals2}{squared residuals} -\item{Y}{response vector} - -\item{epsilon}{error tolerance} +\item{epsilon2}{squared error 
tolerance}
\item{beta}{current sigmoid steepness}
+\item{weight}{weight vector (default: NULL)}
+
\item{beta_max}{max sigmoid steepness}
-\item{max_approx}{approximation ratio target for increasing beta}
+\item{log_max_approx}{logarithm of the approximation ratio target for increasing beta}
\item{beta_min_increase}{minimum beta step}
}
@@ -33,4 +39,5 @@
beta_new
Find the next beta according to:
¤ approximation_ratio(alpha, beta_old, beta_new) == max_approx
¤ beta_new >= beta_old + min_increase
+ ¤ beta_new <= beta_max
}
diff --git a/man/owlqn_c.Rd b/man/owlqn_c.Rd
index e90a8af..c009d90 100644
--- a/man/owlqn_c.Rd
+++ b/man/owlqn_c.Rd
@@ -2,31 +2,28 @@
% Please edit documentation in R/optimisation.R
\name{owlqn_c}
\alias{owlqn_c}
-\title{OWL-QN for optimising loss_smooth (Cpp implementation)}
+\title{OWL-QN for optimising loss_smooth
+Cpp implementation}
\usage{
-owlqn_c(alpha, X, Y, epsilon = 0.1, lambda = 0, beta = 3,
- max_iterations = 250, ...)
+owlqn_c(alpha, dc, lambda1 = 0, max_iterations = 300, ..., invisible = TRUE)
}
\arguments{
\item{alpha}{linear model to optimise}
-\item{X}{data matrix}
+\item{dc}{DataContainer containing the data and parameters}
-\item{Y}{response vector}
+\item{lambda1}{L1 coefficient (default: 0)}
-\item{epsilon}{error tolerance}
-
-\item{lambda}{L1 coefficient}
-
-\item{beta}{sigmoid steepness}
-
-\item{max_iterations}{number of OWL-QN iterations}
+\item{max_iterations}{number of OWL-QN iterations (default: 300)}
\item{...}{other parameters to OWL-QN}
+
+\item{invisible}{no terminal output (default: TRUE)}
}
\value{
lbfgs object
}
\description{
-OWL-QN for optimising loss_smooth (Cpp implementation)
+OWL-QN for optimising loss_smooth
+Cpp implementation
}
diff --git a/man/owlqn_r.Rd b/man/owlqn_r.Rd
index e40ea62..14540e7 100644
--- a/man/owlqn_r.Rd
+++ b/man/owlqn_r.Rd
@@ -2,10 +2,22 @@
% Please edit documentation in R/optimisation.R
\name{owlqn_r}
\alias{owlqn_r}
-\title{OWL-QN for optimising loss_smooth (R implementation)}
+\title{OWL-QN for optimising loss_smooth
+R implementation}
\usage{
-owlqn_r(alpha, X, Y, epsilon = 0.1, lambda = 0, beta = 3,
- max_iterations = 250, ...)
+owlqn_r(
+ alpha,
+ X,
+ Y,
+ epsilon,
+ beta,
+ lambda1 = 0,
+ lambda2 = 0,
+ weight = NULL,
+ max_iterations = 300,
+ ...,
+ invisible = TRUE
+)
}
\arguments{
\item{alpha}{linear model to optimise}
@@ -16,17 +28,24 @@ owlqn_r(alpha, X, Y, epsilon = 0.1, lambda = 0, beta = 3,
\item{epsilon}{error tolerance}
-\item{lambda}{L1 coefficient}
-
\item{beta}{sigmoid steepness}
-\item{max_iterations}{number of OWL-QN iterations}
+\item{lambda1}{L1 coefficient (default: 0)}
+
+\item{lambda2}{L2 coefficient (default: 0)}
+
+\item{weight}{weight vector (default: NULL)}
+
+\item{max_iterations}{number of OWL-QN iterations (default: 300)}
\item{...}{other parameters to OWL-QN}
+
+\item{invisible}{no terminal output (default: TRUE)}
}
\value{
lbfgs object
}
\description{
-OWL-QN for optimising loss_smooth (R implementation)
+OWL-QN for optimising loss_smooth
+R implementation
}
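Together, next_beta and the OWL-QN optimisers drive the graduated optimisation; a minimal sketch under the documented signatures (the actual implementation is graduated_optimisation() in R/optimisation.R, and $par assumes the lbfgs return value):

beta <- 0
while (beta < beta_max) {
  alpha <- owlqn_r(alpha, X, Y, epsilon, beta, lambda1)$par
  beta <- next_beta(c(X %*% alpha - Y)^2, epsilon^2, beta, beta_max = beta_max)
}

diff --git a/man/plot.slise.Rd b/man/plot.slise.Rd
index d3603b9..68c9dec 100644
--- a/man/plot.slise.Rd
+++ b/man/plot.slise.Rd
@@ -4,30 +4,37 @@
\alias{plot.slise}
\title{Plot the robust regression or explanation from slise}
\usage{
-\method{plot}{slise}(slise, cols = 1, title = "SLISE", labels = NULL,
- other = NULL, threed = FALSE, ...)
+\method{plot}{slise}(x, type = NULL, title = NULL, ...)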
} \arguments{ -\item{slise}{The slise object} +\item{x}{The slise object} -\item{cols}{The columns in the data to plot} +\item{type}{The type of plot ("2D", "bar", "distribution", "mnist", "prediction", "wordcloud")} -\item{title}{(Optional) The title of the plot (and result names when using other)} +\item{title}{The title of the plot} -\item{labels}{(Optional) The labels for the x, y, and legend (in that order, can be partial)} - -\item{other}{list of other slise objects to include in the plot} - -\item{threed}{plot in 3D with two columns} - -\item{...}{not used} +\item{...}{ + Arguments passed on to \code{\link[=plot.slise_2d]{plot.slise_2d}}, \code{\link[=plot.slise_bar]{plot.slise_bar}}, \code{\link[=plot.slise_distribution]{plot.slise_distribution}}, \code{\link[=plot.slise_mnist]{plot.slise_mnist}}, \code{\link[=plot.slise_prediction]{plot.slise_prediction}}, \code{\link[=plot.slise_wordcloud]{plot.slise_wordcloud}} + \describe{ + \item{\code{labels}}{The axis labels (default: c("X", "Y") or c("x", "f(x)"))} + \item{\code{partial}}{Should the raw ggplot2 objects be returned instead of directly plotting (default: FALSE)} + \item{\code{size}}{The size of the plot elements (default: 2)} + \item{\code{signif}}{The number of significant digits to display (default: 3)} + \item{\code{width}}{The width of the image (width * height == ncol(X))} + \item{\code{height}}{The height of the image (width * height == ncol(X))} + \item{\code{plots}}{The number of plots to split the explanation into (default: 1)} + \item{\code{treshold}}{Treshold for ignored value (default: 1e-8)} + \item{\code{local}}{Only display the words relevant for the explained item (default: TRUE)} + }} +} +\value{ +plot or ggplot2 objects } \description{ Plot the robust regression or explanation from slise } \examples{ -data <- matrix(rnorm(200), 100, 2) -response <- rnorm(100) -slise <- slise.fit(data, response, epsilon=0.1) -plot(slise, 1:2, threed = TRUE) +X <- matrix(rnorm(30), 30, 1) +Y <- runif(30, 0, 1) +plot(slise.fit(X, Y, epsilon = 0.1)) } diff --git a/man/plot.slise_2d.Rd b/man/plot.slise_2d.Rd new file mode 100644 index 0000000..87d1554 --- /dev/null +++ b/man/plot.slise_2d.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.R +\name{plot.slise_2d} +\alias{plot.slise_2d} +\title{Plot the robust regression or explanation from slise in 2D} +\usage{ +\method{plot}{slise_2d}(slise, title, labels = NULL, partial = FALSE, size = 2, ...) +} +\arguments{ +\item{slise}{The slise object} + +\item{title}{The title of the plot} + +\item{labels}{The axis labels (default: c("X", "Y") or c("x", "f(x)"))} + +\item{partial}{Should the raw ggplot2 objects be returned instead of directly plotting (default: FALSE)} + +\item{size}{The size of the plot elements (default: 2)} + +\item{...}{Ignored parameters} +} +\value{ +ggplot object or plot +} +\description{ +Plot the robust regression or explanation from slise in 2D +} diff --git a/man/plot.slise_bar.Rd b/man/plot.slise_bar.Rd new file mode 100644 index 0000000..960ea4b --- /dev/null +++ b/man/plot.slise_bar.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.R +\name{plot.slise_bar} +\alias{plot.slise_bar} +\title{Plot the robust regression or explanation from slise as bar plots} +\usage{ +\method{plot}{slise_bar}(slise, title, labels = c("Low", "High"), partial = FALSE, size = 8, ...) 
+}
+\arguments{
+\item{slise}{The slise object}
+
+\item{title}{The title of the plot}
+
+\item{labels}{The class labels (vector with two strings: c(y_low, y_high), default: c("Low", "High"))}
+
+\item{partial}{Should the raw ggplot2 objects be returned instead of directly plotting (default: FALSE)}
+
+\item{size}{The size of the segments (default: 8)}
+
+\item{...}{Ignored parameters}
+}
+\value{
+List of ggplot objects or plot
+}
+\description{
+Plot the robust regression or explanation from slise as bar plots
+}
diff --git a/man/plot.slise_distribution.Rd b/man/plot.slise_distribution.Rd
new file mode 100644
index 0000000..fea1405
--- /dev/null
+++ b/man/plot.slise_distribution.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot.R
+\name{plot.slise_distribution}
+\alias{plot.slise_distribution}
+\title{Plot the robust regression or explanation from slise with distributions}
+\usage{
+\method{plot}{slise_distribution}(slise, title, labels = c("Low", "High"), partial = FALSE, signif = 3, ...)
+}
+\arguments{
+\item{slise}{The slise object}
+
+\item{title}{The title of the plot}
+
+\item{labels}{The class labels (vector with two strings: c(y_low, y_high), default: c("Low", "High"))}
+
+\item{partial}{Should the raw ggplot2 objects be returned instead of directly plotting (default: FALSE)}
+
+\item{signif}{The number of significant digits to display (default: 3)}
+
+\item{...}{Ignored parameters}
+}
+\value{
+List of ggplot objects or plot
+}
+\description{
+Plot the robust regression or explanation from slise with distributions
+}
diff --git a/man/plot.slise_mnist.Rd b/man/plot.slise_mnist.Rd
new file mode 100644
index 0000000..a592305
--- /dev/null
+++ b/man/plot.slise_mnist.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot.R
+\name{plot.slise_mnist}
+\alias{plot.slise_mnist}
+\title{Plot the robust regression or explanation from slise as an image}
+\usage{
+\method{plot}{slise_mnist}(
+ slise,
+ title,
+ labels = c("Low", "High"),
+ partial = FALSE,
+ width = floor(sqrt(ncol(slise$X))),
+ height = width,
+ plots = 1,
+ enhance_colours = TRUE,
+ ...
+)
+}
+\arguments{
+\item{slise}{The slise object}
+
+\item{title}{The title of the plot}
+
+\item{labels}{The class labels (vector with two strings: c(y_low, y_high), default: c("Low", "High"))}
+
+\item{partial}{Should the raw ggplot2 objects be returned instead of directly plotting (default: FALSE)}
+
+\item{width}{The width of the image (width * height == ncol(X))}
+
+\item{height}{The height of the image (width * height == ncol(X))}
+
+\item{plots}{The number of plots to split the explanation into (default: 1)}
+
+\item{enhance_colours}{Increase the saturation of the explanation (default: TRUE)}
+
+\item{...}{Ignored parameters}
+}
+\value{
+ggplot object(s) or plot
+}
+\description{
+Plot the robust regression or explanation from slise as an image
+}
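For orientation, plot.slise dispatches on its type argument to the methods documented here; e.g. (a sketch, with expl a hypothetical slise.explain result on 28x28 image data):

plot(expl, "bar", labels = c("low", "high"))
plot(expl, "mnist", width = 28, height = 28)

diff --git a/man/plot.slise_prediction.Rd b/man/plot.slise_prediction.Rd
new file mode 100644
index 0000000..7ed7089
--- /dev/null
+++ b/man/plot.slise_prediction.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot.R
+\name{plot.slise_prediction}
+\alias{plot.slise_prediction}
+\title{Plot the robust regression or explanation from slise based on predictions}
+\usage{
+\method{plot}{slise_prediction}(
+ slise,
+ title,
+ labels = c("Response", "Count"),
+ partial = FALSE,
+ approximation = TRUE,
+ signif = 3,
+ ...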
+) +} +\arguments{ +\item{slise}{The slise object} + +\item{title}{The title of the plot} + +\item{labels}{The axis labels (default: c("Response", "Count"))} + +\item{partial}{Should the raw ggplot2 objects be returned instead of directly plotting (default: FALSE)} + +\item{approximation}{Should the approximation density be added (default: TRUE)} + +\item{signif}{The number of significant digits to display (default: 3)} + +\item{...}{Ignored parameters} +} +\value{ +ggplot object or plot +} +\description{ +Plot the robust regression or explanation from slise based on predictions +} diff --git a/man/plot.slise_wordcloud.Rd b/man/plot.slise_wordcloud.Rd new file mode 100644 index 0000000..aadc729 --- /dev/null +++ b/man/plot.slise_wordcloud.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.R +\name{plot.slise_wordcloud} +\alias{plot.slise_wordcloud} +\title{Plot the robust regression or explanation from slise as a wordcloud} +\usage{ +\method{plot}{slise_wordcloud}( + slise, + title, + labels = c("Low", "High"), + treshold = 1e-08, + local = TRUE, + ... +) +} +\arguments{ +\item{slise}{The slise object} + +\item{title}{The title of the plot} + +\item{labels}{The class labels (vector with two strings: c(y_low, y_high), default: c("Low", "High"))} + +\item{treshold}{Treshold for ignored value (default: 1e-8)} + +\item{local}{Only display the words relevant for the explained item (default: TRUE)} + +\item{...}{Ignored parameters} +} +\value{ +plot +} +\description{ +Plot the robust regression or explanation from slise as a wordcloud +} diff --git a/man/predict.slise.Rd b/man/predict.slise.Rd index 3fe865d..bbb68ac 100644 --- a/man/predict.slise.Rd +++ b/man/predict.slise.Rd @@ -1,17 +1,19 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/main.R +% Please edit documentation in R/slise.R \name{predict.slise} \alias{predict.slise} \title{Predict with a SLISE} \usage{ -\method{predict}{slise}(object, newdata = NULL, ...) +\method{predict}{slise}(object, newdata = NULL, ..., logit = FALSE) } \arguments{ \item{object}{SLISE object} \item{newdata}{data matrix} -\item{...}{not used} +\item{...}{ignored additional parameters} + +\item{logit}{return the result in logit space if a logit has been applied to Y (default: FALSE)} } \value{ prediction vector @@ -19,10 +21,3 @@ prediction vector \description{ Predict with a SLISE } -\examples{ -X <- matrix(rnorm(200), 100, 2) -Y <- rnorm(100) -index <- 10 -model <- slise.explain(X, Y, index) -prediction <- predict(model, X) -} diff --git a/man/print.slise.Rd b/man/print.slise.Rd index 60a3d64..d39698a 100644 --- a/man/print.slise.Rd +++ b/man/print.slise.Rd @@ -4,21 +4,23 @@ \alias{print.slise} \title{Print the robust regression or explanation from slise} \usage{ -\method{print}{slise}(slise, ..., title = "SLISE") +\method{print}{slise}(x, num_vars = 10, ...) 
}
\arguments{
-\item{slise}{The slise object}
+\item{x}{The slise object}
-\item{...}{not used}
+\item{num_vars}{Minimum number of variables to show without filtering (default: 10)}
-\item{title}{(Optional) The title of the result}
+\item{...}{Ignored additional parameters}
+}
+\value{
+invisible(x)
}
\description{
Print the robust regression or explanation from slise
}
\examples{
-data <- matrix(rnorm(200), 100, 2)
-response <- rnorm(100)
-slise <- slise.fit(data, response, epsilon=0.1)
-print(slise)
+X <- matrix(rnorm(30), 15, 2)
+Y <- runif(15, 0, 1)
+print(slise.fit(X, Y, epsilon = 0.1))
}
diff --git a/man/scale_identity.Rd b/man/scale_identity.Rd
index 02232c4..f598768 100644
--- a/man/scale_identity.Rd
+++ b/man/scale_identity.Rd
@@ -2,7 +2,7 @@
% Please edit documentation in R/data.R
\name{scale_identity}
\alias{scale_identity}
-\title{Create a scale list without any changes}
+\title{A variant of `scale` that only adds the attributes}
\usage{
scale_identity(x)
}
@@ -10,8 +10,8 @@ scale_identity(x)
\item{x}{the vector to (not) scale}
}
\value{
-a list(scaled = x, center = 0, scale = 1)
+x (with attributes "scaled:center" and "scaled:scale")
}
\description{
-Create a scale list without any changes
+A variant of `scale` that only adds the attributes
}
diff --git a/man/scale_one_range.Rd b/man/scale_one_range.Rd
deleted file mode 100644
index 86068ef..0000000
--- a/man/scale_one_range.Rd
+++ /dev/null
@@ -1,17 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data.R
-\name{scale_one_range}
-\alias{scale_one_range}
-\title{Scale a vector to a range of approx one}
-\usage{
-scale_one_range(x)
-}
-\arguments{
-\item{x}{the vector to normalise}
-}
-\value{
-a list(scaled, center, scale)
-}
-\description{
-Scale a vector to a range of approx one
-}
diff --git a/man/scale_robust.Rd b/man/scale_robust.Rd
index fda063b..795b303 100644
--- a/man/scale_robust.Rd
+++ b/man/scale_robust.Rd
@@ -3,23 +3,21 @@
\name{scale_robust}
\alias{scale_robust}
\title{Robust Scale
-A scale that can handle zero variance}
+A variant of 'scale' that is based on median and mad (instead of mean and sd).
+It can handle zero variance without producing NaNs.}
\usage{
-scale_robust(x, center = TRUE, scale = TRUE, remove_constant = TRUE)
+scale_robust(x, th = .Machine$double.eps)
}
\arguments{
\item{x}{the vector/matrix to normalise}
-\item{center}{Should constant columns be centered (TRUE)}
-
-\item{scale}{Should constant columns be scaled (TRUE)}
-
-\item{remove_constant}{Should constant columns be removed (TRUE)}
+\item{th}{threshold for the scale being zero}
}
\value{
-a list(scaled, center, scale, mask)
+scaled_x (with attributes "scaled:center" and "scaled:scale")
}
\description{
Robust Scale
-A scale that can handle zero variance
+A variant of 'scale' that is based on median and mad (instead of mean and sd).
+It can handle zero variance without producing NaNs.
}
diff --git a/man/sigmoid.Rd b/man/sigmoid.Rd
index 611232a..305d861 100644
--- a/man/sigmoid.Rd
+++ b/man/sigmoid.Rd
@@ -7,7 +7,7 @@
sigmoid(x)
}
\arguments{
-\item{x}{vector}
+\item{x}{vector of real values}
}
\value{
sigmoid(x)
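A small illustration of the robust scaling documented above (a sketch; assumes the package is loaded):

x <- cbind(rnorm(10), rep(1, 10)) # the second column has zero variance
scale_robust(x) # centred by median, scaled by mad; the constant column stays finite

diff --git a/man/simple_pca.Rd b/man/simple_pca.Rd
new file mode 100644
index 0000000..734409b
--- /dev/null
+++ b/man/simple_pca.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\name{simple_pca}
+\alias{simple_pca}
+\title{Calculate the PCA rotation matrix
+The implementation is based on stats::prcomp.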
+Assumes the data has already been centered and scaled (if that is desired).} +\usage{ +simple_pca(X, dimensions, tolerance = 1e-10) +} +\arguments{ +\item{X}{the matrix to reduce} + +\item{dimensions}{the number of dimensions after PCA} + +\item{tolerance}{remove components with variance less than the tolerance} +} +\value{ +pca rotation matrix +} +\description{ +Calculate the PCA rotation matrix +The implementation is based on stats::prcomp. +Assumes the data has already been centered and scaled (if that is desired). +} diff --git a/man/slise-package.Rd b/man/slise-package.Rd index 6ce61c4..7018dbf 100644 --- a/man/slise-package.Rd +++ b/man/slise-package.Rd @@ -1,14 +1,12 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/slise.R +% Please edit documentation in R/cpp.R \docType{package} \name{slise-package} \alias{slise} \alias{slise-package} \title{slise: Sparse Linear Subset Explanations} \description{ -An implementation of the SLISE - algorithm for robust regression and - explaining outcomes from black box models. +An implementation of the SLISE algorithm (for robust regression and explaining outcomes from black box models). } \seealso{ Useful links: @@ -18,6 +16,12 @@ Useful links: } \author{ -\strong{Maintainer}: Anton Björklund \email{anton.bjorklund@helsinki.fi} +\strong{Maintainer}: Anton Björklund \email{anton.bjorklund@helsinki.fi} (\href{https://orcid.org/0000-0002-7749-2918}{ORCID}) + +Authors: +\itemize{ + \item Andreas Henelius (\href{https://orcid.org/0000-0002-4040-6967}{ORCID}) + \item Kai Puolamäki (\href{https://orcid.org/0000-0003-1819-1047}{ORCID}) +} } diff --git a/man/slise.explain.Rd b/man/slise.explain.Rd index 633494c..5be6ff2 100644 --- a/man/slise.explain.Rd +++ b/man/slise.explain.Rd @@ -1,52 +1,70 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/main.R +% Please edit documentation in R/slise.R \name{slise.explain} \alias{slise.explain} \title{SLISE Black Box Explainer Use SLISE for explaining predictions made by a black box.} \usage{ -slise.explain(X, Y, x, y = NULL, epsilon = 0.1, lambda = 0, ..., - scale = FALSE, logit = FALSE, scale_y = TRUE) +slise.explain( + X, + Y, + epsilon, + x, + y = NULL, + lambda1 = 0, + lambda2 = 0, + weight = NULL, + normalise = FALSE, + logit = FALSE, + initialisation = slise_initialisation_candidates, + ... +) } \arguments{ -\item{X}{matrix of independent variables} +\item{X}{Matrix of independent variables} -\item{Y}{vector of the dependent variable} +\item{Y}{Vector of the dependent variable} -\item{x}{the sample to be explained (or index if y is null)} +\item{epsilon}{Error tolerance} -\item{y}{the prediction to be explained} +\item{x}{The sample to be explained (or index if y is null)} -\item{epsilon}{error tolerance (will be scaled to represent a percentage, e.g. 
0.1 == 10\%)} +\item{y}{The prediction to be explained (default: NULL)} -\item{lambda}{sparsity reguraliser} +\item{lambda1}{L1 regularisation coefficient (default: 0)} -\item{...}{other parameters to the optimiser} +\item{lambda2}{L2 regularisation coefficient (default: 0)} -\item{scale}{Scale X by mean and standard deviation (FALSE)} +\item{weight}{Optional weight vector (default: NULL)} -\item{logit}{Should Y be logit-transformed (recommended for probabilities) (FALSE)} +\item{normalise}{Preprocess X and Y by scaling, note that epsilon is not scaled (default: FALSE)} -\item{scale_y}{Scales Y to roughly be in [-0.5, 0.5] (based on 95th and 5th quantile if not in [0, 1]) (TRUE)} +\item{logit}{Logit transform Y from probabilities to real values (default: FALSE)} -\item{max_approx}{Target approximation ratio for selecting graduated optimisation step size (1.2)} +\item{initialisation}{function that gives the initial alpha and beta, or a list containing the initial alpha and beta (default: slise_initialisation_candidates)} -\item{beta_max}{Stopping sigmoid steepness (25)} - -\item{beta_start_max}{Maximum beta-step during the initialisation (1.0)} - -\item{max_iterations}{Maximum number of OWL-QN steps per graduated optimisation step (250)} +\item{...}{ + Arguments passed on to \code{\link[=graduated_optimisation]{graduated_optimisation}}, \code{\link[=slise_initialisation_candidates]{slise_initialisation_candidates}} + \describe{ + \item{\code{beta_max}}{Stopping sigmoid steepness (default: 20 / epsilon^2)} + \item{\code{max_approx}}{Approximation ratio when selecting the next beta (default: 1.15)} + \item{\code{max_iterations}}{Maximum number of OWL-QN iterations (default: 300)} + \item{\code{debug}}{Should debug statement be printed each iteration (default: FALSE)} + \item{\code{num_init}}{the number of initial subsets to generate (default: 500)} + \item{\code{beta_max_init}}{the maximum sigmoid steepness in the initialisation} + \item{\code{pca_treshold}}{the maximum number of columns without using PCA (default: 10)} + }} } \value{ -slise object (coefficients, subset, value, X, Y, lambda, epsilon, scaled, alpha, x, y) +slise object (coefficients, subset, value, X, Y, lambda1, lambda2, epsilon, scaled, alpha, x, y) } \description{ -SLISE Black Box Explainer -Use SLISE for explaining predictions made by a black box. +It is highly recommended that you normalise the data, +either before using SLISE or by setting normalise = TRUE. } \examples{ -X <- matrix(rnorm(200), 100, 2) -Y <- rnorm(100) -index <- 10 -model <- slise.explain(X, Y, index) +X <- matrix(rnorm(32), 8, 4) +Y <- runif(8, 0, 1) +expl <- slise.explain(X, Y, 0.1, 3, lambda1 = 0.01, logit = TRUE) +plot(expl, "bar", labels = c("class 1", "class 2")) } diff --git a/man/slise.explain_comb.Rd b/man/slise.explain_comb.Rd index 0ef5c39..325a62e 100644 --- a/man/slise.explain_comb.Rd +++ b/man/slise.explain_comb.Rd @@ -1,23 +1,34 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/main.R +% Please edit documentation in R/slise.R \name{slise.explain_comb} \alias{slise.explain_comb} \title{SLISE Black Box Explainer Use SLISE for explaining predictions made by a black box. 
BUT with sparsity from a combinatorial search rather than Lasso!} \usage{ -slise.explain_comb(X, Y, x, y = NULL, ..., variables = 4) +slise.explain_comb(X, Y, epsilon, x, y = NULL, ..., variables = 4) } \arguments{ \item{X}{matrix of independent variables} \item{Y}{vector of the dependent variable} +\item{epsilon}{error tolerance} + \item{x}{the sample to be explained (or index if y is null)} \item{y}{the prediction to be explained} -\item{...}{other parameters to slise.explain} +\item{...}{ + Arguments passed on to \code{\link[=slise.explain]{slise.explain}} + \describe{ + \item{\code{lambda1}}{L1 regularisation coefficient (default: 0)} + \item{\code{lambda2}}{L2 regularisation coefficient (default: 0)} + \item{\code{weight}}{Optional weight vector (default: NULL)} + \item{\code{normalise}}{Preprocess X and Y by scaling, note that epsilon is not scaled (default: FALSE)} + \item{\code{logit}}{Logit transform Y from probabilities to real values (default: FALSE)} + \item{\code{initialisation}}{function that gives the initial alpha and beta, or a list containing the initial alpha and beta (default: slise_initialisation_candidates)} + }} \item{variables}{the number of non-zero coefficients} } @@ -29,9 +40,3 @@ SLISE Black Box Explainer Use SLISE for explaining predictions made by a black box. BUT with sparsity from a combinatorial search rather than Lasso! } -\examples{ -X <- matrix(rnorm(400), 100, 4) -Y <- rnorm(100) -index <- 10 -model <- slise.explain_comb(X, Y, index, variables = 2) -} diff --git a/man/slise.explain_find.Rd b/man/slise.explain_find.Rd index 4a2e01e..2042557 100644 --- a/man/slise.explain_find.Rd +++ b/man/slise.explain_find.Rd @@ -1,18 +1,36 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/main.R +% Please edit documentation in R/slise.R \name{slise.explain_find} \alias{slise.explain_find} \title{SLISE Black Box Explainer Use SLISE for explaining predictions made by a black box. BUT with a binary search for sparsity!} \usage{ -slise.explain_find(..., lambda = 5, variables = 4, iters = 10, - treshold = 1e-04) +slise.explain_find( + ..., + lambda1 = 5, + variables = 4, + iters = 10, + treshold = 1e-04 +) } \arguments{ -\item{...}{parameters to slise.explain} +\item{...}{ + Arguments passed on to \code{\link[=slise.explain]{slise.explain}} + \describe{ + \item{\code{X}}{Matrix of independent variables} + \item{\code{Y}}{Vector of the dependent variable} + \item{\code{epsilon}}{Error tolerance} + \item{\code{x}}{The sample to be explained (or index if y is null)} + \item{\code{y}}{The prediction to be explained (default: NULL)} + \item{\code{lambda2}}{L2 regularisation coefficient (default: 0)} + \item{\code{weight}}{Optional weight vector (default: NULL)} + \item{\code{normalise}}{Preprocess X and Y by scaling, note that epsilon is not scaled (default: FALSE)} + \item{\code{logit}}{Logit transform Y from probabilities to real values (default: FALSE)} + \item{\code{initialisation}}{function that gives the initial alpha and beta, or a list containing the initial alpha and beta (default: slise_initialisation_candidates)} + }} -\item{lambda}{the starting value of the search} +\item{lambda1}{the starting value of the search} \item{variables}{number of non-zero coefficients} @@ -28,9 +46,3 @@ SLISE Black Box Explainer Use SLISE for explaining predictions made by a black box. BUT with a binary search for sparsity! 
} -\examples{ -X <- matrix(rnorm(800), 100, 8) -Y <- rnorm(100) -index <- 10 -model <- slise.explain_find(X, Y, index, variables = 4) -} diff --git a/man/slise.fit.Rd b/man/slise.fit.Rd index a9aac25..6d03a82 100644 --- a/man/slise.fit.Rd +++ b/man/slise.fit.Rd @@ -1,50 +1,66 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/main.R +% Please edit documentation in R/slise.R \name{slise.fit} \alias{slise.fit} \title{SLISE Regression Use SLISE for robust regression.} \usage{ -slise.fit(X, Y, epsilon = 0.1, lambda = 0, ..., scale = FALSE, - logit = FALSE, intercept = TRUE, scale_y = TRUE) +slise.fit( + X, + Y, + epsilon, + lambda1 = 0, + lambda2 = 0, + weight = NULL, + intercept = TRUE, + normalise = FALSE, + initialisation = slise_initialisation_candidates, + ... +) } \arguments{ -\item{X}{matrix of independent variables} +\item{X}{Matrix of independent variables} -\item{Y}{vector of the response variable} +\item{Y}{Vector of the response variable} -\item{epsilon}{error tolerance (will be scaled to represent a percentage, e.g. 0.1 == 10\%)} +\item{epsilon}{Error tolerance} -\item{lambda}{sparsity reguraliser} +\item{lambda1}{L1 regularisation coefficient (default: 0)} -\item{...}{other parameters to the optimiser} +\item{lambda2}{L2 regularisation coefficient (default: 0)} -\item{scale}{Scale X by mean and standard deviation (FALSE)} +\item{weight}{Optional weight vector (default: NULL)} -\item{logit}{Should Y be logit-transformed (recommended for probabilities) (FALSE)} +\item{intercept}{Should an intercept be added (default: TRUE)} -\item{intercept}{Should an intercept be added (TRUE)} +\item{normalise}{Preprocess X and Y by scaling, note that epsilon is not scaled (default: FALSE)} -\item{scale_y}{Scales Y to roughly be in [-0.5, 0.5] (based on 95th and 5th quantile if not in [0, 1]) (TRUE)} +\item{initialisation}{Function that gives the initial alpha and beta, or a list containing the initial alpha and beta (default: slise_initialisation_candidates)} -\item{max_approx}{Target approximation ratio for selecting graduated optimisation step size (1.2)} - -\item{beta_max}{Stopping sigmoid steepness (25)} - -\item{beta_start_max}{Maximum beta-step during the initialisation (1.0)} - -\item{max_iterations}{Maximum number of OWL-QN steps per graduated optimisation step (250)} +\item{...}{ + Arguments passed on to \code{\link[=graduated_optimisation]{graduated_optimisation}}, \code{\link[=slise_initialisation_candidates]{slise_initialisation_candidates}} + \describe{ + \item{\code{beta_max}}{Stopping sigmoid steepness (default: 20 / epsilon^2)} + \item{\code{max_approx}}{Approximation ratio when selecting the next beta (default: 1.15)} + \item{\code{max_iterations}}{Maximum number of OWL-QN iterations (default: 300)} + \item{\code{debug}}{Should debug statement be printed each iteration (default: FALSE)} + \item{\code{num_init}}{the number of initial subsets to generate (default: 500)} + \item{\code{beta_max_init}}{the maximum sigmoid steepness in the initialisation} + \item{\code{pca_treshold}}{the maximum number of columns without using PCA (default: 10)} + }} } \value{ -slise object (coefficients, subset, value, X, Y, lambda, epsilon, scaled, alpha) +slise object (coefficients, subset, value, X, Y, lambda1, lambda2, epsilon, scaled, alpha) } \description{ -SLISE Regression -Use SLISE for robust regression. +It is highly recommended that you normalise the data, +either before using SLISE or by setting normalise = TRUE. 
}
\examples{
-X <- matrix(rnorm(200), 100, 2)
-Y <- rnorm(100)
-model <- slise.fit(X, Y)
-prediction <- predict(model, X)
+# Generate some toy data: X is the matrix of independent variables
+# and Y the response; epsilon is set to 10\% of the range of Y
+X <- matrix(rnorm(32), 8, 4)
+Y <- rnorm(8)
+model <- slise.fit(X, Y, (max(Y) - min(Y)) * 0.1)
+predicted <- predict(model, X)
}
diff --git a/man/slise.object.Rd b/man/slise.object.Rd
new file mode 100644
index 0000000..0c2da87
--- /dev/null
+++ b/man/slise.object.Rd
@@ -0,0 +1,52 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/slise.R
+\name{slise.object}
+\alias{slise.object}
+\title{Create a result object for SLISE that is similar to other regression method results}
+\usage{
+slise.object(
+ alpha,
+ X,
+ Y,
+ epsilon,
+ lambda1 = 0,
+ lambda2 = 0,
+ weight = NULL,
+ intercept = FALSE,
+ logit = FALSE,
+ x = NULL,
+ y = NULL,
+ ...
+)
+}
+\arguments{
+\item{alpha}{linear model}
+
+\item{X}{data matrix}
+
+\item{Y}{response vector}
+
+\item{epsilon}{error tolerance}
+
+\item{lambda1}{L1 regularisation coefficient (default: 0)}
+
+\item{lambda2}{L2 regularisation coefficient (default: 0)}
+
+\item{weight}{weight vector (default: NULL)}
+
+\item{intercept}{does the model have an intercept (default: FALSE)}
+
+\item{logit}{has the target been logit-transformed (default: FALSE)}
+
+\item{x}{explained item x (default: NULL)}
+
+\item{y}{explained item y (default: NULL)}
+
+\item{...}{other variables to add to the SLISE object}
+}
+\value{
+list(coefficients=unscale(alpha), X, Y, scaled=data, lambda1, lambda2, alpha, subset=[r_i<epsilon], ...)
+}
+\description{
+Create a result object for SLISE that is similar to other regression method results
+}
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
-// Generated by using Rcpp::compileAttributes() -> do not edit by hand
-// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
-
-#include "slise_types.h"
-#include <RcppArmadillo.h>
-#include <Rcpp.h>
-
-using namespace Rcpp;
-
-// sigmoidc
-arma::vec sigmoidc(const arma::vec& x);
-RcppExport SEXP _slise_sigmoidc(SEXP xSEXP) {
-BEGIN_RCPP
- Rcpp::RObject rcpp_result_gen;
- Rcpp::RNGScope rcpp_rngScope_gen;
- Rcpp::traits::input_parameter< const arma::vec& >::type x(xSEXP);
- rcpp_result_gen = Rcpp::wrap(sigmoidc(x));
- return rcpp_result_gen;
-END_RCPP
-}
-// loss_smooth_c
-double loss_smooth_c(const arma::vec& alpha, const arma::mat& data, const arma::vec& response, const double& beta, const double& epsilon, const double& lambda);
-RcppExport SEXP _slise_loss_smooth_c(SEXP alphaSEXP, SEXP dataSEXP, SEXP responseSEXP, SEXP betaSEXP, SEXP epsilonSEXP, SEXP lambdaSEXP) {
-BEGIN_RCPP
- Rcpp::RObject rcpp_result_gen;
- Rcpp::RNGScope rcpp_rngScope_gen;
- Rcpp::traits::input_parameter< const arma::vec& >::type alpha(alphaSEXP);
- Rcpp::traits::input_parameter< const arma::mat& >::type data(dataSEXP);
- Rcpp::traits::input_parameter< const arma::vec& >::type response(responseSEXP);
- Rcpp::traits::input_parameter< const double& >::type beta(betaSEXP);
- Rcpp::traits::input_parameter< const double& >::type epsilon(epsilonSEXP);
- Rcpp::traits::input_parameter< const double& >::type lambda(lambdaSEXP);
- rcpp_result_gen = Rcpp::wrap(loss_smooth_c(alpha, data, response, beta, epsilon, lambda));
- return rcpp_result_gen;
-END_RCPP
-}
-// loss_smooth_c_dc
-Rcpp::NumericVector loss_smooth_c_dc(const SEXP xs, const SEXP dcptr);
-RcppExport SEXP _slise_loss_smooth_c_dc(SEXP xsSEXP, SEXP dcptrSEXP) {
-BEGIN_RCPP
- Rcpp::RObject rcpp_result_gen;
- Rcpp::RNGScope rcpp_rngScope_gen;
- Rcpp::traits::input_parameter< const SEXP >::type xs(xsSEXP);
- Rcpp::traits::input_parameter< const SEXP >::type dcptr(dcptrSEXP);
- rcpp_result_gen = 
Rcpp::wrap(loss_smooth_c_dc(xs, dcptr)); - return rcpp_result_gen; -END_RCPP -} -// loss_smooth_grad_c -Rcpp::NumericVector loss_smooth_grad_c(const arma::vec& alpha, const arma::mat& data, const arma::vec& response, const double& beta, const double& epsilon, const double& lambda); -RcppExport SEXP _slise_loss_smooth_grad_c(SEXP alphaSEXP, SEXP dataSEXP, SEXP responseSEXP, SEXP betaSEXP, SEXP epsilonSEXP, SEXP lambdaSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const arma::vec& >::type alpha(alphaSEXP); - Rcpp::traits::input_parameter< const arma::mat& >::type data(dataSEXP); - Rcpp::traits::input_parameter< const arma::vec& >::type response(responseSEXP); - Rcpp::traits::input_parameter< const double& >::type beta(betaSEXP); - Rcpp::traits::input_parameter< const double& >::type epsilon(epsilonSEXP); - Rcpp::traits::input_parameter< const double& >::type lambda(lambdaSEXP); - rcpp_result_gen = Rcpp::wrap(loss_smooth_grad_c(alpha, data, response, beta, epsilon, lambda)); - return rcpp_result_gen; -END_RCPP -} -// loss_smooth_grad_c_dc -Rcpp::NumericVector loss_smooth_grad_c_dc(const SEXP xs, const SEXP dcptr); -RcppExport SEXP _slise_loss_smooth_grad_c_dc(SEXP xsSEXP, SEXP dcptrSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const SEXP >::type xs(xsSEXP); - Rcpp::traits::input_parameter< const SEXP >::type dcptr(dcptrSEXP); - rcpp_result_gen = Rcpp::wrap(loss_smooth_grad_c_dc(xs, dcptr)); - return rcpp_result_gen; -END_RCPP -} -// lg_combined_smooth_c_dc -Rcpp::NumericVector lg_combined_smooth_c_dc(SEXP xs, SEXP dcptr); -RcppExport SEXP _slise_lg_combined_smooth_c_dc(SEXP xsSEXP, SEXP dcptrSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type xs(xsSEXP); - Rcpp::traits::input_parameter< SEXP >::type dcptr(dcptrSEXP); - rcpp_result_gen = Rcpp::wrap(lg_combined_smooth_c_dc(xs, dcptr)); - return rcpp_result_gen; -END_RCPP -} -// lg_getgrad_c_dc -Rcpp::NumericVector lg_getgrad_c_dc(SEXP xs, SEXP dcptr); -RcppExport SEXP _slise_lg_getgrad_c_dc(SEXP xsSEXP, SEXP dcptrSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type xs(xsSEXP); - Rcpp::traits::input_parameter< SEXP >::type dcptr(dcptrSEXP); - rcpp_result_gen = Rcpp::wrap(lg_getgrad_c_dc(xs, dcptr)); - return rcpp_result_gen; -END_RCPP -} -// loss_smooth_c_ptr -Rcpp::XPtr loss_smooth_c_ptr(); -RcppExport SEXP _slise_loss_smooth_c_ptr() { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - rcpp_result_gen = Rcpp::wrap(loss_smooth_c_ptr()); - return rcpp_result_gen; -END_RCPP -} -// loss_smooth_grad_c_ptr -Rcpp::XPtr loss_smooth_grad_c_ptr(); -RcppExport SEXP _slise_loss_smooth_grad_c_ptr() { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - rcpp_result_gen = Rcpp::wrap(loss_smooth_grad_c_ptr()); - return rcpp_result_gen; -END_RCPP -} - -RcppExport SEXP _rcpp_module_boot_slise_mod(); - -static const R_CallMethodDef CallEntries[] = { - {"_slise_sigmoidc", (DL_FUNC) &_slise_sigmoidc, 1}, - {"_slise_loss_smooth_c", (DL_FUNC) &_slise_loss_smooth_c, 6}, - {"_slise_loss_smooth_c_dc", (DL_FUNC) &_slise_loss_smooth_c_dc, 2}, - {"_slise_loss_smooth_grad_c", (DL_FUNC) &_slise_loss_smooth_grad_c, 6}, - {"_slise_loss_smooth_grad_c_dc", (DL_FUNC) &_slise_loss_smooth_grad_c_dc, 2}, - 
{"_slise_lg_combined_smooth_c_dc", (DL_FUNC) &_slise_lg_combined_smooth_c_dc, 2}, - {"_slise_lg_getgrad_c_dc", (DL_FUNC) &_slise_lg_getgrad_c_dc, 2}, - {"_slise_loss_smooth_c_ptr", (DL_FUNC) &_slise_loss_smooth_c_ptr, 0}, - {"_slise_loss_smooth_grad_c_ptr", (DL_FUNC) &_slise_loss_smooth_grad_c_ptr, 0}, - {"_rcpp_module_boot_slise_mod", (DL_FUNC) &_rcpp_module_boot_slise_mod, 0}, - {NULL, NULL, 0} -}; - -RcppExport void R_init_slise(DllInfo *dll) { - R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); - R_useDynamicSymbols(dll, FALSE); -} +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include "slise_types.h" +#include +#include + +using namespace Rcpp; + +// sigmoidc +arma::vec sigmoidc(const arma::vec& x); +RcppExport SEXP _slise_sigmoidc(SEXP xSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::vec& >::type x(xSEXP); + rcpp_result_gen = Rcpp::wrap(sigmoidc(x)); + return rcpp_result_gen; +END_RCPP +} +// log_sigmoidc +arma::vec log_sigmoidc(const arma::vec& x); +RcppExport SEXP _slise_log_sigmoidc(SEXP xSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::vec& >::type x(xSEXP); + rcpp_result_gen = Rcpp::wrap(log_sigmoidc(x)); + return rcpp_result_gen; +END_RCPP +} +// loss_smooth_c +double loss_smooth_c(const arma::vec& alpha, const arma::mat& data, const arma::vec& response, const double& epsilon, const double& beta, const double& lambda1, const double& lambda2, const arma::vec& weight); +RcppExport SEXP _slise_loss_smooth_c(SEXP alphaSEXP, SEXP dataSEXP, SEXP responseSEXP, SEXP epsilonSEXP, SEXP betaSEXP, SEXP lambda1SEXP, SEXP lambda2SEXP, SEXP weightSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::vec& >::type alpha(alphaSEXP); + Rcpp::traits::input_parameter< const arma::mat& >::type data(dataSEXP); + Rcpp::traits::input_parameter< const arma::vec& >::type response(responseSEXP); + Rcpp::traits::input_parameter< const double& >::type epsilon(epsilonSEXP); + Rcpp::traits::input_parameter< const double& >::type beta(betaSEXP); + Rcpp::traits::input_parameter< const double& >::type lambda1(lambda1SEXP); + Rcpp::traits::input_parameter< const double& >::type lambda2(lambda2SEXP); + Rcpp::traits::input_parameter< const arma::vec& >::type weight(weightSEXP); + rcpp_result_gen = Rcpp::wrap(loss_smooth_c(alpha, data, response, epsilon, beta, lambda1, lambda2, weight)); + return rcpp_result_gen; +END_RCPP +} +// loss_smooth_c_dc +Rcpp::NumericVector loss_smooth_c_dc(const SEXP xs, const SEXP dcptr); +RcppExport SEXP _slise_loss_smooth_c_dc(SEXP xsSEXP, SEXP dcptrSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const SEXP >::type xs(xsSEXP); + Rcpp::traits::input_parameter< const SEXP >::type dcptr(dcptrSEXP); + rcpp_result_gen = Rcpp::wrap(loss_smooth_c_dc(xs, dcptr)); + return rcpp_result_gen; +END_RCPP +} +// loss_smooth_grad_c +Rcpp::NumericVector loss_smooth_grad_c(const arma::vec& alpha, const arma::mat& data, const arma::vec& response, const double& epsilon, const double& beta, const double& lambda1, const double& lambda2, const arma::vec& weight); +RcppExport SEXP _slise_loss_smooth_grad_c(SEXP alphaSEXP, SEXP dataSEXP, SEXP responseSEXP, SEXP epsilonSEXP, SEXP betaSEXP, SEXP 
lambda1SEXP, SEXP lambda2SEXP, SEXP weightSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::vec& >::type alpha(alphaSEXP); + Rcpp::traits::input_parameter< const arma::mat& >::type data(dataSEXP); + Rcpp::traits::input_parameter< const arma::vec& >::type response(responseSEXP); + Rcpp::traits::input_parameter< const double& >::type epsilon(epsilonSEXP); + Rcpp::traits::input_parameter< const double& >::type beta(betaSEXP); + Rcpp::traits::input_parameter< const double& >::type lambda1(lambda1SEXP); + Rcpp::traits::input_parameter< const double& >::type lambda2(lambda2SEXP); + Rcpp::traits::input_parameter< const arma::vec& >::type weight(weightSEXP); + rcpp_result_gen = Rcpp::wrap(loss_smooth_grad_c(alpha, data, response, epsilon, beta, lambda1, lambda2, weight)); + return rcpp_result_gen; +END_RCPP +} +// loss_smooth_grad_c_dc +Rcpp::NumericVector loss_smooth_grad_c_dc(const SEXP xs, const SEXP dcptr); +RcppExport SEXP _slise_loss_smooth_grad_c_dc(SEXP xsSEXP, SEXP dcptrSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const SEXP >::type xs(xsSEXP); + Rcpp::traits::input_parameter< const SEXP >::type dcptr(dcptrSEXP); + rcpp_result_gen = Rcpp::wrap(loss_smooth_grad_c_dc(xs, dcptr)); + return rcpp_result_gen; +END_RCPP +} +// lg_combined_smooth_c_dc +Rcpp::NumericVector lg_combined_smooth_c_dc(SEXP xs, SEXP dcptr); +RcppExport SEXP _slise_lg_combined_smooth_c_dc(SEXP xsSEXP, SEXP dcptrSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type xs(xsSEXP); + Rcpp::traits::input_parameter< SEXP >::type dcptr(dcptrSEXP); + rcpp_result_gen = Rcpp::wrap(lg_combined_smooth_c_dc(xs, dcptr)); + return rcpp_result_gen; +END_RCPP +} +// lg_getgrad_c_dc +Rcpp::NumericVector lg_getgrad_c_dc(SEXP xs, SEXP dcptr); +RcppExport SEXP _slise_lg_getgrad_c_dc(SEXP xsSEXP, SEXP dcptrSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type xs(xsSEXP); + Rcpp::traits::input_parameter< SEXP >::type dcptr(dcptrSEXP); + rcpp_result_gen = Rcpp::wrap(lg_getgrad_c_dc(xs, dcptr)); + return rcpp_result_gen; +END_RCPP +} +// loss_smooth_c_ptr +Rcpp::XPtr loss_smooth_c_ptr(); +RcppExport SEXP _slise_loss_smooth_c_ptr() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + rcpp_result_gen = Rcpp::wrap(loss_smooth_c_ptr()); + return rcpp_result_gen; +END_RCPP +} +// loss_smooth_grad_c_ptr +Rcpp::XPtr loss_smooth_grad_c_ptr(); +RcppExport SEXP _slise_loss_smooth_grad_c_ptr() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + rcpp_result_gen = Rcpp::wrap(loss_smooth_grad_c_ptr()); + return rcpp_result_gen; +END_RCPP +} + +RcppExport SEXP _rcpp_module_boot_slise_mod(); + +static const R_CallMethodDef CallEntries[] = { + {"_slise_sigmoidc", (DL_FUNC) &_slise_sigmoidc, 1}, + {"_slise_log_sigmoidc", (DL_FUNC) &_slise_log_sigmoidc, 1}, + {"_slise_loss_smooth_c", (DL_FUNC) &_slise_loss_smooth_c, 8}, + {"_slise_loss_smooth_c_dc", (DL_FUNC) &_slise_loss_smooth_c_dc, 2}, + {"_slise_loss_smooth_grad_c", (DL_FUNC) &_slise_loss_smooth_grad_c, 8}, + {"_slise_loss_smooth_grad_c_dc", (DL_FUNC) &_slise_loss_smooth_grad_c_dc, 2}, + {"_slise_lg_combined_smooth_c_dc", (DL_FUNC) &_slise_lg_combined_smooth_c_dc, 2}, + {"_slise_lg_getgrad_c_dc", (DL_FUNC) &_slise_lg_getgrad_c_dc, 
2}, + {"_slise_loss_smooth_c_ptr", (DL_FUNC) &_slise_loss_smooth_c_ptr, 0}, + {"_slise_loss_smooth_grad_c_ptr", (DL_FUNC) &_slise_loss_smooth_grad_c_ptr, 0}, + {"_rcpp_module_boot_slise_mod", (DL_FUNC) &_rcpp_module_boot_slise_mod, 0}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_slise(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/loss_functions.cpp b/src/loss_functions.cpp index 7d1de2a..708a25c 100644 --- a/src/loss_functions.cpp +++ b/src/loss_functions.cpp @@ -1,397 +1,439 @@ -/* - Loss and gradient functions in Rcpp / RcppArmadillo. - - Usage in R: - library(Rcpp) - sourceCpp("loss_functions.cpp") - - Classes: - (1) DataContainer - - Contains the following fields: - * data (matrix) - * response (vector) - * beta, epsilon, lambda (doubles) - - - Each field has a getter / setter: getData(), setData() etc. - - - To use this in R: - dc <- new(DataContainer, data = matrix(runif(50), nrow = 10), response = runif(10), beta = 1, epsilon = 0.4, lambda = 0.3) - dc$getBeta() ## returns 1 - dc$setBeta(5) ## set the value of beta to 5 - dc$getBeta() ## returns 5 - - - Functions: - (1) loss_smooth_c(vec alpha, mat data, vec response, double beta, double epsilon, double lambda) - - This function returns the value of the loss function (double). - - Call from R by giving the appropriate parameters. - - (2) loss_smooth_grad_c(vec alpha, mat data, vec response, double beta, double epsilon, double lambda) - - This function returns the gradient of the loss function (vector). - - Call from R by giving the appropriate parameters. - - - (3) loss_smooth_c_dc(vec alpha, DataContainer dc) - - This function returns the value of the loss function (double). - - This is a wrapper for loss_smooth_c. - - To use this function, first create a DataContainer in R: - - dc <- new(DataContainer, data = matrix(runif(50), nrow = 10), response = runif(10), beta = 1, epsilon = 0.4, lambda = 0.3) - loss_smooth_c_dc(xs = runif(5), dcptr = dc$.pointer) - - (4) loss_smooth_grad_c_dc(vec alpha, DataContainer dc) - - This function returns the gradient of the loss function (vector). - - This is a wrapper for loss_smooth_grad_c. - - To use this function, first create a DataContainer in R: - - dc <- new(DataContainer, data = matrix(runif(50), nrow = 10), response = runif(10), beta = 1, epsilon = 0.4, lambda = 0.3) - loss_smooth_c_dc(xs = runif(5), dcptr = dc$.pointer) - - - (5) lg_combined_smooth_c_dc(vec alpha, DataContainer dc) - - This function calculates both the loss and the gradient at the same time - - The loss (double) is returned and both the loss (double) and gradient (vector) are stored in the DataContainer. - - To use this function, first create a DataContainer in R: - - dc <- new(DataContainer, data = matrix(runif(50), nrow = 10), response = runif(10), beta = 1, epsilon = 0.4, lambda = 0.3) - lg_combined_smooth_c_dc(xs = runif(5), dcptr = dc$.pointer) ## returns the loss - - dc$getLoss() # get the loss value - dc$getGrad() # get the gradient - - (6) lg_getgrad_c_dc(vec alpha, DataContainer dc) - - This function only returns the gradient found in dc (similar to dc$getGrad()) - - This is to be used with LBFGS. 
- - - Usage with LBFGS: - dc <- new(DataContainer, data = matrix(runif(50), nrow = 10), response = runif(10), beta = 1, epsilon = 0.4, lambda = 0.3) - alpha <- runif(5) - lbfgs(loss_smooth_c_ptr(), loss_smooth_grad_c_ptr(), alpha, dc$.pointer, max_iterations = 100, invisible = TRUE, orthantwise_c = dc$getLambda()), - */ - - - -// [[Rcpp::plugins(cpp14)]] -// [[Rcpp::depends(RcppArmadillo)]] -#include - - -/* -------------------------------------------------- - Class for holding the data - -------------------------------------------------- */ - -class DataContainer; - -//_' @export DataContainer -class DataContainer { -public: - arma::mat data; - arma::vec response; - double beta, epsilon, lambda; - - double loss; - arma::vec grad; - - DataContainer(Rcpp::NumericMatrix data_, Rcpp::NumericVector response_, double beta_, double epsilon_, double lambda_) : data ( Rcpp::as(data_) ), response ( Rcpp::as(response_) ), beta( beta_ ), epsilon( epsilon_ ), lambda( lambda_ ) {}; - DataContainer(Rcpp::NumericMatrix data_, Rcpp::NumericVector response_, double beta_, double epsilon_) : DataContainer(data_, response_, beta_, epsilon_, 0) {}; - DataContainer(const DataContainer & i) : data( i.data ), response( i.response), beta( i.beta), epsilon( i.epsilon ), lambda( i.lambda) {} - - Rcpp::NumericMatrix getData() { return Rcpp::wrap(data); } - Rcpp::NumericVector getResponse() { return Rcpp::wrap(response); } - double getBeta() { return beta; } - double getEpsilon() { return epsilon; } - double getLambda () { return lambda; } - double getLoss() { return loss; } - Rcpp::NumericVector getGrad() { return Rcpp::wrap(grad); } - - void setData(Rcpp::NumericMatrix M) { data = Rcpp::as(M); } - void setResponse(Rcpp::NumericVector v) { response = Rcpp::as(v); } - void setBeta(double x) { beta = x; } - void setEpsilon(double x) { epsilon = x; } - void setLambda(double x) { lambda = x; } - void setGrad(Rcpp::NumericVector v) { grad = Rcpp::as(v); } - -}; - -RCPP_EXPOSED_CLASS(DataContainer) - -RCPP_MODULE(slise_mod) { - Rcpp::class_("DataContainer") - .constructor< Rcpp::NumericMatrix, Rcpp::NumericVector, double, double, double >() - .constructor< Rcpp::NumericMatrix, Rcpp::NumericVector, double, double >() - .constructor< DataContainer >() - .method("getData", &DataContainer::getData) - .method("getResponse", &DataContainer::getResponse) - .method("getBeta", &DataContainer::getBeta) - .method("getEpsilon", &DataContainer::getEpsilon) - .method("getLambda", &DataContainer::getLambda) - .method("getLoss", &DataContainer::getLoss) - .method("getGrad", &DataContainer::getGrad) - - - .method("setData", &DataContainer::setData) - .method("setResponse", &DataContainer::setResponse) - .method("setBeta", &DataContainer::setBeta) - .method("setEpsilon", &DataContainer::setEpsilon) - .method("setLambda", &DataContainer::setLambda) - .method("setGrad", &DataContainer::setGrad) - - ; -} - - -/* -------------------------------------------------- - Sigmoid function written in c++. - -------------------------------------------------- */ -// [[Rcpp::export]] -arma::vec sigmoidc(const arma::vec& x) { - return (1 / (1 + arma::exp(-x))); - // return (arma::exp(x) / (arma::exp(x) + 1)); -} -inline arma::vec i_sigmoidc(const arma::vec& x) { - return (1 / (1 + arma::exp(-x))); -} - -/* -------------------------------------------------- - Dot product. 
- -------------------------------------------------- */ -double dotprod(const arma::vec& a, const arma::vec& b) { - double res = 0; - for (int i = 0; i < a.size(); i++) - res += a[i] * b[i]; - return res; -} - -/* -------------------------------------------------- - Clamp max. - -------------------------------------------------- */ -inline arma::vec clamp_max(const arma::vec& a, double b) { - arma::vec res(a.size()); - for(int i = 0; i < a.size(); i++) - res[i] = a[i] < b ? a[i] : b; - return res; -} - -/* -------------------------------------------------- - Clamp min. - -------------------------------------------------- */ -inline arma::vec clamp_min(const arma::vec& a, double b) { - arma::vec res(a.size()); - for(int i = 0; i < a.size(); i++) - res[i] = a[i] > b ? a[i] : b; - return res; -} - - -/* -------------------------------------------------- - Clamp min according to another vector. - -------------------------------------------------- */ -inline arma::vec clamp_max_other(const arma::vec& a, const arma::vec& b, double c) { - arma::vec res(a.size()); - for(int i = 0; i < a.size(); i++) - res[i] = b[i] < c ? a[i] : c; - return res; -} - - -/* -------------------------------------------------- - Smooth loss function. - - alpha : alpha vector - data : data matrix - response : response vector - beta : beta (double) - epsilon : epsilon (double) - lambda : lambda (double) - -------------------------------------------------- */ - -// [[Rcpp::export]] -double loss_smooth_c(const arma::vec& alpha, - const arma::mat& data, - const arma::vec& response, - const double& beta, - const double& epsilon, - const double& lambda = 0) { - - // calculate loss - double epsilonx = pow(epsilon, 2); - double epsilony = epsilonx * response.size(); - double betax = beta / epsilonx; - - arma::vec distances = arma::pow(data * alpha - response, 2); - arma::vec subsize = i_sigmoidc(betax * (epsilonx - distances)); - arma::vec loss = clamp_max(distances - epsilony, 0); //phi(x) ~ clamp_max(x, 0) - - double out = arma::accu(subsize % loss) / distances.size(); - - if (lambda > 0) - out += (lambda * arma::accu(arma::abs(alpha))); - - return out; -} - - -/* -------------------------------------------------- - Smooth loss function using a DataContainer. - The DataContainer contains the parameters - (data, response, beta, epsilon, lambda) which - are then used to call loss_smooth_c(). - - xs : alpha vector - dcptr : pointer to a DataContainer (from R) - -------------------------------------------------- */ - -// [[Rcpp::export]] -Rcpp::NumericVector loss_smooth_c_dc(const SEXP xs, const SEXP dcptr) { - const Rcpp::XPtr dc(dcptr); - - Rcpp::NumericVector out(1); - out[0] = loss_smooth_c(Rcpp::as(xs), dc->data, dc->response, dc->beta, dc->epsilon, dc->lambda); - - return out; -} - - -/* -------------------------------------------------- - Gradient of the smooth loss function. 
- - alpha : alpha vector - data : data matrix - response : response vector - beta : beta (double) - epsilon : epsilon (double) - lambda : lambda (double) - -------------------------------------------------- */ - -// [[Rcpp::export]] -Rcpp::NumericVector loss_smooth_grad_c(const arma::vec& alpha, - const arma::mat& data, - const arma::vec& response, - const double& beta, - const double& epsilon, - const double& lambda = 0) { - - double epsilonx = pow(epsilon, 2); - double betax = beta / epsilonx; - - arma::vec distances = data * alpha - response; - arma::colvec distances2 = arma::pow(distances, 2); - - arma::colvec f = distances2 / data.n_rows - epsilonx; - arma::colvec s = i_sigmoidc(betax * (epsilonx - distances2)); - - double k1 = 2.0 / data.n_rows; - arma::colvec k2 = -2.0 * betax * (s - pow(s, 2)); - - distances = clamp_max_other(distances, f, 0); //phi(x) ~ clamp_max(x, 0) - - arma::vec out = (data.each_col() % distances).t() * ((s * k1) + (f % k2)); - - if (lambda > 0) - out += (lambda * arma::sign(alpha)); - - return Rcpp::wrap(out); -} - - -/* -------------------------------------------------- - - Gradient of smooth loss function using a DataContainer. - The DataContainer contains the parameters - data, response, beta, epsilon, lambda) which are - then used to call loss_smooth_c(). - - xs : alpha vector - dcptr : pointer to a DataContainer (from R) - -------------------------------------------------- */ - -// [[Rcpp::export]] -Rcpp::NumericVector loss_smooth_grad_c_dc(const SEXP xs, const SEXP dcptr) { - const Rcpp::XPtr dc(dcptr); - - return loss_smooth_grad_c(Rcpp::as(xs), dc->data, dc->response, dc->beta, dc->epsilon, dc->lambda); - -} - - -/* -------------------------------------------------- - Calculate loss and gradient simultaneously, - using a data container. - - xs : alpha vector - dcptr : pointer to a DataContainer (from R) - -------------------------------------------------- */ - -// [[Rcpp::export]] -Rcpp::NumericVector lg_combined_smooth_c_dc(SEXP xs, SEXP dcptr) { - const arma::vec alpha = Rcpp::as(xs); - const Rcpp::XPtr dc(dcptr); - - // calculate loss - double epsilonx = pow(dc->epsilon, 2); - double betax = dc->beta / epsilonx; - - arma::colvec distances = dc->data * alpha - dc->response; - arma::colvec distances2 = arma::pow(distances, 2); - - arma::colvec subsize = i_sigmoidc(betax * (epsilonx - distances2)); - arma::colvec loss = distances2 / dc->data.n_rows - epsilonx; - - dc->loss = arma::accu(subsize % clamp_max(loss, 0)); //phi(x) ~ clamp_max(x, 0) - - // calculate gradient - double k1 = 2.0 / dc->data.n_rows; - arma::colvec k2 = -2.0 * betax * (subsize - pow(subsize, 2)); - - distances = clamp_max_other(distances, loss, 0); //phi(x) ~ clamp_max(x, 0) - dc->grad = (dc->data.each_col() % distances).t() * ((subsize * k1) + (loss % k2)); - - // check lambda - if (dc->lambda > 0) { - dc->loss += (dc->lambda * arma::accu(arma::abs(alpha))); - dc->grad += (dc->lambda * arma::sign(alpha)); - } - - Rcpp::NumericVector out(1); - out[0] = dc->loss; - return out; -} - - -/* -------------------------------------------------- - Return gradient. - This assumes that the gradient has first been - calculated and stored in the DataContainer. 
-
-   xs : alpha vector
-   dcptr : pointer to a DataContainer (from R)
-   -------------------------------------------------- */
-
-// [[Rcpp::export]]
-Rcpp::NumericVector lg_getgrad_c_dc(SEXP xs, SEXP dcptr) {
-    const Rcpp::XPtr<DataContainer> dc(dcptr);
-    return Rcpp::wrap(dc->grad);
-}
-
-
-
-/* --------------------------------------------------
-   Return function pointers to the loss and gradient
-   functions for use with lbfgs.
-   -------------------------------------------------- */
-
-typedef Rcpp::NumericVector (*funcPtr)(const SEXP, const SEXP);
-
-// [[Rcpp::export]]
-Rcpp::XPtr<funcPtr> loss_smooth_c_ptr() {
-    // return(XPtr<funcPtr>(new funcPtr(&loss_smooth_c_dc)));
-    return(Rcpp::XPtr<funcPtr>(new funcPtr(&lg_combined_smooth_c_dc)));
-}
-
-
-// [[Rcpp::export]]
-Rcpp::XPtr<funcPtr> loss_smooth_grad_c_ptr() {
-    // return(XPtr<funcPtr>(new funcPtr(&loss_smooth_grad_c_dc)));
-    return(Rcpp::XPtr<funcPtr>(new funcPtr(&lg_getgrad_c_dc)));
-}
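Between the removed implementation above and its replacement below, the exported signatures change in three ways: epsilon now comes before beta, the single lambda is split into lambda1 (L1) and lambda2 (L2) penalties, and every loss function gains a trailing weight vector (an empty vector means unweighted). A minimal sketch of a migrated call site, using only names that appear in this diff (the values are placeholders):

    # old: loss_smooth_c(alpha, data = X, response = Y, beta = 3, epsilon = 0.1, lambda = 0)
    loss_smooth_c(alpha, data = X, response = Y, epsilon = 0.1, beta = 3,
                  lambda1 = 0, lambda2 = 0, weight = numeric(0))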
+/*
+    Loss and gradient functions in Rcpp / RcppArmadillo.
+
+    Usage in R:
+        library(Rcpp)
+        sourceCpp("loss_functions.cpp")
+
+    Classes:
+    (1) DataContainer
+        - Contains the following fields:
+            * data (matrix)
+            * response (vector)
+            * beta, epsilon, lambda1, lambda2 (doubles)
+            * weight (vector, empty when unweighted)
+
+        - Each field has a getter / setter: getData(), setData() etc.
+
+        - To use this in R:
+            dc <- new(DataContainer, data = matrix(runif(50), nrow = 10), response = runif(10), epsilon = 0.4, beta = 1, lambda1 = 0.3, lambda2 = 0.0)
+            dc$getBeta()  ## returns 1
+            dc$setBeta(5) ## set the value of beta to 5
+            dc$getBeta()  ## returns 5
+
+
+    Functions:
+    (1) loss_smooth_c(vec alpha, mat data, vec response, double epsilon, double beta, double lambda1, double lambda2, vec weight)
+        - This function returns the value of the loss function (double).
+        - Call from R by giving the appropriate parameters.
+
+    (2) loss_smooth_grad_c(vec alpha, mat data, vec response, double epsilon, double beta, double lambda1, double lambda2, vec weight)
+        - This function returns the gradient of the loss function (vector).
+        - Call from R by giving the appropriate parameters.
+
+    (3) loss_smooth_c_dc(vec alpha, DataContainer dc)
+        - This function returns the value of the loss function (double).
+        - This is a wrapper for loss_smooth_c.
+        - To use this function, first create a DataContainer in R:
+
+            dc <- new(DataContainer, data = matrix(runif(50), nrow = 10), response = runif(10), epsilon = 0.4, beta = 1, lambda1 = 0.3, lambda2 = 0.0)
+            loss_smooth_c_dc(xs = runif(5), dcptr = dc$.pointer)
+
+    (4) loss_smooth_grad_c_dc(vec alpha, DataContainer dc)
+        - This function returns the gradient of the loss function (vector).
+        - This is a wrapper for loss_smooth_grad_c.
+        - To use this function, first create a DataContainer in R:
+
+            dc <- new(DataContainer, data = matrix(runif(50), nrow = 10), response = runif(10), epsilon = 0.4, beta = 1, lambda1 = 0.3, lambda2 = 0.0)
+            loss_smooth_grad_c_dc(xs = runif(5), dcptr = dc$.pointer)
+
+    (5) lg_combined_smooth_c_dc(vec alpha, DataContainer dc)
+        - This function calculates both the loss and the gradient at the same time
+        - The loss (double) is returned and both the loss (double) and gradient (vector) are stored in the DataContainer.
+        - To use this function, first create a DataContainer in R:
+
+            dc <- new(DataContainer, data = matrix(runif(50), nrow = 10), response = runif(10), epsilon = 0.4, beta = 1, lambda1 = 0.3, lambda2 = 0.0)
+            lg_combined_smooth_c_dc(xs = runif(5), dcptr = dc$.pointer) ## returns the loss
+
+            dc$getLoss() # get the loss value
+            dc$getGrad() # get the gradient
+
+    (6) lg_getgrad_c_dc(vec alpha, DataContainer dc)
+        - This function only returns the gradient found in dc (similar to dc$getGrad())
+        - This is to be used with LBFGS.
+
+
+    Usage with LBFGS:
+        dc <- new(DataContainer, data = matrix(runif(50), nrow = 10), response = runif(10), epsilon = 0.4, beta = 1, lambda1 = 0.0, lambda2 = 0.0)
+        alpha <- runif(5)
+        lbfgs(loss_smooth_c_ptr(), loss_smooth_grad_c_ptr(), alpha, dc$.pointer, max_iterations = 100, invisible = TRUE, orthantwise_c = lambda1)
+*/
+
+// [[Rcpp::plugins(cpp11)]]
+// [[Rcpp::depends(RcppArmadillo)]]
+#include <RcppArmadillo.h>
+
+/* --------------------------------------------------
+  Class for holding the data
+  -------------------------------------------------- */
+
+class DataContainer;
+
+class DataContainer
+{
+public:
+    arma::mat data;
+    arma::vec response;
+    double epsilon, beta, lambda1, lambda2;
+    arma::vec weight;
+    double ws; // Should always be updated to arma::accu(weight) (== sum(weight))
+
+    double loss;
+    arma::vec grad;
+
+    DataContainer(arma::mat data_, arma::vec response_, double epsilon_, double beta_, double lambda1_, double lambda2_, arma::vec weight_) : data(data_), response(response_), epsilon(epsilon_), beta(beta_), lambda1(lambda1_), lambda2(lambda2_), weight(weight_), ws(arma::accu(weight_)){};
+    DataContainer(Rcpp::NumericMatrix data_, Rcpp::NumericVector response_, double epsilon_, double beta_, double lambda1_, double lambda2_, Rcpp::NumericVector weight_) : DataContainer(Rcpp::as<arma::mat>(data_), Rcpp::as<arma::vec>(response_), epsilon_, beta_, lambda1_, lambda2_, Rcpp::as<arma::vec>(weight_)){};
+    DataContainer(Rcpp::NumericMatrix data_, Rcpp::NumericVector response_, double epsilon_, double beta_, double lambda1_, double lambda2_) : DataContainer(Rcpp::as<arma::mat>(data_), Rcpp::as<arma::vec>(response_), epsilon_, beta_, lambda1_, lambda2_, arma::zeros<arma::vec>(0)){};
+    DataContainer(Rcpp::NumericMatrix data_, Rcpp::NumericVector response_, double epsilon_, double beta_, Rcpp::NumericVector weight_) : DataContainer(Rcpp::as<arma::mat>(data_), Rcpp::as<arma::vec>(response_), epsilon_, beta_, 0.0, 0.0, Rcpp::as<arma::vec>(weight_)){};
+    DataContainer(Rcpp::NumericMatrix data_, Rcpp::NumericVector response_, double epsilon_, double beta_) : DataContainer(Rcpp::as<arma::mat>(data_), Rcpp::as<arma::vec>(response_), epsilon_, beta_, 0.0, 0.0, arma::zeros<arma::vec>(0)){};
+    DataContainer(const DataContainer &i) : data(i.data), response(i.response), epsilon(i.epsilon), beta(i.beta), lambda1(i.lambda1), lambda2(i.lambda2), weight(i.weight), ws(i.ws) {}
+
+    Rcpp::NumericMatrix getData() { return Rcpp::wrap(data); }
+    Rcpp::NumericVector getResponse() { return Rcpp::wrap(response); }
+    double getEpsilon() { return epsilon; }
+    double getBeta() { return beta; }
+    double getLambda1() { return lambda1; }
+    double getLambda2() { return lambda2; }
+    double getLoss() { return loss; }
+    Rcpp::NumericVector getWeight() { return Rcpp::wrap(weight); }
+    Rcpp::NumericVector getGrad() { return Rcpp::wrap(grad); }
+
+    void setData(Rcpp::NumericMatrix M) { data = Rcpp::as<arma::mat>(M); }
+    void setResponse(Rcpp::NumericVector v) { response = Rcpp::as<arma::vec>(v); }
+    void setEpsilon(double x) { epsilon = x; }
+    void setBeta(double x) { beta = x; }
+    void setLambda1(double x) { lambda1 = x; }
+    void setLambda2(double x) { lambda2 = x; }
+    void setWeight(Rcpp::NumericVector v)
+    {
+        weight = Rcpp::as<arma::vec>(v);
+        ws = arma::accu(weight);
+    }
+    void setGrad(Rcpp::NumericVector v) { grad = Rcpp::as<arma::vec>(v); }
+};
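One detail of the class above that is easy to miss: ws caches arma::accu(weight) so the hot loss/gradient path does not have to re-sum the weights on every optimizer iteration, which is why setWeight is the only setter that updates two fields. A short R sketch of the intended access pattern (constructor arguments as in the file header; values are placeholders):

    dc <- new(DataContainer, data = matrix(runif(50), nrow = 10), response = runif(10),
              epsilon = 0.4, beta = 1, lambda1 = 0.0, lambda2 = 0.0)
    dc$setWeight(runif(10)) # go through the setter so the cached sum of weights stays in sync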
+RCPP_MODULE(slise_mod)
+{
+    Rcpp::class_<DataContainer>("DataContainer")
+        .constructor<Rcpp::NumericMatrix, Rcpp::NumericVector, double, double, double, double, Rcpp::NumericVector>()
+        .constructor<Rcpp::NumericMatrix, Rcpp::NumericVector, double, double, double, double>()
+        .constructor<Rcpp::NumericMatrix, Rcpp::NumericVector, double, double, Rcpp::NumericVector>()
+        .constructor<Rcpp::NumericMatrix, Rcpp::NumericVector, double, double>()
+        .constructor<DataContainer>()
+
+        .method("getData", &DataContainer::getData)
+        .method("getResponse", &DataContainer::getResponse)
+        .method("getBeta", &DataContainer::getBeta)
+        .method("getEpsilon", &DataContainer::getEpsilon)
+        .method("getLambda1", &DataContainer::getLambda1)
+        .method("getLambda2", &DataContainer::getLambda2)
+        .method("getWeight", &DataContainer::getWeight)
+        .method("getLoss", &DataContainer::getLoss)
+        .method("getGrad", &DataContainer::getGrad)
+
+        .method("setData", &DataContainer::setData)
+        .method("setResponse", &DataContainer::setResponse)
+        .method("setBeta", &DataContainer::setBeta)
+        .method("setEpsilon", &DataContainer::setEpsilon)
+        .method("setLambda1", &DataContainer::setLambda1)
+        .method("setLambda2", &DataContainer::setLambda2)
+        .method("setWeight", &DataContainer::setWeight)
+        .method("setGrad", &DataContainer::setGrad)
+
+        ;
+}
+
+/* --------------------------------------------------
+  Utility functions.
+  -------------------------------------------------- */
+
+// [[Rcpp::export]]
+arma::vec sigmoidc(const arma::vec &x)
+{
+    return (1 / (1 + arma::exp(-x)));
+}
+
+arma::vec i_sigmoidc(const arma::vec &x)
+{
+    return (1 / (1 + arma::exp(-x)));
+}
+
+// [[Rcpp::export]]
+arma::vec log_sigmoidc(const arma::vec &x)
+{
+    arma::vec res(x.size());
+    for (arma::uword i = 0; i < x.size(); i++)
+    {
+        const double val = x[i];
+        res[i] = val >= 0 ? -log(1 + exp(-val)) : val - log(1 + exp(val));
+    }
+    return res;
+}
+
+arma::vec pmin(const arma::vec &a, const double b)
+{
+    arma::vec res(a.size());
+    for (arma::uword i = 0; i < a.size(); i++)
+    {
+        const double val = a[i];
+        res[i] = val < b ? val : b;
+    }
+    return res;
+}
+
+arma::vec pmin_other(const arma::vec &a, const arma::vec &b, const double c)
+{
+    arma::vec res(a.size());
+    for (arma::uword i = 0; i < a.size(); i++)
+        res[i] = b[i] < c ? a[i] : c;
+    return res;
+}
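The two-branch form of log_sigmoidc above is the standard numerically stable log-sigmoid: exp() is only ever applied to non-positive values, so it can neither overflow nor lose all precision. A quick R comparison against the naive formulation (assuming the package is compiled so both exported functions are available):

    log(sigmoidc(c(-800, 0, 800))) # -Inf for -800, since the sigmoid underflows to 0
    log_sigmoidc(c(-800, 0, 800))  # finite: approximately -800, log(0.5), 0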
+/* --------------------------------------------------
+  Smooth loss function.
+
+  alpha    : alpha vector
+  data     : data matrix
+  response : response vector
+  epsilon  : epsilon (double)
+  beta     : beta (double)
+  lambda1  : lambda1 (double)
+  lambda2  : lambda2 (double)
+  weight   : weight vector
+  -------------------------------------------------- */
+
+// [[Rcpp::export]]
+double loss_smooth_c(const arma::vec &alpha,
+                     const arma::mat &data,
+                     const arma::vec &response,
+                     const double &epsilon,
+                     const double &beta,
+                     const double &lambda1,
+                     const double &lambda2,
+                     const arma::vec &weight)
+{
+
+    // calculate loss
+    double epsilon2 = pow(epsilon, 2);
+    arma::vec distances = arma::pow(data * alpha - response, 2);
+    arma::vec subset = i_sigmoidc(beta * (epsilon2 - distances));
+
+    double loss;
+    if (weight.size() > 0)
+    {
+        double length = arma::accu(weight);
+        arma::vec residuals = pmin(distances - (epsilon2 * length), 0);
+        loss = arma::accu(subset % residuals % weight) / length;
+    }
+    else
+    {
+        arma::vec residuals = pmin(distances - epsilon2 * response.size(), 0);
+        loss = arma::accu(subset % residuals) / distances.size();
+    }
+
+    if (lambda1 > 0)
+        loss += (lambda1 * arma::accu(arma::abs(alpha)));
+    if (lambda2 > 0)
+        loss += (lambda2 * arma::accu(alpha % alpha));
+
+    return loss;
+}
+
+/* --------------------------------------------------
+  Smooth loss function using a DataContainer.
+  The DataContainer contains the parameters
+  (data, response, epsilon, beta, lambda1, lambda2, weight)
+  which are then used to call loss_smooth_c().
+
+  xs    : alpha vector
+  dcptr : pointer to a DataContainer (from R)
+  -------------------------------------------------- */
+
+// [[Rcpp::export]]
+Rcpp::NumericVector loss_smooth_c_dc(const SEXP xs, const SEXP dcptr)
+{
+    const Rcpp::XPtr<DataContainer> dc(dcptr);
+
+    Rcpp::NumericVector out(1);
+    out[0] = loss_smooth_c(Rcpp::as<arma::vec>(xs), dc->data, dc->response, dc->epsilon, dc->beta, dc->lambda1, dc->lambda2, dc->weight);
+
+    return out;
+}
+
+/* --------------------------------------------------
+  Gradient of the smooth loss function.
+
+  alpha    : alpha vector
+  data     : data matrix
+  response : response vector
+  epsilon  : epsilon (double)
+  beta     : beta (double)
+  lambda1  : lambda1 (double)
+  lambda2  : lambda2 (double)
+  weight   : weight vector
+  -------------------------------------------------- */
+
+// [[Rcpp::export]]
+Rcpp::NumericVector loss_smooth_grad_c(const arma::vec &alpha,
+                                       const arma::mat &data,
+                                       const arma::vec &response,
+                                       const double &epsilon,
+                                       const double &beta,
+                                       const double &lambda1,
+                                       const double &lambda2,
+                                       const arma::vec &weight)
+{
+
+    double epsilon2 = pow(epsilon, 2);
+    double length;
+    if (weight.size() > 0)
+        length = arma::accu(weight);
+    else
+        length = data.n_rows;
+    arma::vec distances = data * alpha - response;
+    arma::colvec distances2 = arma::pow(distances, 2);
+
+    arma::colvec f = distances2 - epsilon2 * length;
+    arma::colvec s = i_sigmoidc(beta * (epsilon2 - distances2));
+    double k1 = 2.0 / length;
+    arma::colvec k2 = (-2.0 * beta / length) * (s - pow(s, 2));
+    distances = pmin_other(distances, f, 0); //phi(x) ~ pmin(x, 0)
+
+    arma::vec grad;
+    if (weight.size() > 0)
+        grad = (data.each_col() % (distances % weight)).t() * ((s * k1) + (f % k2));
+    else
+        grad = (data.each_col() % distances).t() * ((s * k1) + (f % k2));
+
+    if (lambda1 > 0)
+        grad += (lambda1 * arma::sign(alpha));
+    if (lambda2 > 0)
+        grad += ((lambda2 * 2) * alpha);
+
+    return Rcpp::wrap(grad);
+}
+
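For reference, the unweighted branch of loss_smooth_c above corresponds to the following R computation (a direct transcription for reading convenience, not part of the package API):

    loss_smooth_sketch <- function(alpha, X, Y, epsilon, beta, lambda1 = 0, lambda2 = 0) {
        r2 <- c(X %*% alpha - Y)^2                        # squared residuals
        subset <- 1 / (1 + exp(-beta * (epsilon^2 - r2))) # soft subset membership
        resid <- pmin(r2 - epsilon^2 * length(Y), 0)      # clamped residual term
        sum(subset * resid) / length(r2) +
            lambda1 * sum(abs(alpha)) + lambda2 * sum(alpha^2)
    }

The sigmoid factor is a smooth stand-in for the subset indicator |r_i| < epsilon, with beta controlling how sharply it approaches the indicator; loss_smooth_grad_c differentiates exactly this expression.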
+/* --------------------------------------------------
+  Gradient of smooth loss function using a DataContainer.
+  The DataContainer contains the parameters
+  (data, response, epsilon, beta, lambda1, lambda2, weight)
+  which are then used to call loss_smooth_grad_c().
+
+  xs    : alpha vector
+  dcptr : pointer to a DataContainer (from R)
+  -------------------------------------------------- */
+
+// [[Rcpp::export]]
+Rcpp::NumericVector loss_smooth_grad_c_dc(const SEXP xs, const SEXP dcptr)
+{
+    const Rcpp::XPtr<DataContainer> dc(dcptr);
+    return loss_smooth_grad_c(Rcpp::as<arma::vec>(xs), dc->data, dc->response, dc->epsilon, dc->beta, dc->lambda1, dc->lambda2, dc->weight);
+}
+
+/* --------------------------------------------------
+  Calculate loss and gradient simultaneously,
+  using a data container.
+
+  xs    : alpha vector
+  dcptr : pointer to a DataContainer (from R)
+  -------------------------------------------------- */
+
+// [[Rcpp::export]]
+Rcpp::NumericVector lg_combined_smooth_c_dc(SEXP xs, SEXP dcptr)
+{
+    const arma::vec alpha = Rcpp::as<arma::vec>(xs);
+    const Rcpp::XPtr<DataContainer> dc(dcptr);
+
+    // calculate loss
+    double epsilon2 = pow(dc->epsilon, 2);
+    double length;
+    if (dc->weight.size() > 0)
+        length = dc->ws;
+    else
+        length = dc->data.n_rows;
+
+    arma::colvec distances = dc->data * alpha - dc->response;
+    arma::colvec distances2 = arma::pow(distances, 2);
+
+    arma::colvec subset = i_sigmoidc(dc->beta * (epsilon2 - distances2));
+    arma::colvec residuals = distances2 - epsilon2 * length;
+
+    if (dc->weight.size() > 0)
+        dc->loss = arma::accu(subset % pmin(residuals, 0) % dc->weight) / length;
+    else
+        dc->loss = arma::accu(subset % pmin(residuals, 0)) / length;
+
+    // calculate gradient
+    double k1 = 2.0 / length;
+    arma::colvec k2 = (-2.0 * dc->beta / length) * (subset - pow(subset, 2));
+    distances = pmin_other(distances, residuals, 0);
+
+    if (dc->weight.size() > 0)
+        dc->grad = (dc->data.each_col() % (distances % dc->weight)).t() * ((subset * k1) + (residuals % k2));
+    else
+        dc->grad = (dc->data.each_col() % distances).t() * ((subset * k1) + (residuals % k2));
+
+    // check lambda
+    if (dc->lambda1 > 0)
+    {
+        dc->loss += (dc->lambda1 * arma::accu(arma::abs(alpha)));
+        dc->grad += (dc->lambda1 * arma::sign(alpha));
+    }
+    if (dc->lambda2 > 0)
+    {
+        dc->loss += (dc->lambda2 * arma::accu(alpha % alpha));
+        dc->grad += ((dc->lambda2 * 2) * alpha);
+    }
+
+    Rcpp::NumericVector out(1);
+    out[0] = dc->loss;
+    return out;
+}
+
+/* --------------------------------------------------
+  Return gradient.
+  This assumes that the gradient has first been
+  calculated and stored in the DataContainer.
+
+  xs    : alpha vector
+  dcptr : pointer to a DataContainer (from R)
+  -------------------------------------------------- */
+
+// [[Rcpp::export]]
+Rcpp::NumericVector lg_getgrad_c_dc(SEXP xs, SEXP dcptr)
+{
+    const Rcpp::XPtr<DataContainer> dc(dcptr);
+    return Rcpp::wrap(dc->grad);
+}
+
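The split between lg_combined_smooth_c_dc and lg_getgrad_c_dc reflects how the lbfgs package drives the objective: it evaluates the loss and then the gradient at the same point, so the combined function makes a single pass over the data, stores both results in the container, and the gradient callback merely returns the cached vector. A usage sketch mirroring the file header (assuming a DataContainer dc built on a 5-column data matrix):

    library(lbfgs)
    res <- lbfgs(loss_smooth_c_ptr(), loss_smooth_grad_c_ptr(), runif(5),
                 dc$.pointer, max_iterations = 100, invisible = TRUE)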
+/* --------------------------------------------------
+  Return function pointers to the loss and gradient
+  functions for use with lbfgs.
+  -------------------------------------------------- */
+
+typedef Rcpp::NumericVector (*funcPtr)(const SEXP, const SEXP);
+
+// [[Rcpp::export]]
+Rcpp::XPtr<funcPtr> loss_smooth_c_ptr()
+{
+    return (Rcpp::XPtr<funcPtr>(new funcPtr(&lg_combined_smooth_c_dc)));
+}
+
+// [[Rcpp::export]]
+Rcpp::XPtr<funcPtr> loss_smooth_grad_c_ptr()
+{
+    return (Rcpp::XPtr<funcPtr>(new funcPtr(&lg_getgrad_c_dc)));
+}
diff --git a/src/slise_types.h b/src/slise_types.h
index d58200d..3a28c4d 100644
--- a/src/slise_types.h
+++ b/src/slise_types.h
@@ -1,5 +1,4 @@
-
-#include <RcppArmadillo.h>
-
-typedef Rcpp::NumericVector (*funcPtr)(const SEXP, const SEXP);
-
+
+#include <RcppArmadillo.h>
+
+typedef Rcpp::NumericVector (*funcPtr)(const SEXP, const SEXP);
diff --git a/tests/testthat.R b/tests/testthat.R
index 56298e6..3c65abe 100644
--- a/tests/testthat.R
+++ b/tests/testthat.R
@@ -1,4 +1,6 @@
-library(testthat)
-library(slise)
-
-test_check("slise")
+library(testthat)
+library(slise)
+
+set.seed(42)
+
+test_check("slise")
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
index fb34f02..4b711b0 100644
--- a/tests/testthat/setup.R
+++ b/tests/testthat/setup.R
@@ -1,38 +1,38 @@
-
-library(slise)
-
-#' Create Synthetic data
-#'
-#' @param n number of items
-#' @param d number of columns
-#' @param num_zero number of irrelevant features
-#' @param epsilon
-#' @param rnd_unif fraction of uniform noise
-#' @param rnd_adver fraction of adversarial noise models (list)
-#'
-#' @return list(X, Y, alpha, clean)
-#'
-data_create <- function(n, d, num_zero = floor(d * 0.3), epsilon = 0.1, rnd_unif = 0.0, rnd_adver = rep(0.1, 8)) {
-    X <- matrix(rnorm(d * n), n, d)
-    X <- sweep(X, 2, rnorm(d))
-    alpha <- runif(d + 1, -1, 1)
-    if (num_zero > 0) alpha[which_min_n(abs(alpha), num_zero)] <- 0
-    Y <- X %*% alpha[-1] + alpha[[1]]
-    Y <- Y + rnorm(n, sd = sd(Y) * epsilon / 2)
-    clean <- Y
-    start <- 1
-    for (i in rnd_adver) {
-        if (i > 0) {
-            size <- floor(n * i)
-            a2 <- runif(d + 1, -1, 1)
-            mask <- (start + 1):(start + size)
-            Y[mask] <- X[mask, ] %*% a2[-1] + a2[[1]] + rnorm(size, sd = sd(Y) * epsilon / 2)
-            start <- start + size
-        }
-    }
-    if (rnd_unif > 0) {
-        size <- floor(n * rnd_unif)
-        Y[(start + 1):(start + size)] <- runif(size, min(Y), max(Y))
-    }
-    list(X = X, Y = c(Y), alpha = alpha, clean = c(clean))
-}
+
+library(slise)
+
+#' Create Synthetic data
+#'
+#' @param n number of items
+#' @param d number of columns
+#' @param num_zero number of irrelevant features
+#' @param epsilon noise level (the noise sd is sd(Y) * epsilon / 2)
+#' @param rnd_unif fraction of uniform noise
+#' @param rnd_adver fraction of adversarial noise models (list)
+#'
+#' @return list(X, Y, alpha, clean)
+#'
+data_create <- function(n, d, num_zero = floor(d * 0.3), epsilon = 0.1, rnd_unif = 0.0, rnd_adver = rep(0.1, 8)) {
+    X <- matrix(rnorm(d * n), n, d)
+    X <- sweep(X, 2, rnorm(d))
+    alpha <- runif(d + 1, -1, 1)
+    if (num_zero > 0) alpha[which_min_n(abs(alpha), num_zero)] <- 0
+    Y <- X %*% alpha[-1] + alpha[[1]]
+    Y <- Y + rnorm(n, sd = sd(Y) * epsilon / 2)
+    clean <- Y
+    start <- 1
+    for (i in rnd_adver) {
+        if (i > 0) {
+            size <- floor(n * i)
+            a2 <- runif(d + 1, -1, 1)
+            mask <- (start + 1):(start + size)
+            Y[mask] <- X[mask, ] %*% a2[-1] + a2[[1]] + rnorm(size, sd = sd(Y) * epsilon / 2)
+            start <- start + size
+        }
+    }
+    if (rnd_unif > 0) {
+        size <- floor(n * rnd_unif)
+        Y[(start + 1):(start + size)] <- runif(size, min(Y), max(Y))
+    }
+    list(X = X, Y = c(Y), alpha = alpha, clean = c(clean))
+}
diff --git a/tests/testthat/test_data.R b/tests/testthat/test_data.R
index a7f6e18..924da02 100644
--- a/tests/testthat/test_data.R
+++
b/tests/testthat/test_data.R @@ -1,32 +1,44 @@ -context("Tests for the data functions") -source("setup.R") - -test_that("Check data_preprocess", { - for (i in c(rep(c(4, 8), 2))) { - data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3) - for(i in 0:7) { - sc <- i%%2 > 0 - ic <- i%%4 < 2 - lg <- i >= 4 - Y <- if(lg) sigmoid(data$Y) else data$Y - prep <- data_preprocess(data$X, Y, sc, ic, lg) - if (ic) { - expect_equal(prep$X[,-1], prep$scale_x(data$X)) - } else { - expect_equal(prep$X, prep$scale_x(data$X)) - } - expect_equal(prep$Y[1], prep$scale_y(Y[1])) - expect_equal(Y[1], prep$unscale_y(prep$Y[1])) - expect_equal(data$alpha, prep$unscale_alpha(prep$scale_alpha(data$alpha))) - } - } -}) - -test_that("Check data_local", { - for (i in c(rep(c(4, 8), 2))) { - data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3) - local <- data_local(data$X, data$Y, sample(2:30, 1)) - expect_equal(local$X[1,], local$scale_x(data$X[1,])) - expect_equal(local$Y[1], local$scale_y(data$Y[1])) - } -}) \ No newline at end of file +context("Tests for the data functions") +source("setup.R") + +test_that("Check data preprocessing", { + for (i in c(rep(c(4, 8), 2))) { + data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3) + for(i in 0:7) { + expect_equal( + c(data$X), + c(remove_intercept_column(add_intercept_column(data$X))) + ) + X <- data$X + X[, 3] <- 0 + X2 <- remove_constant_columns(X) + expect_equal(c(X[, -3]), c(X2)) + expect_equal(attr(X2, "constant_columns"), 3) + X3 <- scale_robust(X2) + expect_equal( + c(X2), + c(sweep(sweep(X3, 2, attr(X3, "scaled:scale"), `*`), 2, attr(X3, "scaled:center"), `+`)) + ) + expect_equal(X3, scale_same(data$X, X3)) + expect_equal(c(scale_same(X[4, ], X3)), c(X3[4, ])) + expect_equal(c(scale_same(X[1:3, ], X3)), c(X3[1:3, ])) + Y2 <- scale_robust(data$Y) + ols1 <- .lm.fit(add_intercept_column(X3), Y2)$coefficients + ols2 <- .lm.fit(add_intercept_column(X2), data$Y)$coefficients + ols3 <- unscale_alpha(ols1, X3, Y2) + expect_equal(ols2, ols3) + } + } +}) + +test_that("Check simple_pca", { + for (i in c(rep(c(4, 8), 2))) { + data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3) + pca1 <- simple_pca(data$X, ceiling(i * 0.3)) + pca2 <- prcomp(data$X, center = FALSE, scale = FALSE, rank = ceiling(i * 0.3))$rotation + expect_equal(c(pca1), c(pca2)) + pca3 <- simple_pca(data$X, i) + X2 <- (data$X %*% pca3) %*% t(pca3) + expect_equal(X2, data$X) + } +}) diff --git a/tests/testthat/test_expl.R b/tests/testthat/test_expl.R deleted file mode 100644 index 88891c9..0000000 --- a/tests/testthat/test_expl.R +++ /dev/null @@ -1,60 +0,0 @@ -context("Tests for plotting and printing") -source("setup.R") - -.test_plot <- function(expr) { - pdf(NULL) - tryCatch(expr, finally = { dev.off() }) - expect_true(TRUE) -} - -test_that("Check print", { - data <- data_create(300, 5, 1) - expl <- slise.fit(data$X, data$Y) - cap <- capture.output(print(expl, title="TEST"))[[1]] - expect_true(startsWith(cap, "TEST")) - expl <- slise.explain(data$X, data$Y, 3) - cap <- capture.output(print(expl, title="TEST"))[[1]] - expect_true(startsWith(cap, "TEST")) -}) - -test_that("Check plot", { - .test_plot({ - data <- data_create(300, 5, 1) - expl <- slise.fit(data$X, data$Y) - cap <- plot(expl, cols = 1) - cap <- plot(expl, cols = 1, other=list(expl)) - cap <- plot(expl, cols = c(1, 2)) - cap <- plot(expl, cols = c(1, 2), threed=TRUE) - expl <- slise.explain(data$X, data$Y, 3) - cap <- plot(expl, cols = 1) - cap <- plot(expl, cols = 1, other=list(expl)) - cap <- plot(expl, cols = 
c(1, 2)) - cap <- plot(expl, cols = c(1, 2), threed=TRUE) - }) -}) - -test_that("Check explain bar", { - .test_plot({ - data <- data_create(300, 5, 1) - expl <- slise.explain(data$X, data$Y, 3) - explain(expl, "bar") - }) -}) - -test_that("Check explain dist", { - .test_plot({ - data <- data_create(300, 5) - expl <- slise.fit(data$X, data$Y) - expect_error(explain(expl, "dist")) - expl <- slise.explain(data$X, data$Y, 3) - explain(expl, "distribution") - }) -}) - -test_that("Check explain image", { - .test_plot({ - data <- data_create(300, 9, 1, 0.1, 0.3, 0.3) - expl <- slise.explain(data$X, data$Y, 3) - explain(expl, "image") - }) -}) \ No newline at end of file diff --git a/tests/testthat/test_gradients.R b/tests/testthat/test_gradients.R index f6a66f4..7e2354d 100644 --- a/tests/testthat/test_gradients.R +++ b/tests/testthat/test_gradients.R @@ -1,137 +1,199 @@ -context("Tests for comparing R/C++ calculations") -source("setup.R") - -test_that("Compare loss_smooth_grad to numerical gradient approximation", { - if (require(numDeriv)) { - for (j in 1:8) { - data <- data_create(100, 6, 2, 0.1, 0.3, 0.3) - alpha <- rnorm(ncol(data$X)) - expect_equal( - c(loss_smooth_grad(alpha, X=data$X, Y=data$Y, epsilon=0.1, lambda=0)), - grad(loss_smooth, alpha, X=data$X, Y=data$Y, epsilon=0.1, lambda=0), - tolerance = 1e-6 - ) - expect_equal( - c(loss_smooth_grad(alpha, X=data$X, Y=data$Y, epsilon=0.1, lambda=0.1)), - grad(loss_smooth, alpha, X=data$X, Y=data$Y, epsilon=0.1, lambda=0.1), - tolerance = 1e-6 - ) - i <- sample.int(length(data$Y), 1) - data <- data_local(data$X, data$Y, data$X[i,], data$Y[[i]]) - alpha <- rnorm(ncol(data$X)) - expect_equal( - c(loss_smooth_grad(alpha, X=data$X, Y=data$Y, epsilon=0.03, lambda=0)), - grad(loss_smooth, alpha, X=data$X, Y=data$Y, epsilon=0.03, lambda=0), - tolerance = 1e-6 - ) - expect_equal( - c(loss_smooth_grad(alpha, X=data$X, Y=data$Y, epsilon=0.03, lambda=0.1)), - grad(loss_smooth, alpha, X=data$X, Y=data$Y, epsilon=0.03, lambda=0.1), - tolerance = 1e-6 - ) - } - } else { - skip("Package numDeriv required for comparing the manual gradients to numerical gradients") - } -}) - - -test_that("Compare loss_smooth and loss_smooth_grad to cpp", { - for (j in 1:8) { - # global - data <- data_create(100, 6, 2, 0.1, 0.3, 0.3) - alpha <- rnorm(ncol(data$X)) - # fn - expect_equal( - c(loss_smooth(alpha, X=data$X, Y=data$Y, epsilon=0.1, beta=3, lambda=0)), - c(loss_smooth_c(alpha, data=data$X, response=data$Y, beta=3, epsilon=0.1)), - tolerance = 1e-6 - ) - expect_equal( - c(loss_smooth_grad(alpha, X=data$X, Y=data$Y, epsilon=0.1, beta=3, lambda=0)), - c(loss_smooth_grad_c(alpha, data=data$X, response=data$Y, beta=3, epsilon=0.1)), - tolerance = 1e-6 - ) - # dc - dc <- new(DataContainer, data = data$X, response = data$Y, beta = 3, epsilon = 0.1) - expect_equal( - c(loss_smooth_grad(alpha, X=data$X, Y=data$Y, epsilon=0.1, beta=3, lambda=0)), - c(loss_smooth_grad_c_dc(xs=alpha, dcptr = dc$.pointer)), - tolerance = 1e-6 - ) - expect_equal( - c(loss_smooth(alpha, X=data$X, Y=data$Y, epsilon=0.1, beta=3, lambda=0)), - c(loss_smooth_c_dc(xs=alpha, dcptr = dc$.pointer)), - tolerance = 1e-6 - ) - # ptr - expect_equal( - c(loss_smooth(alpha, X=data$X, Y=data$Y, epsilon=0.1, beta=3, lambda=0)), - c(lg_combined_smooth_c_dc(xs = alpha, dcptr = dc$.pointer)), - tolerance = 1e-6 - ) - expect_equal( - c(loss_smooth_grad(alpha, X=data$X, Y=data$Y, epsilon=0.1, beta=3, lambda=0)), - c(lg_getgrad_c_dc(xs = alpha, dcptr = dc$.pointer)), - tolerance = 1e-6 - ) - # local - i <- 
sample.int(length(data$Y), 1) - data <- data_local(data$X, data$Y, data$X[i,], data$Y[[i]]) - alpha <- rnorm(ncol(data$X)) - # fn - expect_equal( - c(loss_smooth(alpha, X=data$X, Y=data$Y, epsilon=0.03, lambda=0)), - c(loss_smooth_c(alpha, data=data$X, response=data$Y, beta=3, epsilon=0.03)), - tolerance = 1e-6 - ) - expect_equal( - c(loss_smooth_grad(alpha, X=data$X, Y=data$Y, epsilon=0.03, lambda=0)), - c(loss_smooth_grad_c(alpha, data=data$X, response=data$Y, beta=3, epsilon=0.03)), - tolerance = 1e-6 - ) - # dc - dc <- new(DataContainer, data = data$X, response = data$Y, beta = 3, epsilon = 0.03) - expect_equal( - c(loss_smooth(alpha, X=data$X, Y=data$Y, epsilon=0.03, beta=3, lambda=0)), - c(loss_smooth_c_dc(xs=alpha, dcptr = dc$.pointer)), - tolerance = 1e-6 - ) - expect_equal( - c(loss_smooth_grad(alpha, X=data$X, Y=data$Y, epsilon=0.03, beta=3, lambda=0)), - c(loss_smooth_grad_c_dc(xs=alpha, dcptr = dc$.pointer)), - tolerance = 1e-6 - ) - # ptr - expect_equal( - c(loss_smooth(alpha, X=data$X, Y=data$Y, epsilon=0.03, beta=3, lambda=0)), - c(lg_combined_smooth_c_dc(xs = alpha, dcptr = dc$.pointer)), - tolerance = 1e-6 - ) - expect_equal( - c(loss_smooth_grad(alpha, X=data$X, Y=data$Y, epsilon=0.03, beta=3, lambda=0)), - c(lg_getgrad_c_dc(xs = alpha, dcptr = dc$.pointer)), - tolerance = 1e-6 - ) - } -}) - - -test_that("Compare owlqn_c to owlqn_r", { - for (j in 1:5) { - data <- data_create(100, 6, 2, 0.1, 0.3, 0.3) - alpha <- rnorm(ncol(data$X)) - expect_equal( - owlqn_c(alpha, data$X, data$Y, 0.1, 0, j)$value, - owlqn_r(alpha, data$X, data$Y, 0.1, 0, j)$value, - tolerance = 1e-6 - ) - expect_equal( - owlqn_c(alpha, data$X, data$Y, 0.1, 2, j)$value, - owlqn_r(alpha, data$X, data$Y, 0.1, 2, j)$value, - tolerance = 1e-6 - ) - } -}) - - +context("Tests for comparing R/C++ calculations") +source("setup.R") + +test_that("Compare loss_smooth_grad to numerical gradient approximation", { + if (require(numDeriv)) { + for (j in 1:8) { + data <- data_create(100, 6, 2, 0.1, 0.3, 0.3) + alpha <- rnorm(ncol(data$X)) + expect_equal( + c(loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0)), + grad(loss_smooth, alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0), + tolerance = 1e-3 + ) + expect_equal( + c(loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0.1)), + grad(loss_smooth, alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0.1), + tolerance = 1e-3 + ) + alpha <- rnorm(ncol(data$X)) + expect_equal( + c(loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.03, beta = 3, lambda1 = 0)), + grad(loss_smooth, alpha, X = data$X, Y = data$Y, epsilon = 0.03, beta = 3, lambda1 = 0), + tolerance = 1e-3 + ) + expect_equal( + c(loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.03, beta = 3, lambda1 = 0.1)), + grad(loss_smooth, alpha, X = data$X, Y = data$Y, epsilon = 0.03, beta = 3, lambda1 = 0.1), + tolerance = 1e-3 + ) + } + } +}) + + +test_that("Compare loss_smooth and loss_smooth_grad to cpp", { + for (j in 1:8) { + # global + data <- data_create(100, 6, 2, 0.1, 0.3, 0.3) + alpha <- rnorm(ncol(data$X)) + # fn + expect_equal( + c(loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0)), + c(loss_smooth_c(alpha, data = data$X, response = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = numeric(0))), + tolerance = 1e-4 + ) + expect_equal( + c(loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0)), + 
c(loss_smooth_grad_c(alpha, data = data$X, response = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = numeric(0))), + tolerance = 1e-4 + ) + # dc + dc <- data_container(X = data$X, Y = data$Y, epsilon = 0.1, beta = 3) + expect_equal( + c(loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0)), + c(loss_smooth_grad_c_dc(xs = alpha, dcptr = dc$.pointer)), + tolerance = 1e-4 + ) + expect_equal( + c(loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0)), + c(loss_smooth_c_dc(xs = alpha, dcptr = dc$.pointer)), + tolerance = 1e-4 + ) + # combined + expect_equal( + c(loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0)), + c(lg_combined_smooth_c_dc(xs = alpha, dcptr = dc$.pointer)), + tolerance = 1e-4 + ) + expect_equal( + c(loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0)), + c(lg_getgrad_c_dc(xs = alpha, dcptr = dc$.pointer)), + tolerance = 1e-4 + ) + # local + alpha <- rnorm(ncol(data$X)) + # fn + expect_equal( + c(loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.03, beta = 3, lambda1 = 0)), + c(loss_smooth_c(alpha, data = data$X, response = data$Y, epsilon = 0.03, beta = 3, lambda1 = 0, lambda2 = 0, weight = numeric(0))), + tolerance = 1e-4 + ) + expect_equal( + c(loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.03, beta = 3, lambda1 = 0)), + c(loss_smooth_grad_c(alpha, data = data$X, response = data$Y, epsilon = 0.03, beta = 3, lambda1 = 0, lambda2 = 0, weight = numeric(0))), + tolerance = 1e-4 + ) + # dc + dc <- data_container(X = data$X, Y = data$Y, epsilon = 0.03, beta = 3) + expect_equal( + c(loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.03, beta = 3, lambda1 = 0)), + c(loss_smooth_c_dc(xs = alpha, dcptr = dc$.pointer)), + tolerance = 1e-4 + ) + expect_equal( + c(loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.03, beta = 3, lambda1 = 0)), + c(loss_smooth_grad_c_dc(xs = alpha, dcptr = dc$.pointer)), + tolerance = 1e-4 + ) + # ptr + expect_equal( + c(loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.03, beta = 3, lambda1 = 0)), + c(lg_combined_smooth_c_dc(xs = alpha, dcptr = dc$.pointer)), + tolerance = 1e-4 + ) + expect_equal( + c(loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.03, beta = 3, lambda1 = 0)), + c(lg_getgrad_c_dc(xs = alpha, dcptr = dc$.pointer)), + tolerance = 1e-4 + ) + } +}) + + +test_that("Compare owlqn_c to owlqn_r", { + for (j in 1:5) { + data <- data_create(100, 6, 2, 0.1, 0.3, 0.3) + alpha <- rnorm(ncol(data$X)) + dc <- data_container(X = data$X, Y = data$Y, epsilon = 0.1, beta = j) + expect_equal( + owlqn_c(alpha, dc, 0.0)$value, + owlqn_r(alpha, data$X, data$Y, 0.1, j, lambda1 = 0)$value, + tolerance = 1e-4 + ) + expect_equal( + owlqn_c(alpha, dc, 2.0)$value, + owlqn_r(alpha, data$X, data$Y, 0.1, j, lambda1 = 2)$value, + tolerance = 1e-4 + ) + } +}) + +test_that("Smooth equals sharp at infinity", { + for (j in 1:5) { + data <- data_create(100, 6, 2, 0.1, 0.3, 0.3) + alpha <- rnorm(ncol(data$X)) + weight <- runif(100) + expect_equivalent( + loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 1e10), + loss_sharp(alpha, X = data$X, Y = data$Y, epsilon = 0.1), + tolerance = 1e-3 + ) + expect_equivalent( + loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 1e10, weight = NULL, lambda1 = 1, lambda2 = 1), + loss_sharp(alpha, X = data$X, Y = data$Y, epsilon = 0.1, weight = NULL, lambda1 = 1, lambda2 = 1), + tolerance = 1e-3 + ) + expect_equivalent( + 
loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.03, beta = 1e10, weight = NULL, lambda1 = 1, lambda2 = 1), + loss_sharp(alpha, X = data$X, Y = data$Y, epsilon = 0.03, weight = NULL, lambda1 = 1, lambda2 = 1), + tolerance = 1e-3 + ) + expect_equivalent( + loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.03, beta = 1e10, weight = weight, lambda1 = 1, lambda2 = 1), + loss_sharp(alpha, X = data$X, Y = data$Y, epsilon = 0.03, weight = weight, lambda1 = 1, lambda2 = 1), + tolerance = 1e-3 + ) + } +}) + +test_that("Residual versions are identical", { + for (j in 1:5) { + data <- data_create(100, 6, 2, 0.1, 0.3, 0.3) + alpha <- rnorm(ncol(data$X)) + res2 <- (data$X %*% alpha - data$Y)^2 + eps <- 0.1 + eps2 <- eps^2 + weight <- runif(100) + expect_equivalent( + loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = eps, beta = 3), + loss_smooth_res(alpha, res2, epsilon2 = eps2, beta = 3), + tolerance = 1e-3 + ) + expect_equivalent( + loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = eps, beta = 3, lambda1 = 1, lambda2 = 1), + loss_smooth_res(alpha, res2, epsilon2 = eps2, beta = 3, lambda1 = 1, lambda2 = 1), + tolerance = 1e-3 + ) + expect_equivalent( + loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = eps, beta = 3, weight = weight, lambda1 = 1, lambda2 = 1), + loss_smooth_res(alpha, res2, epsilon2 = eps2, beta = 3, weight = weight, lambda1 = 1, lambda2 = 1), + tolerance = 1e-3 + ) + expect_equivalent( + loss_sharp(alpha, X = data$X, Y = data$Y, epsilon = eps), + loss_sharp_res(alpha, res2, epsilon2 = eps2), + tolerance = 1e-3 + ) + expect_equivalent( + loss_sharp(alpha, X = data$X, Y = data$Y, epsilon = eps, lambda1 = 1, lambda2 = 1), + loss_sharp_res(alpha, res2, epsilon2 = eps2, lambda1 = 1, lambda2 = 1), + tolerance = 1e-3 + ) + expect_equivalent( + loss_sharp(alpha, X = data$X, Y = data$Y, epsilon = eps, weight = weight, lambda1 = 1, lambda2 = 1), + loss_sharp_res(alpha, res2, epsilon2 = eps2, weight = weight, lambda1 = 1, lambda2 = 1), + tolerance = 1e-3 + ) + } +}) \ No newline at end of file diff --git a/tests/testthat/test_plot.R b/tests/testthat/test_plot.R new file mode 100644 index 0000000..d28bd39 --- /dev/null +++ b/tests/testthat/test_plot.R @@ -0,0 +1,97 @@ +context("Tests for plotting and printing") +source("setup.R") + +.test_plot <- function(expr, ...) 
{ + pdf(NULL) + tryCatch(plot(expr, ...), finally = { dev.off() }) + expect_true(TRUE) +} + +test_that("Check print", { + for (i in c(5, 21)) { + data <- data_create(100, i, 1) + expl <- slise.fit(data$X, data$Y, 0.1) + cap <- capture.output(print(expl))[[1]] + expect_equal(cap, "SLISE Regression:") + expl <- slise.fit(data$X, data$Y, 0.1, normalise = TRUE) + cap <- capture.output(print(expl))[[1]] + expect_equal(cap, "SLISE Regression:") + expl <- slise.explain(data$X, data$Y, 0.2, 3) + cap <- capture.output(print(expl))[[1]] + expect_equal(cap, "SLISE Explanation:") + } +}) + +test_that("Check plot 2D", { + x <- rnorm(50) + y <- rnorm(50) + expl <- slise.fit(x, y, 1) + .test_plot(expl) + expl <- slise.fit(x, y, 0.5, normalise = TRUE) + .test_plot(expl) + expl <- slise.explain(x, y, 1, 3) + .test_plot(expl) + expl <- slise.explain(x, y, 1, 3, normalise = TRUE) + .test_plot(expl) +}) + +test_that("Check plot dist", { + data <- data_create(100, 5, 1) + expl <- slise.fit(data$X, data$Y, 0.1) + .test_plot(expl, "dist") + expl <- slise.fit(data$X, data$Y, 0.1, normalise = TRUE) + .test_plot(expl, "dist") + expl <- slise.explain(data$X, data$Y, 0.2, 3) + .test_plot(expl, "dist") + expl <- slise.explain(data$X, data$Y, 0.2, 3, normalise = TRUE) + .test_plot(expl, "dist") +}) + +test_that("Check plot pred", { + data <- data_create(100, 5, 1) + expl <- slise.fit(data$X, data$Y, 0.1) + .test_plot(expl, "prediction") + expl <- slise.fit(data$X, data$Y, 0.1, normalise = TRUE) + .test_plot(expl, "prediction") + expl <- slise.explain(data$X, data$Y, 0.2, 3) + .test_plot(expl, "prediction") + expl <- slise.explain(data$X, data$Y, 0.2, 3, normalise = TRUE) + .test_plot(expl, "prediction") +}) + +test_that("Check plot bar", { + data <- data_create(100, 15, 2) + expl <- slise.fit(data$X, data$Y, 0.1) + .test_plot(expl, "bar") + expl <- slise.fit(data$X, data$Y, 0.1, normalise = TRUE) + .test_plot(expl, "bar") + expl <- slise.explain(data$X, data$Y, 0.2, 3) + .test_plot(expl, "bar") + expl <- slise.explain(data$X, data$Y, 0.2, 3, normalise = TRUE) + .test_plot(expl, "bar") +}) + +test_that("Check plot wordcloud", { + data <- data_create(100, 15, 2) + expl <- slise.fit(data$X, data$Y, 0.1) + .test_plot(expl, "wordcloud", local = FALSE) + .test_plot(expl, "wordcloud", local = TRUE) + expl <- slise.fit(data$X, data$Y, 0.1, normalise = TRUE) + .test_plot(expl, "wordcloud", local = FALSE) + .test_plot(expl, "wordcloud", local = TRUE) + expl <- slise.explain(data$X, data$Y, 0.2, 3) + .test_plot(expl, "wordcloud", local = FALSE) + .test_plot(expl, "wordcloud", local = TRUE) + expl <- slise.explain(data$X, data$Y, 0.2, 3, normalise = TRUE) + .test_plot(expl, "wordcloud", local = FALSE) + .test_plot(expl, "wordcloud", local = TRUE) +}) + +test_that("Check plot image", { + data <- data_create(100, 16, 2) + data$X <- data$X[, c(sapply(1:4, function(i) rep((i - 1) * 4 + rep(1:4, each = 4), 4) ))] + expl <- slise.explain(data$X, data$Y, 0.2, 3) + .test_plot(plot(expl, type = "mnist", plots = 1)) + .test_plot(plot(expl, type = "mnist", plots = 2)) + .test_plot(plot(expl, type = "mnist", plots = 3)) +}) diff --git a/tests/testthat/test_slise.R b/tests/testthat/test_slise.R index 8acfa24..b37ce30 100644 --- a/tests/testthat/test_slise.R +++ b/tests/testthat/test_slise.R @@ -1,66 +1,76 @@ -context("Tests for the slise algorithm") -source("setup.R") - -test_that("Check SLISE", { - for (i in c(rep(c(4, 8, 16), 2))) { - data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3) - slise1 <- slise.fit(data$X, data$Y, 
diff --git a/tests/testthat/test_slise.R b/tests/testthat/test_slise.R
index 8acfa24..b37ce30 100644
--- a/tests/testthat/test_slise.R
+++ b/tests/testthat/test_slise.R
@@ -1,66 +1,76 @@
-context("Tests for the slise algorithm")
-source("setup.R")
-
-test_that("Check SLISE", {
-    for (i in c(rep(c(4, 8, 16), 2))) {
-        data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3)
-        slise1 <- slise.fit(data$X, data$Y, epsilon=0.1, lambda=0)
-        slise2 <- slise.fit(data$X, data$Y, epsilon=0.1, lambda=0.1)
-        data2 <- slise1$scaled
-        ols <- .lm.fit(data2$X, data2$Y)$coefficients
-        slise1_loss <- slise1$value
-        slise2_loss <- slise2$value
-        ols1_loss <- loss_sharp(ols, data2$X, data2$Y, epsilon=0.1, lambda=0)
-        ols2_loss <- loss_sharp(ols, data2$X, data2$Y, epsilon=0.1, lambda=0.1)
-        expect_lt(slise1_loss, ols1_loss)
-        expect_lt(slise2_loss, ols2_loss)
-        x <- rnorm(length(data$alpha)-1)
-        y <- sum(x * data$alpha[-1]) + data$alpha[[1]]
-        expl1_alpha <- slise.explain(data$X, data$Y, x, y, 0.03, 0)$coefficients
-        expl2_alpha <- slise.explain(data$X, data$Y, x, y, 0.03, 0.1)$coefficients
-        expect_equal(sum(expl1_alpha[-1] * x) + expl1_alpha[[1]], y)
-        expect_equal(sum(expl2_alpha[-1] * x) + expl2_alpha[[1]], y)
-    }
-})
-
-test_that("Check SLISE find", {
-    for (i in c(rep(c(4, 8), 2))) {
-        data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3)
-        x <- data$X[1,]
-        y <- data$clean[1]
-        expl1_alpha <- slise.explain_find(data$X, data$Y, x, y, 0.03, lambda=0, variables=i/2)$coefficients
-        expl2_alpha <- slise.explain_find(data$X, data$Y, x, y, 0.03, lambda=0.1, variables=i/2)$coefficients
-        expect_equal(sum(expl1_alpha[-1] * x) + expl1_alpha[[1]], y)
-        expect_equal(sum(expl2_alpha[-1] * x) + expl2_alpha[[1]], y)
-    }
-})
-
-test_that("Check SLISE comb", {
-    for (i in c(rep(5, 3))) {
-        data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3)
-        x <- data$X[1,]
-        y <- data$clean[1]
-        expl1 <- slise.explain_comb(data$X, data$Y, x, y, 0.03, variables=3)
-        expl2 <- slise.explain_comb(data$X, data$Y, x, y, 0.10, variables=3)
-        expect_equal(predict(expl1, x), y)
-        expect_equal(predict(expl2, x), y)
-        expect_gte(3, sparsity(expl1$alpha[-1]))
-        expect_gte(3, sparsity(expl2$alpha[-1]))
-    }
-})
-
-test_that("Check SLISE predict", {
-    for (i in c(rep(c(4, 8), 2))) {
-        data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3)
-        x <- data$X[1,]
-        y <- data$clean[1]
-        expl1 <- slise.explain(data$X, data$Y, x, y, scale=TRUE)
-        expl2 <- slise.explain(data$X, sigmoid(data$Y), x, sigmoid(y), logit=TRUE)
-        expl3 <- slise.fit(data$X, data$Y, lambda=0.1)
-        Y1 <- predict(expl1, data$X)
-        Y2 <- predict(expl2, data$X)
-        Y3 <- predict(expl3, data$X)
-        expect_equal(Y1[1], y)
-        expect_equal(Y2[1], sigmoid(y))
-    }
-})
+context("Tests for the slise algorithm")
+source("setup.R")
+
+test_that("Check SLISE", {
+    for (i in c(rep(c(4, 8, 16), 2))) {
+        data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3)
+        slise1 <- slise.fit(data$X, data$Y, epsilon = 0.1, lambda1 = 0)
+        slise2 <- slise.fit(data$X, data$Y, epsilon = 0.1, lambda1 = 0.1)
+        ols <- .lm.fit(data$X, data$Y)$coefficients
+        slise1_loss <- slise1$value
+        slise2_loss <- slise2$value
+        ols1_loss <- loss_sharp(ols, data$X, data$Y, epsilon = 0.1, lambda1 = 0)
+        ols2_loss <- loss_sharp(ols, data$X, data$Y, epsilon = 0.1, lambda1 = 0.1)
+        expect_lt(slise1_loss, ols1_loss)
+        expect_lt(slise2_loss, ols2_loss)
+        x <- rnorm(length(data$alpha) - 1)
+        y <- sum(x * data$alpha[-1]) + data$alpha[[1]]
+        expl1_alpha <- slise.explain(data$X, data$Y, 0.03, x, y, 0)$coefficients
+        expl2_alpha <- slise.explain(data$X, data$Y, 0.03, x, y, 0.1)$coefficients
+        expect_equal(sum(expl1_alpha[-1] * x) + expl1_alpha[[1]], y)
+        expect_equal(sum(expl2_alpha[-1] * x) + expl2_alpha[[1]], y)
+    }
+})
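The invariant asserted in both the old and new "Check SLISE" tests is that an explanation's local model passes exactly through the explained point (x, y). Isolated into a sketch using the same helpers:

    # slise.explain constrains the local plane to (x, y), so reconstructing
    # the prediction at x must give back y (up to floating point).
    data <- data_create(240, 8, 4, 0.03, 0.3, 0.3)
    x <- rnorm(8)
    y <- sum(x * data$alpha[-1]) + data$alpha[[1]]
    a <- slise.explain(data$X, data$Y, 0.03, x, y)$coefficients
    stopifnot(isTRUE(all.equal(sum(a[-1] * x) + a[[1]], y)))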
+
+test_that("Check SLISE find", {
+    for (i in c(rep(c(4, 8), 2))) {
+        data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3)
+        x <- data$X[1, ]
+        y <- data$clean[1]
+        expl1_alpha <- slise.explain_find(data$X, data$Y, 0.03, x, y, lambda1 = 0, variables = i / 2)$coefficients
+        expl2_alpha <- slise.explain_find(data$X, data$Y, 0.03, x, y, lambda1 = 0.1, variables = i / 2)$coefficients
+        expect_equal(sum(expl1_alpha[-1] * x) + expl1_alpha[[1]], y)
+        expect_equal(sum(expl2_alpha[-1] * x) + expl2_alpha[[1]], y)
+    }
+})
+
+test_that("Check SLISE comb", {
+    for (i in c(rep(5, 3))) {
+        data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3)
+        x <- data$X[1, ]
+        y <- data$clean[1]
+        expl1 <- slise.explain_comb(data$X, data$Y, 0.03, x, y, variables = 3)
+        expl2 <- slise.explain_comb(data$X, data$Y, 0.10, x, y, variables = 3)
+        expect_equal(predict(expl1, x)[[1]], y)
+        expect_equal(predict(expl2, x)[[1]], y)
+        expect_gte(3, sparsity(expl1$alpha[-1]))
+        expect_gte(3, sparsity(expl2$alpha[-1]))
+    }
+})
+
+test_that("Check SLISE predict", {
+    for (i in c(rep(c(4, 8), 2))) {
+        data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3)
+        Y2 <- data$Y - min(data$Y)
+        Y2 <- Y2 / max(Y2)
+        expl1 <- slise.explain(data$X, data$Y, 0.2, data$X[1, ], data$clean[1])
+        expl2 <- slise.explain(data$X, Y2, 0.3, data$X[1, ], Y2[1], logit = TRUE)
+        expl3 <- slise.explain(data$X, Y2, 0.3, 2, logit = TRUE, normalise = TRUE)
+        fit1 <- slise.fit(data$X, data$Y, 0.1, 0.1)
+        Y1 <- predict(expl1, data$X)
+        Y2b <- predict(expl2, data$X) # only checked for running without error
+        Y2c <- predict(expl3, data$X[2, ])
+        Y3 <- predict(fit1, data$X) # only checked for running without error
+        expect_equal(Y1[1], data$clean[1])
+        expect_equal(Y2[2], c(Y2c))
+    }
+})
+
+test_that("Check SLISE unscale", {
+    for (i in c(rep(c(4, 8, 16), 2))) {
+        data <- data_create(i * 30, i, floor(i * 0.5), 0.03, 0.3, 0.3)
+        slise1 <- slise.fit(data$X, data$Y, epsilon = 0.1, lambda1 = 0, normalise = TRUE)
+        slise2 <- slise.fit(data$X, data$Y, epsilon = slise1$epsilon, lambda1 = 0)
+        expect_equal(slise1$coefficients, slise2$coefficients, tolerance = 0.3)
+    }
+})
\ No newline at end of file
diff --git a/tests/testthat/test_weights.R b/tests/testthat/test_weights.R
new file mode 100644
index 0000000..286de7d
--- /dev/null
+++ b/tests/testthat/test_weights.R
@@ -0,0 +1,331 @@
+context("Tests for checking that the weights are implemented correctly")
+source("setup.R")
+
+
+test_that("Check simple losses", {
+    for (j in 1:8) {
+        mask <- runif(100) > 0.5
+        data <- data_create(100, 6, 2, 0.1, 0.3, 0.3)
+        data2 <- list(X = rbind(data$X, data$X), Y = c(data$Y, data$Y))
+        data3 <- list(X = rbind(data$X, data$X[mask, ]), Y = c(data$Y, data$Y[mask]))
+        alpha <- rnorm(ncol(data$X))
+        w1 <- rep(1, 100)
+        w2 <- rep(2, 100)
+        w3 <- mask + 1
+        # Simple losses
+        expect_equivalent(
+            loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = w1),
+            loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3),
+            tolerance = 1e-3
+        )
+        expect_equivalent(
+            loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = w2),
+            loss_smooth(alpha, X = data2$X, Y = data2$Y, epsilon = 0.1, beta = 3),
+            tolerance = 1e-3
+        )
+        expect_equivalent(
+            loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = w3),
+            loss_smooth(alpha, X = data3$X, Y = data3$Y, epsilon = 0.1, beta = 3),
+            tolerance = 1e-3
+        )
+    }
+})
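The convention this whole file pins down: a weight of w on a row must behave exactly like repeating that row w times. Condensed to one check (same functions and helper as above):

    data <- data_create(100, 6, 2, 0.1, 0.3, 0.3)
    alpha <- rnorm(ncol(data$X))
    doubled <- list(X = rbind(data$X, data$X), Y = c(data$Y, data$Y))
    all.equal(
        c(loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = rep(2, 100))),
        c(loss_smooth(alpha, X = doubled$X, Y = doubled$Y, epsilon = 0.1, beta = 3)),
        tolerance = 1e-3
    )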
+
+test_that("Check derivatives numerically", {
+    if (require(numDeriv)) {
+        for (j in 1:8) {
+            mask <- runif(100) > 0.5
+            data <- data_create(100, 6, 2, 0.1, 0.3, 0.3)
+            data2 <- list(X = rbind(data$X, data$X), Y = c(data$Y, data$Y))
+            data3 <- list(X = rbind(data$X, data$X[mask, ]), Y = c(data$Y, data$Y[mask]))
+            alpha <- rnorm(ncol(data$X))
+            w1 <- rep(1, 100)
+            w2 <- rep(2, 100)
+            wr <- runif(100)
+            w3 <- mask + 1
+            # Numerical differentiation
+            expect_equivalent(
+                loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3),
+                grad(loss_smooth, alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = w1),
+                tolerance = 1e-3
+            )
+            expect_equivalent(
+                loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = w2),
+                grad(loss_smooth, alpha, X = data2$X, Y = data2$Y, epsilon = 0.1, beta = 3),
+                tolerance = 1e-3
+            )
+            expect_equivalent(
+                loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = wr),
+                grad(loss_smooth, alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = wr),
+                tolerance = 1e-3
+            )
+            expect_equivalent(
+                loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = w3),
+                grad(loss_smooth, alpha, X = data3$X, Y = data3$Y, epsilon = 0.1, beta = 3),
+                tolerance = 1e-3
+            )
+        }
+    }
+})
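The technique above, comparing an analytic gradient against numDeriv::grad on the same objective, generalises beyond SLISE; a self-contained version, assuming only that numDeriv is installed:

    # Finite-difference check of a hand-derived gradient (least squares here).
    if (requireNamespace("numDeriv", quietly = TRUE)) {
        X <- matrix(rnorm(60), 10, 6)
        Y <- rnorm(10)
        f <- function(a) sum((X %*% a - Y)^2)          # objective
        g <- function(a) c(2 * t(X) %*% (X %*% a - Y)) # its analytic gradient
        a0 <- rnorm(6)
        print(all.equal(g(a0), numDeriv::grad(f, a0), tolerance = 1e-6))
    }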
+
+test_that("Check that the R and C++ versions are identical", {
+    for (j in 1:8) {
+        mask <- runif(100) > 0.5
+        data <- data_create(100, 6, 2, 0.1, 0.3, 0.3)
+        data2 <- list(X = rbind(data$X, data$X), Y = c(data$Y, data$Y))
+        data3 <- list(X = rbind(data$X, data$X[mask, ]), Y = c(data$Y, data$Y[mask]))
+        alpha <- rnorm(ncol(data$X))
+        w1 <- rep(1, 100)
+        w2 <- rep(2, 100)
+        wr <- runif(100)
+        w3 <- mask + 1
+        # R to C++
+        expect_equal(
+            c(loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3)),
+            c(loss_smooth_c(alpha, data = data$X, response = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = w1)),
+            tolerance = 1e-4
+        )
+        expect_equal(
+            c(loss_smooth(alpha, X = data2$X, Y = data2$Y, epsilon = 0.1, beta = 3)),
+            c(loss_smooth_c(alpha, data = data$X, response = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = w2)),
+            tolerance = 1e-4
+        )
+        expect_equal(
+            c(loss_smooth(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = wr)),
+            c(loss_smooth_c(alpha, data = data$X, response = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = wr)),
+            tolerance = 1e-4
+        )
+        expect_equal(
+            c(loss_smooth_grad(alpha, X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = wr)),
+            c(loss_smooth_grad_c(alpha, data = data$X, response = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = wr)),
+            tolerance = 1e-4
+        )
+        expect_equal(
+            c(loss_smooth(alpha, X = data3$X, Y = data3$Y, epsilon = 0.1, beta = 3)),
+            c(loss_smooth_c(alpha, data = data$X, response = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = w3)),
+            tolerance = 1e-4
+        )
+        expect_equal(
+            c(loss_smooth_grad(alpha, X = data3$X, Y = data3$Y, epsilon = 0.1, beta = 3)),
+            c(loss_smooth_grad_c(alpha, data = data$X, response = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = w3)),
+            tolerance = 1e-4
+        )
+        # Combined
+        dc <- data_container(X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = w1)
+        expect_equal(
+            c(loss_smooth_c(alpha, data = data$X, response = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = numeric(0))),
+            c(lg_combined_smooth_c_dc(xs = alpha, dcptr = dc$.pointer)),
+            tolerance = 1e-4
+        )
+        expect_equal(
+            c(loss_smooth_grad_c(alpha, data = data$X, response = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = numeric(0))),
+            c(lg_getgrad_c_dc(xs = alpha, dcptr = dc$.pointer)),
+            tolerance = 1e-4
+        )
+        dc$setWeight(w2)
+        expect_equal(
+            c(loss_smooth_c(alpha, data = data2$X, response = data2$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = numeric(0))),
+            c(lg_combined_smooth_c_dc(xs = alpha, dcptr = dc$.pointer)),
+            tolerance = 1e-4
+        )
+        expect_equal(
+            c(loss_smooth_grad_c(alpha, data = data2$X, response = data2$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = numeric(0))),
+            c(lg_getgrad_c_dc(xs = alpha, dcptr = dc$.pointer)),
+            tolerance = 1e-4
+        )
+        dc$setWeight(wr)
+        expect_equal(
+            c(loss_smooth_c(alpha, data = data$X, response = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = wr)),
+            c(lg_combined_smooth_c_dc(xs = alpha, dcptr = dc$.pointer)),
+            tolerance = 1e-4
+        )
+        expect_equal(
+            c(loss_smooth_grad_c(alpha, data = data$X, response = data$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = wr)),
+            c(lg_getgrad_c_dc(xs = alpha, dcptr = dc$.pointer)),
+            tolerance = 1e-4
+        )
+        dc$setWeight(w3)
+        expect_equal(
+            c(loss_smooth_c(alpha, data = data3$X, response = data3$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = numeric(0))),
+            c(lg_combined_smooth_c_dc(xs = alpha, dcptr = dc$.pointer)),
+            tolerance = 1e-4
+        )
+        expect_equal(
+            c(loss_smooth_grad_c(alpha, data = data3$X, response = data3$Y, epsilon = 0.1, beta = 3, lambda1 = 0, lambda2 = 0, weight = numeric(0))),
+            c(lg_getgrad_c_dc(xs = alpha, dcptr = dc$.pointer)),
+            tolerance = 1e-4
+        )
+    }
+})
+
+test_that("Check that OWL-QN still works", {
+    for (j in 1:8) {
+        mask <- runif(100) > 0.5
+        data <- data_create(100, 6, 2, 0.1, 0.3, 0.3)
+        data2 <- list(X = rbind(data$X, data$X), Y = c(data$Y, data$Y))
+        data3 <- list(X = rbind(data$X, data$X[mask, ]), Y = c(data$Y, data$Y[mask]))
+        alpha <- rnorm(ncol(data$X))
+        w1 <- rep(1, 100)
+        w2 <- rep(2, 100)
+        wr <- runif(100)
+        w3 <- mask + 1
+        # OWL-QN
+        dc <- data_container(X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = wr)
+        expect_equal(
+            owlqn_c(alpha, dc)$value,
+            owlqn_r(alpha, data$X, data$Y, 0.1, 3, weight = wr)$value,
+            tolerance = 1e-4
+        )
+        dc$setWeight(w1)
+        expect_equal(
+            owlqn_c(alpha, dc)$value,
+            owlqn_r(alpha, data$X, data$Y, 0.1, 3)$value,
+            tolerance = 1e-4
+        )
+        dc$setWeight(w2)
+        expect_equal(
+            owlqn_c(alpha, dc)$value,
+            owlqn_r(alpha, data2$X, data2$Y, 0.1, 3)$value,
+            tolerance = 1e-4
+        )
+        dc$setWeight(w3)
+        expect_equal(
+            owlqn_c(alpha, dc)$value,
+            owlqn_r(alpha, data3$X, data3$Y, 0.1, 3)$value,
+            tolerance = 1e-4
+        )
+    }
+})
+
+test_that("Check log_approx_ratio", {
+    for (j in 1:8) {
+        mask <- runif(100) > 0.5
+        data <- data_create(100, 6, 2, 0.1, 0.3, 0.3)
+        data2 <- list(X = rbind(data$X, data$X), Y = c(data$Y, data$Y))
+        data3 <- list(X = rbind(data$X, data$X[mask, ]), Y = c(data$Y, data$Y[mask]))
+        alpha <- rnorm(ncol(data$X))
+        w1 <- rep(1, 100)
+        w2 <- rep(2, 100)
+        w3 <- mask + 1
+        # log_approximation ratio
+        expect_equivalent(
+            log_approximation_ratio((data$X %*% alpha - data$Y)^2, 0.01, 1, 4, w1),
+            log_approximation_ratio((data$X %*% alpha - data$Y)^2, 0.01, 1, 4),
+            tolerance = 1e-4
+        )
+        expect_equivalent(
+            log_approximation_ratio((data$X %*% alpha - data$Y)^2, 0.01, 1, 4, w2),
+            log_approximation_ratio((data2$X %*% alpha - data2$Y)^2, 0.01, 1, 4),
+            tolerance = 1e-4
+        )
+        expect_equivalent(
+            log_approximation_ratio((data$X %*% alpha - data$Y)^2, 0.01, 1, 4, w3),
+            log_approximation_ratio((data3$X %*% alpha - data3$Y)^2, 0.01, 1, 4),
+            tolerance = 1e-4
+        )
+    }
+})
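A detail worth noting in the C++ tests: data_container keeps X, Y, and the loss parameters on the native side, so changing weights is an in-place update rather than a full re-copy. The pattern exactly as the suite uses it:

    # Build the container once, then swap weights via setWeight(); the lg_*
    # entry points read everything through the external pointer.
    data <- data_create(100, 6, 2, 0.1, 0.3, 0.3)
    alpha <- rnorm(ncol(data$X))
    dc <- data_container(X = data$X, Y = data$Y, epsilon = 0.1, beta = 3, weight = rep(1, 100))
    v1 <- lg_combined_smooth_c_dc(xs = alpha, dcptr = dc$.pointer)
    dc$setWeight(runif(100)) # reweight without rebuilding the container
    v2 <- lg_combined_smooth_c_dc(xs = alpha, dcptr = dc$.pointer)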
+
+test_that("Check matching_epsilon", {
+    for (j in 1:8) {
+        mask <- runif(100) > 0.5
+        data <- data_create(100, 6, 2, 0.1, 0.3, 0.3)
+        data2 <- list(X = rbind(data$X, data$X), Y = c(data$Y, data$Y))
+        data3 <- list(X = rbind(data$X, data$X[mask, ]), Y = c(data$Y, data$Y[mask]))
+        alpha <- rnorm(ncol(data$X))
+        w1 <- rep(1, 100)
+        w2 <- rep(2, 100)
+        w3 <- mask + 1
+        # matching epsilon
+        expect_equivalent(
+            matching_epsilon((data$X %*% alpha - data$Y)^2, 0.04, 4, w1),
+            matching_epsilon((data$X %*% alpha - data$Y)^2, 0.04, 4),
+            tolerance = 1e-4
+        )
+        expect_equivalent(
+            matching_epsilon((data$X %*% alpha - data$Y)^2, 0.03, 2, w2),
+            matching_epsilon((data2$X %*% alpha - data2$Y)^2, 0.03, 2),
+            tolerance = 1e-4
+        )
+        expect_equivalent(
+            matching_epsilon((data$X %*% alpha - data$Y)^2, 0.05, 3, w3),
+            matching_epsilon((data3$X %*% alpha - data3$Y)^2, 0.05, 3),
+            tolerance = 1e-4
+        )
+    }
+})
+
+test_that("Check the next beta", {
+    for (j in 1:8) {
+        mask <- runif(100) > 0.5
+        data <- data_create(100, 6, 2, 0.1, 0.3, 0.3)
+        data2 <- list(X = rbind(data$X, data$X), Y = c(data$Y, data$Y))
+        data3 <- list(X = rbind(data$X, data$X[mask, ]), Y = c(data$Y, data$Y[mask]))
+        alpha <- rnorm(ncol(data$X))
+        w1 <- rep(1, 100)
+        w2 <- rep(2, 100)
+        w3 <- mask + 1
+        # next beta
+        expect_equivalent(
+            next_beta((data$X %*% alpha - data$Y)^2, 0.04, 4, w1),
+            next_beta((data$X %*% alpha - data$Y)^2, 0.04, 4),
+            tolerance = 1e-4
+        )
+        expect_equivalent(
+            next_beta((data$X %*% alpha - data$Y)^2, 0.03, 2, w2),
+            next_beta((data2$X %*% alpha - data2$Y)^2, 0.03, 2),
+            tolerance = 1e-4
+        )
+        expect_equivalent(
+            next_beta((data$X %*% alpha - data$Y)^2, 0.05, 3, w3),
+            next_beta((data3$X %*% alpha - data3$Y)^2, 0.05, 3),
+            tolerance = 1e-4
+        )
+    }
+})
+
+test_that("Check that the full SLISE algorithm works", {
+    for (j in 1:8) {
+        mask <- runif(200) > 0.5
+        data <- data_create(200, 8, 2, 0.1, 0.3, 0.3)
+        data2 <- list(X = rbind(data$X, data$X), Y = c(data$Y, data$Y))
+        data3 <- list(X = rbind(data$X, data$X[mask, ]), Y = c(data$Y, data$Y[mask]))
+        alpha <- rnorm(ncol(data$X))
+        w1 <- rep(1, 200)
+        w2 <- rep(2, 200)
+        w3 <- mask + 1
+        init1 <- list(rep(0, ncol(data$X) + 1), 0)
+        init2 <- list(rep(0, ncol(data$X)), 0)
+        # test SLISE with weights
+        expect_equivalent(
+            slise.fit(data$X, data$Y, 0.1, initialisation = init1)$loss,
+            slise.fit(data$X, data$Y, 0.1, weight = w1, initialisation = init1)$loss,
+            tolerance = 1e-4
+        )
+        expect_equivalent(
+            slise.fit(data2$X, data2$Y, 0.1, initialisation = init1)$loss,
+            slise.fit(data$X, data$Y, 0.1, weight = w2, initialisation = init1)$loss,
+            tolerance = 1e-4
+        )
+        expect_equivalent(
+            slise.fit(data3$X, data3$Y, 0.1, initialisation = init1)$loss,
+            slise.fit(data$X, data$Y, 0.1, weight = w3, initialisation = init1)$loss,
+            tolerance = 1e-4
+        )
+        expect_equivalent(
+            slise.explain(data$X, data$Y, 0.1, data$X[1, ], data$Y[1], initialisation = init2)$loss,
+            slise.explain(data$X, data$Y, 0.1, data$X[1, ], data$Y[1], weight = w1, initialisation = init2)$loss,
+            tolerance = 1e-4
+        )
+        expect_equivalent(
+            slise.explain(data2$X, data2$Y, 0.1, data$X[2, ], data$Y[2], initialisation = init2)$loss,
+            slise.explain(data$X, data$Y, 0.1, data$X[2, ], data$Y[2], weight = w2, initialisation = init2)$loss,
+            tolerance = 1e-4
+        )
+        expect_equivalent(
+            slise.explain(data3$X, data3$Y, 0.1, data$X[4, ], data$Y[4], initialisation = init2)$loss,
+            slise.explain(data$X, data$Y, 0.1, data$X[4, ], data$Y[4], weight = w3, initialisation = init2)$loss,
+            tolerance = 1e-4
+        )
+    }
+})
\ No newline at end of file
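A note on the initialisation arguments in the last test: a fixed starting point makes the optimiser deterministic, which is what allows a weighted run and a duplicated-data run to be compared loss-to-loss. Judging from the calls, the list holds the starting coefficients (with an intercept for slise.fit) plus a starting beta; that reading is inferred from the test code, not a documented contract:

    # With a fixed initialisation, repeated fits agree (an assumption the
    # weighted-vs-duplicated comparisons above already rely on).
    data <- data_create(200, 8, 2, 0.1, 0.3, 0.3)
    init <- list(rep(0, ncol(data$X) + 1), 0)
    l1 <- slise.fit(data$X, data$Y, 0.1, initialisation = init)$loss
    l2 <- slise.fit(data$X, data$Y, 0.1, initialisation = init)$loss
    stopifnot(isTRUE(all.equal(l1, l2, tolerance = 1e-4)))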
diff --git a/vignettes/paper.pdf b/vignettes/bjorklund2019sparse.pdf
similarity index 100%
rename from vignettes/paper.pdf
rename to vignettes/bjorklund2019sparse.pdf
diff --git a/vignettes/paper.pdf.asis b/vignettes/bjorklund2019sparse.pdf.asis
similarity index 100%
rename from vignettes/paper.pdf.asis
rename to vignettes/bjorklund2019sparse.pdf.asis
diff --git a/vignettes/bjorklund2022robust.pdf b/vignettes/bjorklund2022robust.pdf
new file mode 100644
index 0000000..b5fbbf6
Binary files /dev/null and b/vignettes/bjorklund2022robust.pdf differ
diff --git a/vignettes/bjorklund2022robust.pdf.asis b/vignettes/bjorklund2022robust.pdf.asis
new file mode 100644
index 0000000..0bb3270
--- /dev/null
+++ b/vignettes/bjorklund2022robust.pdf.asis
@@ -0,0 +1,3 @@
+%\VignetteIndexEntry{Paper: Robust regression via error tolerance}
+%\VignetteEngine{R.rsp::asis}
+%\VignetteKeyword{paper}