Skip to content

Commit

Permalink
[SYSTEMDS-3681] Cleanup stepLM builtin function, remove duplicate
Browse files Browse the repository at this point in the history
  • Loading branch information
mboehm7 committed May 4, 2024
1 parent 976d4bd commit b1cb505
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 112 deletions.
93 changes: 0 additions & 93 deletions scripts/algorithms/StepLinearRegDS.dml

This file was deleted.

31 changes: 16 additions & 15 deletions scripts/builtin/steplm.dml
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,16 @@
#
# INPUT:
# ------------------------------------------------------------------------------------------
# X Location (on HDFS) to read the matrix X of feature vectors
# Y Location (on HDFS) to read the 1-column matrix Y of response values
# X Matrix X of feature vectors
# Y Single-column Matrix Y of response values
# icpt Intercept presence, shifting and rescaling the columns of X:
# 0 = no intercept, no shifting, no rescaling;
# 1 = add intercept, but neither shift nor rescale X;
# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
# reg learning rate
# reg Regularization parameter, 0 for no penalty
# tol Tolerance threshold to train until achieved
# maxi maximum iterations 0 means until tolerance is reached
# verbose If the algorithm should be verbose
# maxi Maximum number of iterations; 0 means iterate until the tolerance is reached
# verbose Indicator for verbose debug output
# ------------------------------------------------------------------------------------------
#
# OUTPUT:
Expand All @@ -67,7 +67,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
# start from one feature and iteratively add features as long as the AIC keeps improving
thr = 0.001;

if(verbose)
if(verbose)
print("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
X_orig = X;
n = nrow(X_orig);
Expand All @@ -76,10 +76,10 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
# BEGIN STEPWISE LINEAR REGRESSION
columns_fixed = matrix(0, 1, m_orig);
columns_fixed_ordered = matrix(0, 1, 1);

# X_global stores the best model found at each step
X_global = matrix(0, n, 1);

if (icpt == 1 | icpt == 2) {
beta = mean(y);
AIC_best_orig = 2 + n * log(sum((beta - y) ^ 2) / n);
Expand All @@ -88,7 +88,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
AIC_best_orig = n * log(sum(y ^ 2) / n);
}
if(verbose)
print("Best AIC without any features: " + AIC_best_orig);
print("Best AIC without any features: " + AIC_best_orig);
boa_ncol = ncol(X_orig) + as.integer(icpt!=0);
beta_out_all = matrix(0, boa_ncol, m_orig);

Expand All @@ -107,14 +107,14 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
beta_best = beta_out_all[, column_best];
if (column_best == 0) {
if(verbose)
print("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
print("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
B = matrix(0, m_orig, 1);
if (icpt != 0)
B = rbind(B, as.matrix(beta));
S = matrix(0, 1, 1);
}
else {
if(verbose)
if(verbose)
print("Best AIC " + AIC_best + " achieved with feature: " + column_best);

columns_fixed[1, column_best] = 1;
Expand Down Expand Up @@ -152,7 +152,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
if (as.scalar(columns_fixed[1, column_best]) == 0) {
# new best feature found
if(verbose)
print("Best AIC " + AIC_best + " achieved with feature: " + column_best);
print("Best AIC " + AIC_best + " achieved with feature: " + column_best);
columns_fixed[1, column_best] = 1;
columns_fixed_ordered = cbind(columns_fixed_ordered, as.matrix(column_best));
if (ncol(columns_fixed_ordered) == m_orig) {
Expand All @@ -168,7 +168,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
}
# run linear regression with selected set of features
if( verbose )
print("Running linear regression with selected features...");
print("Running linear regression with selected features...");
[AIC, beta_out] = linear_regression(X_global, y, icpt, reg, tol, maxi, verbose);
S = columns_fixed_ordered;
if (icpt != 0)
Expand All @@ -178,13 +178,13 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
}

# Computes linear regression using lm and outputs AIC.
linear_regression = function(Matrix[Double] X, Matrix[Double] y, Integer icpt,
linear_regression = function(Matrix[Double] X, Matrix[Double] y, Integer icpt,
Double reg, Double tol, Integer maxi, Boolean verbose)
return(Double AIC, Matrix[Double] beta)
{
# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
beta = lm(X = X, y = y, icpt = icpt, reg=reg, tol=tol, maxi=maxi, verbose=FALSE);

# PREPARE X for SCORING
if( icpt != 0 )
X = cbind(X, matrix(1,nrow(X),1))
Expand Down Expand Up @@ -224,3 +224,4 @@ reorder_matrix = function(
# Tests whether a candidate AIC is a sufficient improvement over the best AIC so far.
# Returns TRUE only if AIC_cur is strictly smaller than AIC_best and the decrease
# exceeds the relative threshold thr, measured against the magnitude of AIC_best.
checkAIC = function(Double AIC_cur, Double AIC_best, Double thr) return (Boolean R) {
  decrease = AIC_best - AIC_cur;
  required = abs(thr * AIC_best);
  is_lower = AIC_cur < AIC_best;
  R = is_lower & (decrease > required);
}

Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@

public class AlgorithmStepwiseRegression extends AutomatedTestBase
{
private final static String TEST_NAME1 = "Algorithm_Stepwise";
private final static String TEST_NAME1 = "Algorithm_StepLM";
private final static String TEST_NAME2 = "Algorithm_StepGLM";
private final static String TEST_DIR = "functions/codegenalg/";
private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmStepwiseRegression.class.getSimpleName() + "/";

Expand All @@ -58,6 +59,7 @@ public enum StepwiseType {
public void setUp() {
	// Reset assertion bookkeeping, then register one configuration per test script;
	// every configuration declares the single output "w".
	TestUtils.clearAssertionInformation();
	for( String testName : new String[] { TEST_NAME1, TEST_NAME2 } )
		addTestConfiguration(testName, new TestConfiguration(TEST_CLASS_DIR, testName, new String[] { "w" }));
}

@Test
Expand Down Expand Up @@ -188,18 +190,18 @@ private void runStepwiseTest( StepwiseType type, boolean sparse, boolean rewrite

try
{
String TEST_NAME = TEST_NAME1;
String TEST_NAME = (type==StepwiseType.LINREG_DS) ? TEST_NAME1 : TEST_NAME2;
TestConfiguration config = getTestConfiguration(TEST_NAME);
loadTestConfiguration(config);
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME + ".dml";

if( type == StepwiseType.LINREG_DS) {
fullDMLScriptName = "scripts/algorithms/StepLinearRegDS.dml";
programArgs = new String[]{ "-stats", "-nvargs",
"X="+input("X"), "Y="+input("Y"), "icpt="+String.valueOf(icpt),
"thr="+String.valueOf(thr), "B="+output("B"), "S="+output("S")};
}
else { //GLM binomial probit
fullDMLScriptName = "scripts/algorithms/StepGLM.dml";
programArgs = new String[]{ "-stats", "-nvargs",
"X="+input("X"), "Y="+input("Y"), "icpt="+String.valueOf(icpt),
"thr="+String.valueOf(thr), "link=3", "yneg=0",
Expand Down
38 changes: 38 additions & 0 deletions src/test/scripts/functions/codegenalg/Algorithm_StepLM.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# Test driver for the steplm builtin: reads the inputs, runs stepwise linear
# regression, and writes the selected features and the coefficients.
#
# Named arguments:
#   $X, $Y   locations of the feature matrix and the response values (required)
#   $B, $S   output locations for the coefficients and the selected features (required)
#   $fmt     output format, default "text"
#   $icpt    intercept mode forwarded to steplm, default 1
#
# NOTE(review): earlier revisions also read $write_beta and $thr into local
# variables that were never used; they are dropped here. steplm is called
# without a threshold argument, so a caller-supplied thr has no effect.

fileX = $X;
fileY = $Y;
fileB = $B;
fileS = $S;
fmt = ifdef ($fmt, "text");
intercept = ifdef ($icpt, 1);

X_orig = read (fileX);
y = read (fileY);

[beta_out, Selected] = steplm(X=X_orig, y=y, icpt=intercept, verbose=FALSE);

write(Selected, fileS, format=fmt);
write(beta_out, fileB, format=fmt);

0 comments on commit b1cb505

Please sign in to comment.