diff --git a/scripts/builtin/stepGLM.dml b/scripts/builtin/stepGLM.dml new file mode 100644 index 00000000000..00230ca455c --- /dev/null +++ b/scripts/builtin/stepGLM.dml @@ -0,0 +1,272 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# THIS SCRIPT CHOOSES A GLM REGRESSION MODEL IN A STEPWISE ALGIRITHM USING AIC +# EACH GLM REGRESSION IS SOLVED USING NEWTON/FISHER SCORING WITH TRUST REGIONS +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X Matrix --- Matrix X of feature vectors +# Y Matrix --- Response Matrix Y with 1 column +# link Int 2 Link function code: 1 = log, 2 = Logit, 3 = Probit, 4 = Cloglog +# yneg Double 0.0 Response value for Bernoulli "No" label, usually 0.0 or -1.0 +# icpt Int 0 Intercept presence, X columns shifting and rescaling: +# 0 = no intercept, no shifting, no rescaling; +# 1 = add intercept, but neither shift nor rescale X; +# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 +# tol Double 0.000001 Tolerance (epsilon) +# disp Double 0.0 (Over-)dispersion value, or 0.0 to estimate it from data +# moi Int 200 Maximum number of outer (Newton / Fisher Scoring) iterations +# mii Int 0 Maximum number of inner (Conjugate Gradient) iterations, 0 = no maximum +# thr Double 0.01 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr +# no further features are being checked and the algorithm stops +# --------------------------------------------------------------------------------------------- +# OUTPUT: Matrix beta, whose size depends on icpt: +# icpt=0: ncol(X) x 1; icpt=1: (ncol(X) + 1) x 1; icpt=2: (ncol(X) + 1) x 2 +# +# AIC Double --- AIC value +# B Matrix --- Estimated regression parameters (betas) +# S Matrix --- The selected features ordered as computed by the algorithm +# --------------------------------------------------------------------------------------------- + +# THE StepGLM SCRIPT CURRENTLY SUPPORTS BERNOULLI DISTRIBUTION FAMILY AND THE FOLLOWING LINK FUNCTIONS ONLY! +# - LOG +# - LOGIT +# - PROBIT +# - CLOGLOG + +source("./scripts/builtin/glm.dml") as glm; + +m_stepGLM = function ( + Matrix[Double] X, + Matrix[Double] Y, + Int link = 2, + Double yneg = 0.0, + Int icpt = 0, + Double tol = 0.000001, + Double disp = 0.0, + Int moi = 200, + Int mii = 0, + Double thr = 0.01 +) return ( + Double AIC, + Matrix[Double] B, + Matrix[Double] S + ) + { + intercept_status = icpt; + bernoulli_No_label = yneg; + distribution_type = 2; + + + if (distribution_type == 2 & ncol(Y) == 1) { + is_Y_negative = (Y == bernoulli_No_label); + Y = cbind (1 - is_Y_negative, is_Y_negative); + count_Y_negative = sum (is_Y_negative); + if (count_Y_negative == 0) { + stop ("StepGLM Input Error: all Y-values encode Bernoulli YES-label, none encode NO-label"); + } + if (count_Y_negative == nrow(Y)) { + stop ("StepGLM Input Error: all Y-values encode Bernoulli NO-label, none encode YES-label"); + } + } + + X_orig = X; + num_records = nrow (X_orig); + num_features = ncol (X_orig); + + # BEGIN STEPWISE GENERALIZED LINEAR MODELS + + continue = TRUE; + columns_fixed = matrix (0, rows = 1, cols = num_features); + columns_fixed_ordered = matrix (0, rows = 1, cols = 1); + + # X_global stores the best model found at each step + X_global = matrix (0, rows = num_records, cols = 1); + + if (intercept_status == 0) { + # Compute AIC of an empty model with no features and no intercept (all Ys are zero) + [AIC_best, ignore_B1, ignore_S1] = internal_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + } else { + # compute AIC of an empty model with only intercept (all Ys are constant) + all_ones = matrix (1, rows = num_records, cols = 1); + [AIC_best, ignore_beta2, ignore_S2] = internal_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + } + #print ("Best AIC without any features: " + AIC_best); + + # First pass to examine single features + AICs = matrix (AIC_best, rows = 1, cols = num_features); + parfor (i in 1:num_features) { + [AIC_1, ignore_beta3, ignore_S3] = internal_glm(X=X_orig[,i], Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + AICs[1,i] = AIC_1; + } + + # Determine the best AIC + column_best = 0; + for (k in 1:num_features) { + AIC_cur = as.scalar (AICs[1,k]); + if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) { + column_best = k; + AIC_best = as.scalar(AICs[1,k]); + } + } + + if (column_best == 0) { + #print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!"); + if (intercept_status == 0) { + # Compute AIC of an empty model with no features and no intercept (all Ys are zero) + [AIC_best, ignore_beta4, ignore_S4] = internal_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + } else { + # compute AIC of an empty model with only intercept (all Ys are constant) + ###all_ones = matrix (1, rows = num_records, cols = 1); + [AIC_best, ignore_beta5, ignore_S5] = internal_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + } + }; + + # print ("Best AIC " + AIC_best + " achieved with feature: " + column_best); + columns_fixed[1,column_best] = 1; + columns_fixed_ordered[1,1] = column_best; + X_global = X_orig[,column_best]; + + while (continue) { + # Subsequent passes over the features + parfor (i in 1:num_features) { + if (as.scalar(columns_fixed[1,i]) == 0) { + + # Construct the feature matrix + X_loop = cbind (X_global, X_orig[,i]); + + [AIC_2, ignore_beta6, ignore_S6] = internal_glm(X=X_loop, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + AICs[1,i] = AIC_2; + } + } + + # Determine the best AIC + for (k in 1:num_features) { + AIC_cur = as.scalar (AICs[1,k]); + if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) { + column_best = k; + AIC_best = as.scalar(AICs[1,k]); + } + } + + # cbind best found features (i.e., columns) to X_global + if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found + #print ("Best AIC " + AIC_best + " achieved with feature: " + column_best); + columns_fixed[1,column_best] = 1; + columns_fixed_ordered = cbind (columns_fixed_ordered, as.matrix(column_best)); + if (ncol(columns_fixed_ordered) == num_features) { # all features examined + X_global = cbind (X_global, X_orig[,column_best]); + continue = FALSE; + } else { + X_global = cbind (X_global, X_orig[,column_best]); + } + } else { + continue = FALSE; + } + } + + # run GLM with selected set of features + print ("Running GLM with selected features..."); + [AIC, B, S] = internal_glm(X=X_global, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + } + +compute_AIC = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] B, Int link, Int icpt) + return (Double aic) +{ + # Isolate unscaled parameters; accounts for m_glm returning 2 columns when icpt=2 + beta = B[, 1]; + + if (icpt > 0) { + ones = matrix(1, rows=nrow(X), cols=1); + X_design = cbind(X, ones); + } else { + X_design = X; + } + + eta = X_design %*% beta; + + # Map inverse link functions + if (link == 1) { + # log, see https://en.wikipedia.org/wiki/Generalized_linear_model#Link_function + mu = exp(eta); + } else if (link == 2) { + # logit, see https://en.wikipedia.org/wiki/Generalized_linear_model#Link_function + mu = 1 / (1 + exp(-eta)); + } else if (link == 3) { + # probit approximation, page 1487 in "Qualitative Response Models: A Survey (1981)" by Takeshi Amemiya + mu = 1 / (1 + exp(-1.6 * eta)); + } else if (link == 4) { + # cloglog, see https://search.r-project.org/CRAN/refmans/VGAM/html/clogloglink.html + mu = 1 - exp(-exp(eta)); + } else { + stop("Unsupported link function code: " + link); + mu = eta; + } + + # Constrain to prevent numerical discontinuity in log(0) + mu = max(mu, 1e-15); + mu = min(mu, 1 - 1e-15); + + Y_yes = Y[, 1]; + Y_neg = Y[, 2]; + + LL = sum(Y_yes * log(mu) + Y_neg * log(1 - mu)); + k = nrow(beta); + + aic = 2 * k - 2 * LL; +} + +internal_glm = function ( + Matrix[Double] X, + Matrix[Double] Y, + Int intercept_status, + Double num_features_orig, + Matrix[Double] Selected, + Int link, + Double disp, + Double tol, + Int moi, + Int mii +) return ( + Double AIC, + Matrix[Double] B, + Matrix[Double] S +) { + # bernoulli family parameters, see table in ./scripts/builtin/m_glm.dml for details. + new_link = link; + new_lpow = 1.0; + + if (link == 1) { + new_link = 1; + new_lpow = 0.0; + } + B = glm::m_glm(X=X, Y=Y, dfam=2, vpow=0.0, link=new_link, lpow=new_lpow, yneg=0.0, + icpt=intercept_status, disp=disp, reg=0.0, tol=tol, + moi=moi, mii=mii, verbose=FALSE); + + AIC = compute_AIC(X, Y, B, link, intercept_status); + + S = Selected; +} diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index e21c539d6d8..62145124d82 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -333,6 +333,7 @@ public enum Builtins { STATSNA("statsNA", true), STRATSTATS("stratstats", true), STEPLM("steplm",true, ReturnType.MULTI_RETURN), + STEPGLM("stepGLM",true, ReturnType.MULTI_RETURN), STFT("stft", false, ReturnType.MULTI_RETURN), SQRT("sqrt", false), SQRT_MATRIX("sqrtMatrix", true), diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinSTEPGlmTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinSTEPGlmTest.java new file mode 100644 index 00000000000..7d95dc6ec28 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinSTEPGlmTest.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.builtin.part2; + +import org.junit.Test; +import org.apache.sysds.common.Types.ExecMode; +import org.apache.sysds.common.Types.ExecType; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; + +public class BuiltinSTEPGlmTest extends AutomatedTestBase +{ + private final static String TEST_NAME = "stepGLM"; + private final static String TEST_DIR = "functions/builtin/"; + private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinSTEPGlmTest.class.getSimpleName() + "/"; + + @Override + public void setUp() { + addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{})); + } + + @Test + public void testLmMatrixDenseCPlm() { + runSTEPGlmTest(ExecType.CP); + } + + @Test + public void testLmMatrixSparseSPlm() { + runSTEPGlmTest(ExecType.SPARK); + } + + private void runSTEPGlmTest(ExecType instType) { + ExecMode platformOld = setExecMode(instType); + + try { + loadTestConfiguration(getTestConfiguration(TEST_NAME)); + + String HOME = SCRIPT_DIR + TEST_DIR; + + // Pointing to the generated validation DML script + fullDMLScriptName = HOME + TEST_NAME + ".dml"; + programArgs = new String[]{}; + + // runTest executes the script; fails if the DML script invokes stop() + runTest(true, false, null, -1); + } + finally { + rtplatform = platformOld; + } + } +} diff --git a/src/test/scripts/functions/builtin/stepGLM.dml b/src/test/scripts/functions/builtin/stepGLM.dml new file mode 100644 index 00000000000..95a4dfb9908 --- /dev/null +++ b/src/test/scripts/functions/builtin/stepGLM.dml @@ -0,0 +1,67 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +source("scripts/builtin/stepGLM.dml") as stepGLM; + +N = 1000; +P = 10; +X = rand(rows=N, cols=P, min=-1.0, max=1.0, pdf="uniform", seed=123); + +beta_true = matrix(0, rows=P, cols=1); +beta_true[2,1] = 3.5; +beta_true[5,1] = -2.0; +beta_true[8,1] = 1.5; + +Z = X %*% beta_true; +P_y = 1.0 / (1.0 + exp(-Z)); +Y = (rand(rows=N, cols=1, min=0.0, max=1.0, seed=456) < P_y) * 1.0; + +[AIC, B, S] = stepGLM::m_stepGLM(X=X, Y=Y, link=2, yneg=0.0, icpt=0, tol=1e-6, disp=0.0, moi=200, mii=0, thr=0.01); + +print("\n\n\n\n\n\n\nTest Results:"); +print("Optimal AIC: " + AIC); +print("Selected Feature Indices:\n" + toString(S)); +print("Estimated Coefficients:\n" + toString(B)); + +beta_est = matrix(0, rows=P, cols=1); +for (i in 1:nrow(B)) { + idx = as.scalar(S[1, i]); + beta_est[idx, 1] = B[i, 1]; +} + +# Case 01 +if (nrow(B) != 3 | sum(beta_est != 0 & beta_true == 0) > 0 | sum(beta_est == 0 & beta_true != 0) > 0) { + stop("Test failed: Inexact feature support recovery."); +} +print("passed test 1") + +# Case 02 +epsilon = 0.5; +if (max(abs(beta_est - beta_true)) > epsilon) { + stop("Test failed: Parameter estimates exceed tolerance bound epsilon = " + epsilon + "."); +} +print("passed test 2") + + + +#stop("!!!Sucess!!!") # uncomment for letting the test fail +