From 274f0c82d31b0e7c406ed06e15b771f34aca42ca Mon Sep 17 00:00:00 2001 From: bruno Date: Sun, 24 May 2026 10:35:02 +0200 Subject: [PATCH 01/12] refactor StepGLM.dml from a script into a function and resolve a few nested blocks --- scripts/builtin/StepGLM.dml | 1202 +++++++++++++++++++++++++++++++++++ 1 file changed, 1202 insertions(+) create mode 100644 scripts/builtin/StepGLM.dml diff --git a/scripts/builtin/StepGLM.dml b/scripts/builtin/StepGLM.dml new file mode 100644 index 00000000000..3bbaf361650 --- /dev/null +++ b/scripts/builtin/StepGLM.dml @@ -0,0 +1,1202 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# THIS SCRIPT CHOOSES A GLM REGRESSION MODEL IN A STEPWISE ALGIRITHM USING AIC +# EACH GLM REGRESSION IS SOLVED USING NEWTON/FISHER SCORING WITH TRUST REGIONS +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X Matrix --- Matrix X of feature vectors +# Y Matrix --- Response Matrix Y with 1 column +# link Int 2 Link function code: 1 = log, 2 = Logit, 3 = Probit, 4 = Cloglog +# yneg Double 0.0 Response value for Bernoulli "No" label, usually 0.0 or -1.0 +# icpt Int 0 Intercept presence, X columns shifting and rescaling: +# 0 = no intercept, no shifting, no rescaling; +# 1 = add intercept, but neither shift nor rescale X; +# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 +# tol Double 0.000001 Tolerance (epsilon) +# disp Double 0.0 (Over-)dispersion value, or 0.0 to estimate it from data +# moi Int 200 Maximum number of outer (Newton / Fisher Scoring) iterations +# mii Int 0 Maximum number of inner (Conjugate Gradient) iterations, 0 = no maximum +# thr Double 0.01 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr +# no further features are being checked and the algorithm stops +# --------------------------------------------------------------------------------------------- +# OUTPUT: Matrix beta, whose size depends on icpt: +# icpt=0: ncol(X) x 1; icpt=1: (ncol(X) + 1) x 1; icpt=2: (ncol(X) + 1) x 2 +# +# B Matrix --- Estimated regression parameters (betas) +# S Matrix --- The selected features ordered as computed by the algorithm +# O String --- The statistics +# --------------------------------------------------------------------------------------------- + +# THE StepGLM SCRIPT CURRENTLY SUPPORTS BERNOULLI DISTRIBUTION FAMILY AND THE FOLLOWING LINK FUNCTIONS ONLY! +# - LOG +# - LOGIT +# - PROBIT +# - CLOGLOG + +# In addition, in the last run of GLM some statistics are provided in CSV format, one comma-separated name-value +# pair per each line, as follows: +# +# NAME MEANING +# ------------------------------------------------------------------------------------------- +# TERMINATION_CODE A positive integer indicating success/failure as follows: +# 1 = Converged successfully; 2 = Maximum number of iterations reached; +# 3 = Input (X, Y) out of range; 4 = Distribution/link is not supported +# BETA_MIN Smallest beta value (regression coefficient), excluding the intercept +# BETA_MIN_INDEX Column index for the smallest beta value +# BETA_MAX Largest beta value (regression coefficient), excluding the intercept +# BETA_MAX_INDEX Column index for the largest beta value +# INTERCEPT Intercept value, or NaN if there is no intercept (if icpt=0) +# DISPERSION Dispersion used to scale deviance, provided as "disp" input parameter +# or estimated (same as DISPERSION_EST) if the "disp" parameter is <= 0 +# DISPERSION_EST Dispersion estimated from the dataset +# DEVIANCE_UNSCALED Deviance from the saturated model, assuming dispersion == 1.0 +# DEVIANCE_SCALED Deviance from the saturated model, scaled by the DISPERSION value +# ------------------------------------------------------------------------------------------- + + +stepGLM = function ( + Matrix[Double] X, + Matrix[Double] Y, + Int link = 2, + Double yneg = 0.0, + Int icpt = 0, + Double tol = 0.000001, + Double disp = 0.0, + Int moi = 200, + Int mii = 0, + Double thr = 0.01, +) + return ( + Double AIC, + Matrix[Double] B, + Matrix[Double] S, + Matrix[Double] O, + ) + { + intercept_status = icpt; + bernoulli_No_label = yneg; + distribution_type = 2; + + # currently only the forward selection strategy in supported: start from one feature and iteratively add + # features until AIC improves + dir = "forward"; + + if (distribution_type == 2 & ncol(Y) == 1) { + is_Y_negative = (Y == bernoulli_No_label); + Y = cbind (1 - is_Y_negative, is_Y_negative); + count_Y_negative = sum (is_Y_negative); + if (count_Y_negative == 0) { + stop ("StepGLM Input Error: all Y-values encode Bernoulli YES-label, none encode NO-label"); + } + if (count_Y_negative == nrow(Y)) { + stop ("StepGLM Input Error: all Y-values encode Bernoulli NO-label, none encode YES-label"); + } + } + + X_orig = X; + num_records = nrow (X_orig); + num_features = ncol (X_orig); + + # BEGIN STEPWISE GENERALIZED LINEAR MODELS + + if (dir != "forward") { + stop ("Currently only forward selection strategy is supported!"); + } + + continue = TRUE; + columns_fixed = matrix (0, rows = 1, cols = num_features); + columns_fixed_ordered = matrix (0, rows = 1, cols = 1); + + # X_global stores the best model found at each step + X_global = matrix (0, rows = num_records, cols = 1); + + if (intercept_status == 0) { + # Compute AIC of an empty model with no features and no intercept (all Ys are zero) + [AIC_best, _beta, _S, _O] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered); + } else { + # compute AIC of an empty model with only intercept (all Ys are constant) + all_ones = matrix (1, rows = num_records, cols = 1); + [AIC_best, _beta, _S, _O] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered); + } + #print ("Best AIC without any features: " + AIC_best); + + # First pass to examine single features + AICs = matrix (AIC_best, rows = 1, cols = num_features); + parfor (i in 1:num_features) { + [AIC_1, _beta, _S, _O] = glm_fit (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered); + AICs[1,i] = AIC_1; + } + + # Determine the best AIC + column_best = 0; + for (k in 1:num_features) { + AIC_cur = as.scalar (AICs[1,k]); + if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) { + column_best = k; + AIC_best = as.scalar(AICs[1,k]); + } + } + + if (column_best == 0) { + #print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!"); + if (intercept_status == 0) { + # Compute AIC of an empty model with no features and no intercept (all Ys are zero) + [AIC_best, _beta, _S, _O] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered); + } else { + # compute AIC of an empty model with only intercept (all Ys are constant) + ###all_ones = matrix (1, rows = num_records, cols = 1); + [AIC_best, _beta, _S, _O] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered); + } + }; + + # print ("Best AIC " + AIC_best + " achieved with feature: " + column_best); + columns_fixed[1,column_best] = 1; + columns_fixed_ordered[1,1] = column_best; + X_global = X_orig[,column_best]; + + while (continue) { + # Subsequent passes over the features + parfor (i in 1:num_features) { + if (as.scalar(columns_fixed[1,i]) == 0) { + + # Construct the feature matrix + X = cbind (X_global, X_orig[,i]); + + [AIC_2, _beta, _S, _O] = glm_fit (X, Y, intercept_status, num_features, columns_fixed_ordered); + AICs[1,i] = AIC_2; + } + } + + # Determine the best AIC + for (k in 1:num_features) { + AIC_cur = as.scalar (AICs[1,k]); + if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) { + column_best = k; + AIC_best = as.scalar(AICs[1,k]); + } + } + + # cbind best found features (i.e., columns) to X_global + if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found + #print ("Best AIC " + AIC_best + " achieved with feature: " + column_best); + columns_fixed[1,column_best] = 1; + columns_fixed_ordered = cbind (columns_fixed_ordered, as.matrix(column_best)); + if (ncol(columns_fixed_ordered) == num_features) { # all features examined + X_global = cbind (X_global, X_orig[,column_best]); + continue = FALSE; + } else { + X_global = cbind (X_global, X_orig[,column_best]); + } + } else { + continue = FALSE; + } + } + + # run GLM with selected set of features + print ("Running GLM with selected features..."); + [AIC, beta, S, O] = glm_fit (X_global, Y, intercept_status, num_features, columns_fixed_ordered); + return (AIC, beta, S, O) + } + + +################### UDFS USED IN THIS SCRIPT ################## + +glm_fit = function ( + Matrix[Double] X, + Matrix[Double] Y, + Int intercept_status, + Double num_features_orig, + Matrix[Double] Selected, + Int link = 2, + Double disp = 0.0, + Double tol = 0.000001, + Int moi = 200, + Int mii = 0, +) + return ( + Double AIC, + Matrix[Double] beta, + Matrix[Double] S, + String O, + ) + { + + # distribution family code: 1 = Power, 2 = Bernoulli/Binomial; currently only Bernouli distribution family is supported! + distribution_type = 2; # $dfam = 2; + variance_as_power_of_the_mean = 0.0; # $vpow = 0.0; + # link function code: 0 = canonical (depends on distribution), 1 = Power, 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit; + # currently only log (link = 1), logit (link = 2), probit (link = 3), and cloglog (link = 4) are supported! + link_type = link; + link_as_power_of_the_mean = 0.0; # $lpow = 0.0; + + dispersion = disp; + eps = tol; + max_iteration_IRLS = moi; + max_iteration_CG = mii; + + variance_as_power_of_the_mean = as.double (variance_as_power_of_the_mean); + link_as_power_of_the_mean = as.double (link_as_power_of_the_mean); + + dispersion = as.double (dispersion); + eps = as.double (eps); + + # Default values for output statistics: + regularization = 0.0; + termination_code = 0.0; + min_beta = NaN; + i_min_beta = NaN; + max_beta = NaN; + i_max_beta = NaN; + intercept_value = NaN; + dispersion = NaN; + estimated_dispersion = NaN; + deviance_nodisp = NaN; + deviance = NaN; + + ##### INITIALIZE THE PARAMETERS ##### + + num_records = nrow (X); + num_features = ncol (X); + zeros_r = matrix (0, rows = num_records, cols = 1); + ones_r = 1 + zeros_r; + + # Introduce the intercept, shift and rescale the columns of X if needed + + if (intercept_status == 1 | intercept_status == 2) { # add the intercept column + X = cbind (X, ones_r); + num_features = ncol (X); + } + + scale_lambda = matrix (1, rows = num_features, cols = 1); + if (intercept_status == 1 | intercept_status == 2) { + scale_lambda [num_features, 1] = 0; + } + + if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1 + # Important assumption: X [, num_features] = ones_r + avg_X_cols = t(colSums(X)) / num_records; + var_X_cols = (t(colSums (X ^ 2)) - num_records * (avg_X_cols ^ 2)) / (num_records - 1); + is_unsafe = (var_X_cols <= 0); + scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); + scale_X [num_features, 1] = 1; + shift_X = - avg_X_cols * scale_X; + shift_X [num_features, 1] = 0; + rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2); + } else { + scale_X = matrix (1, rows = num_features, cols = 1); + shift_X = matrix (0, rows = num_features, cols = 1); + rowSums_X_sq = rowSums (X ^ 2); + } + + # Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2) + # with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale. + # The transform is then associatively applied to the other side of the expression, + # and is rewritten via "scale_X" and "shift_X" as follows: + # + # ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: + # ssX_A = diag (scale_X) %*% A; + # ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A; + # + # tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: + # tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ]; + + # Initialize other input-dependent parameters + + lambda = scale_lambda * regularization; + if (max_iteration_CG == 0) { + max_iteration_CG = num_features; + } + + # Set up the canonical link, if requested [Then we have: Var(mu) * (d link / d mu) = const] + + if (link_type == 0) { + if (distribution_type == 1) { + link_type = 1; + link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean; + } else { + if (distribution_type == 2) { + link_type = 2; + } + } + } + + # For power distributions and/or links, we use two constants, + # "variance as power of the mean" and "link_as_power_of_the_mean", + # to specify the variance and the link as arbitrary powers of the + # mean. However, the variance-powers of 1.0 (Poisson family) and + # 2.0 (Gamma family) have to be treated as special cases, because + # these values integrate into logarithms. The link-power of 0.0 + # is also special as it represents the logarithm link. + + num_response_columns = ncol (Y); + is_supported = 0; + if (num_response_columns == 2 & distribution_type == 2 & link_type >= 1 & link_type <= 4) { # BERNOULLI DISTRIBUTION + is_supported = 1; + } + if (num_response_columns == 1 & distribution_type == 2) { + print ("Error: Bernoulli response matrix has not been converted into two-column format."); + } + + if (is_supported != 1) { + stop ("Response matrix with " + num_response_columns + " columns, distribution family (" + distribution_type + ", " + variance_as_power_of_the_mean + + ") and link family (" + link_type + ", " + link_as_power_of_the_mean + ") are NOT supported together."); + } + + ##### INITIALIZE THE BETAS ##### + + [beta, saturated_log_l, isNaN] = + glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG); + + # print(" --- saturated logLik " + saturated_log_l); + + if (isNaN != 0) { + stop ("Input matrices X and/or Y are out of range!"); + } + + ##### START OF THE MAIN PART ##### + + sum_X_sq = sum (rowSums_X_sq); + trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq)); + ### max_trust_delta = trust_delta * 10000.0; + log_l = 0.0; + deviance_nodisp = 0.0; + new_deviance_nodisp = 0.0; + isNaN_log_l = 2; + newbeta = beta; + g = matrix (0.0, rows = num_features, cols = 1); + g_norm = sqrt (sum ((g + lambda * beta) ^ 2)); + accept_new_beta = 1; + reached_trust_boundary = 0; + neg_log_l_change_predicted = 0.0; + i_IRLS = 0; + + # print ("BEGIN IRLS ITERATIONS..."); + + ssX_newbeta = diag (scale_X) %*% newbeta; + ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta; + all_linear_terms = X %*% ssX_newbeta; + + [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part + (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); + + if (isNaN_new_log_l == 0) { + new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l); + new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2); + } + + while (termination_code == 0) { + accept_new_beta = 1; + + if (i_IRLS > 0) { + if (isNaN_log_l == 0) { + accept_new_beta = 0; + } + + # Decide whether to accept a new iteration point and update the trust region + # See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. by Nocedal and Wright + + rho = (- new_log_l + log_l) / neg_log_l_change_predicted; + if (rho < 0.25 | isNaN_new_log_l == 1) { + trust_delta = 0.25 * trust_delta; + } + if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) { + trust_delta = 2 * trust_delta; + + ### if (trust_delta > max_trust_delta) { + ### trust_delta = max_trust_delta; + ### } + } + if (rho > 0.1 & isNaN_new_log_l == 0) { + accept_new_beta = 1; + } + } + + if (accept_new_beta == 1) { + beta = newbeta; log_l = new_log_l; deviance_nodisp = new_deviance_nodisp; isNaN_log_l = isNaN_new_log_l; + + [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); + + # We introduced these variables to avoid roundoff errors: + # g_Y = y_residual / (y_var * link_grad); + # w = 1.0 / (y_var * link_grad * link_grad); + + gXY = - t(X) %*% g_Y; + g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ]; + g_norm = sqrt (sum ((g + lambda * beta) ^ 2)); + } + + [z, neg_log_l_change_predicted, num_CG_iters, reached_trust_boundary] = + get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG); + + newbeta = beta + z; + + ssX_newbeta = diag (scale_X) %*% newbeta; + ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta; + all_linear_terms = X %*% ssX_newbeta; + + [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part + (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); + + if (isNaN_new_log_l == 0) { + new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l); + new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2); + } + + log_l_change = new_log_l - log_l; # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps + + if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 & + (2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) ) { + termination_code = 1; + } + rho = - log_l_change / neg_log_l_change_predicted; + z_norm = sqrt (sum (z * z)); + + i_IRLS = i_IRLS + 1; + + if (i_IRLS == max_iteration_IRLS) { + termination_code = 2; + } + } + + beta = newbeta; + log_l = new_log_l; + deviance_nodisp = new_deviance_nodisp; + + #---------------------------- last part + + if (termination_code != 1) { + print ("One of the runs of GLM did not converged in " + i_IRLS + " steps!"); + } + + ##### COMPUTE AIC ##### + + if (distribution_type == 2 & link_type >= 1 & link_type <= 4) { + AIC = -2 * log_l; + if (sum (X) != 0) { + AIC = AIC + 2 * num_features; + } + } else { + stop ("Currently only the Bernoulli distribution family the following link functions are supported: log, logit, probit, and cloglog!"); + } + + # Output which features give the best AIC and are being used for linear regression + S = Selected; + + ssX_beta = diag (scale_X) %*% beta; + ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta; + if (intercept_status == 2) { + beta_out = cbind (ssX_beta, beta); + } else { + beta_out = ssX_beta; + } + + if (intercept_status == 0 & num_features == 1) { + p = sum (X == 1); + if (p == num_records) { + beta_out = beta_out[1,]; + } + } + + + if (intercept_status == 1 | intercept_status == 2) { + intercept_value = as.scalar (beta_out [num_features, 1]); + beta_noicept = beta_out [1 : (num_features - 1), 1]; + } else { + beta_noicept = beta_out [1 : num_features, 1]; + } + min_beta = min (beta_noicept); + max_beta = max (beta_noicept); + tmp_i_min_beta = rowIndexMin (t(beta_noicept)) + i_min_beta = as.scalar (tmp_i_min_beta [1, 1]); + tmp_i_max_beta = rowIndexMax (t(beta_noicept)) + i_max_beta = as.scalar (tmp_i_max_beta [1, 1]); + + ##### OVER-DISPERSION PART ##### + + all_linear_terms = X %*% ssX_beta; + [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); + + pearson_residual_sq = g_Y ^ 2 / w; + pearson_residual_sq = replace (target = pearson_residual_sq, pattern = NaN, replacement = 0); + # pearson_residual_sq = (y_residual ^ 2) / y_var; + + if (num_records > num_features) { + estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features); + } + if (dispersion <= 0) { + dispersion = estimated_dispersion; + } + deviance = deviance_nodisp / dispersion; + + ##### END OF THE MAIN PART ##### + + str = "BETA_MIN," + min_beta; + str = append (str, "BETA_MIN_INDEX," + i_min_beta); + str = append (str, "BETA_MAX," + max_beta); + str = append (str, "BETA_MAX_INDEX," + i_max_beta); + str = append (str, "INTERCEPT," + intercept_value); + str = append (str, "DISPERSION," + dispersion); + str = append (str, "DISPERSION_EST," + estimated_dispersion); + str = append (str, "DEVIANCE_UNSCALED," + deviance_nodisp); + str = append (str, "DEVIANCE_SCALED," + deviance); + O = str + + # Prepare the output matrix + print ("Writing the output matrix..."); + if (intercept_status == 0 & num_features == 1) { + if (p == num_records) { + beta_out_tmp = matrix (0, rows = num_features_orig + 1, cols = 1); + beta_out_tmp[num_features_orig + 1,] = beta_out; + beta_out = beta_out_tmp; + return AIC, beta_out, S, O; + } else if (sum (X) == 0){ + beta_out = matrix (0, rows = num_features_orig, cols = 1); + return AIC, beta_out, S, O; + } + } + + no_selected = ncol (Selected); + max_selected = max (Selected); + last = max_selected + 1; + + if (intercept_status != 0) { + + Selected_ext = cbind (Selected, as.matrix (last)); + P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); + + if (intercept_status == 2) { + + P1_ssX_beta = P1 * ssX_beta; + P2_ssX_beta = colSums (P1_ssX_beta); + P1_beta = P1 * beta; + P2_beta = colSums (P1_beta); + + if (max_selected < num_features_orig) { + + P2_ssX_beta = cbind (P2_ssX_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); + P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); + + P2_ssX_beta[1, num_features_orig+1] = P2_ssX_beta[1, max_selected + 1]; + P2_ssX_beta[1, max_selected + 1] = 0; + + P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1]; + P2_beta[1, max_selected + 1] = 0; + + } + beta_out = cbind (t(P2_ssX_beta), t(P2_beta)); + + } else { + + P1_beta = P1 * beta; + P2_beta = colSums (P1_beta); + + if (max_selected < num_features_orig) { + P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); + P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1] ; + P2_beta[1, max_selected + 1] = 0; + } + beta_out = t(P2_beta); + + } + } else { + + P1 = table (seq (1, no_selected), t(Selected)); + P1_beta = P1 * beta; + P2_beta = colSums (P1_beta); + + if (max_selected < num_features_orig) { + P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); + } + + beta_out = t(P2_beta); + } + + return AIC, beta_out, S, O; + } + +glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG) + return (Matrix[double] beta, double saturated_log_l, int isNaN) + { + saturated_log_l = 0.0; + isNaN = 0; + y_corr = Y [, 1]; + if (dist_type == 2) { + n_corr = rowSums (Y); + is_n_zero = (n_corr == 0); + y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero; + } + linear_terms = y_corr; + if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION + if (link_power == 0) { + if (sum (y_corr < 0) == 0) { + is_zero_y_corr = (y_corr == 0); + linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { isNaN = 1; } + } else { if (link_power == 1.0) { + linear_terms = y_corr; + } else { if (link_power == -1.0) { + linear_terms = 1.0 / y_corr; + } else { if (link_power == 0.5) { + if (sum (y_corr < 0) == 0) { + linear_terms = sqrt (y_corr); + } else { isNaN = 1; } + } else { if (link_power > 0) { + if (sum (y_corr < 0) == 0) { + is_zero_y_corr = (y_corr == 0); + linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; + } else { isNaN = 1; } + } else { + if (sum (y_corr <= 0) == 0) { + linear_terms = y_corr ^ link_power; + } else { isNaN = 1; } + }}}}} + } + if (dist_type == 2 & link_type >= 1 & link_type <= 5) + { # BINOMIAL/BERNOULLI DISTRIBUTION + if (link_type == 1 & link_power == 0) { # Binomial.log + if (sum (y_corr < 0) == 0) { + is_zero_y_corr = (y_corr == 0); + linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { isNaN = 1; } + } else { if (link_type == 1 & link_power > 0) { # Binomial.power_nonlog pos + if (sum (y_corr < 0) == 0) { + is_zero_y_corr = (y_corr == 0); + linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; + } else { isNaN = 1; } + } else { if (link_type == 1) { # Binomial.power_nonlog neg + if (sum (y_corr <= 0) == 0) { + linear_terms = y_corr ^ link_power; + } else { isNaN = 1; } + } else { + is_zero_y_corr = (y_corr <= 0); + is_one_y_corr = (y_corr >= 1.0); + y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr); + if (link_type == 2) { # Binomial.logit + linear_terms = log (y_corr / (1.0 - y_corr)) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { if (link_type == 3) { # Binomial.probit + y_below_half = y_corr + (1.0 - 2.0 * y_corr) * (y_corr > 0.5); + t = sqrt (- 2.0 * log (y_below_half)); + approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308))); + linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * (y_corr > 0.5)) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { if (link_type == 4) { # Binomial.cloglog + linear_terms = log (- log (1.0 - y_corr)) + - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { if (link_type == 5) { # Binomial.cauchit + linear_terms = tan ((y_corr - 0.5) * pi) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + }} }}}}} + } + + if (isNaN == 0) { + [saturated_log_l, isNaN] = + glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power); + } + + if ((dist_type == 1 & link_type == 1 & link_power == 0) | + (dist_type == 2 & link_type >= 2)) + { + desired_eta = 0.0; + } else { if (link_type == 1 & link_power == 0) { + desired_eta = log (0.5); + } else { if (link_type == 1) { + desired_eta = 0.5 ^ link_power; + } else { + desired_eta = 0.5; + }}} + + beta = matrix (0.0, rows = ncol(X), cols = 1); + + if (desired_eta != 0) { + if (icept_status == 1 | icept_status == 2) { + beta [nrow(beta), 1] = desired_eta; + } else { + # We want: avg (X %*% ssX_transform %*% beta) = desired_eta + # Note that "ssX_transform" is trivial here, hence ignored + + beta = straightenX (X, 0.000001, max_iter_CG); + beta = beta * desired_eta; + } } } + + +glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y, + int dist_type, double var_power, int link_type, double link_power) + return (Matrix[double] g_Y, Matrix[double] w) +# ORIGINALLY we returned more meaningful vectors, namely: +# Matrix[double] y_residual : y - y_mean, i.e. y observed - y predicted +# Matrix[double] link_gradient : derivative of the link function +# Matrix[double] var_function : variance without dispersion, i.e. the V(mu) function +# BUT, this caused roundoff errors, so we had to compute "directly useful" vectors +# and skip over the "meaningful intermediaries". Now we output these two variables: +# g_Y = y_residual / (var_function * link_gradient); +# w = 1.0 / (var_function * link_gradient ^ 2); + { + num_records = nrow (linear_terms); + zeros_r = matrix (0.0, rows = num_records, cols = 1); + ones_r = 1 + zeros_r; + g_Y = zeros_r; + w = zeros_r; + + # Some constants + + one_over_sqrt_two_pi = 0.39894228040143267793994605993438; + ones_2 = matrix (1.0, rows = 1, cols = 2); + p_one_m_one = ones_2; + p_one_m_one [1, 2] = -1.0; + m_one_p_one = ones_2; + m_one_p_one [1, 1] = -1.0; + zero_one = ones_2; + zero_one [1, 1] = 0.0; + one_zero = ones_2; + one_zero [1, 2] = 0.0; + flip_pos = matrix (0, rows = 2, cols = 2); + flip_neg = flip_pos; + flip_pos [1, 2] = 1; + flip_pos [2, 1] = 1; + flip_neg [1, 2] = -1; + flip_neg [2, 1] = 1; + + if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION + y_mean = zeros_r; + if (link_power == 0) { + y_mean = exp (linear_terms); + y_mean_pow = y_mean ^ (1 - var_power); + w = y_mean_pow * y_mean; + g_Y = y_mean_pow * (Y - y_mean); + } else { if (link_power == 1.0) { + y_mean = linear_terms; + w = y_mean ^ (- var_power); + g_Y = w * (Y - y_mean); + } else { + y_mean = linear_terms ^ (1.0 / link_power); + c1 = (1 - var_power) / link_power - 1; + c2 = (2 - var_power) / link_power - 2; + g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power; + w = (linear_terms ^ c2) / (link_power ^ 2); + } }} + if (dist_type == 2 & link_type >= 1 & link_type <= 5) + { # BINOMIAL/BERNOULLI DISTRIBUTION + if (link_type == 1) { # BINOMIAL.POWER LINKS + if (link_power == 0) { # Binomial.log + vec1 = 1 / (exp (- linear_terms) - 1); + g_Y = Y [, 1] - Y [, 2] * vec1; + w = rowSums (Y) * vec1; + } else { # Binomial.nonlog + vec1 = zeros_r; + if (link_power == 0.5) { + vec1 = 1 / (1 - linear_terms ^ 2); + } else { if (sum (linear_terms < 0) == 0) { + vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power)); + } else {isNaN = 1;}} + # We want a "zero-protected" version of + # vec2 = Y [, 1] / linear_terms; + is_y_0 = (Y [, 1] == 0); + vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0; + g_Y = (vec2 - Y [, 2] * vec1 * linear_terms) / link_power; + w = rowSums (Y) * vec1 / link_power ^ 2; + } + } else { + is_LT_pos_infinite = (linear_terms == Inf); + is_LT_neg_infinite = (linear_terms == -Inf); + is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; + finite_linear_terms = replace (target = linear_terms, pattern = Inf, replacement = 0); + finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0); + if (link_type == 2) { # Binomial.logit + Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; + Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); + Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; + g_Y = rowSums (Y * (Y_prob %*% flip_neg)); ### = y_residual; + w = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob); ### = y_variance; + } else { if (link_type == 3) { # Binomial.probit + is_lt_pos = (linear_terms >= 0); + t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) + pt_gp = t_gp * ( 0.254829592 + + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, + + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 + + t_gp * (-1.453152027 + + t_gp * 1.061405429)))); + the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0); + vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp); + vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5); + w = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1; + g_Y = one_over_sqrt_two_pi * vec2 / vec1; + } else { if (link_type == 4) { # Binomial.cloglog + the_exp = exp (linear_terms) + the_exp_exp = exp (- the_exp); + is_too_small = ((10000000 + the_exp) == 10000000); + the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2); + g_Y = (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio; + w = the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio; + } else { if (link_type == 5) { # Binomial.cauchit + Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi; + Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; + y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1]; + var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2]; + link_gradient_normalized = (1 + linear_terms ^ 2) * pi; + g_Y = rowSums (Y) * y_residual / (var_function * link_gradient_normalized); + w = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2); + }}}} + } + } + } + + +glm_log_likelihood_part = function (Matrix[double] linear_terms, Matrix[double] Y, + int dist_type, double var_power, int link_type, double link_power) + return (double log_l, int isNaN) + { + isNaN = 0; + log_l = 0.0; + num_records = nrow (Y); + zeros_r = matrix (0.0, rows = num_records, cols = 1); + + if (dist_type == 1 & link_type == 1) + { # POWER DISTRIBUTION + b_cumulant = zeros_r; + natural_parameters = zeros_r; + is_natural_parameter_log_zero = zeros_r; + if (var_power == 1.0 & link_power == 0) { # Poisson.log + b_cumulant = exp (linear_terms); + is_natural_parameter_log_zero = (linear_terms == -Inf); + natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0); + } else { if (var_power == 1.0 & link_power == 1.0) { # Poisson.id + if (sum (linear_terms < 0) == 0) { + b_cumulant = linear_terms; + is_natural_parameter_log_zero = (linear_terms == 0); + natural_parameters = log (linear_terms + is_natural_parameter_log_zero); + } else {isNaN = 1;} + } else { if (var_power == 1.0 & link_power == 0.5) { # Poisson.sqrt + if (sum (linear_terms < 0) == 0) { + b_cumulant = linear_terms ^ 2; + is_natural_parameter_log_zero = (linear_terms == 0); + natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero); + } else {isNaN = 1;} + } else { if (var_power == 1.0 & link_power > 0) { # Poisson.power_nonlog, pos + if (sum (linear_terms < 0) == 0) { + is_natural_parameter_log_zero = (linear_terms == 0); + b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero; + natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power; + } else {isNaN = 1;} + } else { if (var_power == 1.0) { # Poisson.power_nonlog, neg + if (sum (linear_terms <= 0) == 0) { + b_cumulant = linear_terms ^ (1.0 / link_power); + natural_parameters = log (linear_terms) / link_power; + } else {isNaN = 1;} + } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse + if (sum (linear_terms <= 0) == 0) { + b_cumulant = - log (linear_terms); + natural_parameters = - linear_terms; + } else {isNaN = 1;} + } else { if (var_power == 2.0 & link_power == 1.0) { # Gamma.id + if (sum (linear_terms <= 0) == 0) { + b_cumulant = log (linear_terms); + natural_parameters = - 1.0 / linear_terms; + } else {isNaN = 1;} + } else { if (var_power == 2.0 & link_power == 0) { # Gamma.log + b_cumulant = linear_terms; + natural_parameters = - exp (- linear_terms); + } else { if (var_power == 2.0) { # Gamma.power_nonlog + if (sum (linear_terms <= 0) == 0) { + b_cumulant = log (linear_terms) / link_power; + natural_parameters = - linear_terms ^ (- 1.0 / link_power); + } else {isNaN = 1;} + } else { if (link_power == 0) { # PowerDist.log + natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power); + b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power); + } else { # PowerDist.power_nonlog + if (-2 * link_power == 1.0 - var_power) { + natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power); + } else { if (-1 * link_power == 1.0 - var_power) { + natural_parameters = 1.0 / linear_terms / (1.0 - var_power); + } else { if ( link_power == 1.0 - var_power) { + natural_parameters = linear_terms / (1.0 - var_power); + } else { if ( 2 * link_power == 1.0 - var_power) { + natural_parameters = linear_terms ^ 2 / (1.0 - var_power); + } else { + if (sum (linear_terms <= 0) == 0) { + power = (1.0 - var_power) / link_power; + natural_parameters = (linear_terms ^ power) / (1.0 - var_power); + } else {isNaN = 1;} + }}}} + if (-2 * link_power == 2.0 - var_power) { + b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power); + } else { if (-1 * link_power == 2.0 - var_power) { + b_cumulant = 1.0 / linear_terms / (2.0 - var_power); + } else { if ( link_power == 2.0 - var_power) { + b_cumulant = linear_terms / (2.0 - var_power); + } else { if ( 2 * link_power == 2.0 - var_power) { + b_cumulant = linear_terms ^ 2 / (2.0 - var_power); + } else { + if (sum (linear_terms <= 0) == 0) { + power = (2.0 - var_power) / link_power; + b_cumulant = (linear_terms ^ power) / (2.0 - var_power); + } else {isNaN = 1;} + }}}} + }}}}} }}}}} + if (sum (is_natural_parameter_log_zero * abs (Y)) > 0) { + log_l = -Inf; + isNaN = 1; + } + if (isNaN == 0) + { + log_l = sum (Y * natural_parameters - b_cumulant); + if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { + log_l = -Inf; + isNaN = 1; + } } } + + if (dist_type == 2 & link_type >= 1 & link_type <= 5) + { # BINOMIAL/BERNOULLI DISTRIBUTION + + [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power); + + if (isNaN == 0) { + does_prob_contradict = (Y_prob <= 0); + if (sum (does_prob_contradict * abs (Y)) == 0) { + log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict)); + if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { + isNaN = 1; + } + } else { + log_l = -Inf; + isNaN = 1; + } } } + + if (isNaN == 1) { + log_l = - Inf; + } + } + + + +binomial_probability_two_column = + function (Matrix[double] linear_terms, int link_type, double link_power) + return (Matrix[double] Y_prob, int isNaN) + { + isNaN = 0; + num_records = nrow (linear_terms); + + # Define some auxiliary matrices + + ones_2 = matrix (1.0, rows = 1, cols = 2); + p_one_m_one = ones_2; + p_one_m_one [1, 2] = -1.0; + m_one_p_one = ones_2; + m_one_p_one [1, 1] = -1.0; + zero_one = ones_2; + zero_one [1, 1] = 0.0; + one_zero = ones_2; + one_zero [1, 2] = 0.0; + + zeros_r = matrix (0.0, rows = num_records, cols = 1); + ones_r = 1.0 + zeros_r; + + # Begin the function body + + Y_prob = zeros_r %*% ones_2; + if (link_type == 1) { # Binomial.power + if (link_power == 0) { # Binomial.log + Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one; + } else { if (link_power == 0.5) { # Binomial.sqrt + Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one; + } else { # Binomial.power_nonlog + if (sum (linear_terms < 0) == 0) { + Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one; + } else {isNaN = 1;} + }} + } else { # Binomial.non_power + is_LT_pos_infinite = (linear_terms == Inf); + is_LT_neg_infinite = (linear_terms == -Inf); + is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; + finite_linear_terms = replace (target = linear_terms, pattern = Inf, replacement = 0); + finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0); + if (link_type == 2) { # Binomial.logit + Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; + Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); + } else { if (link_type == 3) { # Binomial.probit + lt_pos_neg = (finite_linear_terms >= 0) %*% p_one_m_one + ones_r %*% zero_one; + t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) + pt_gp = t_gp * ( 0.254829592 + + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, + + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 + + t_gp * (-1.453152027 + + t_gp * 1.061405429)))); + the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0); + Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg); + } else { if (link_type == 4) { # Binomial.cloglog + the_exp = exp (finite_linear_terms); + the_exp_exp = exp (- the_exp); + is_too_small = ((10000000 + the_exp) == 10000000); + Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2); + Y_prob [, 2] = the_exp_exp; + } else { if (link_type == 5) { # Binomial.cauchit + Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi; + } else { + isNaN = 1; + }}}} + Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; + } } + + +# THE CG-STEIHAUG PROCEDURE SCRIPT + +# Apply Conjugate Gradient - Steihaug algorithm in order to approximately minimize +# 0.5 z^T (X^T diag(w) X + diag (lambda)) z + (g + lambda * beta)^T z +# under constraint: ||z|| <= trust_delta. +# See Alg. 7.2 on p. 171 of "Numerical Optimization" 2nd ed. by Nocedal and Wright +# IN THE ABOVE, "X" IS UNDERSTOOD TO BE "X %*% (SHIFT/SCALE TRANSFORM)"; this transform +# is given separately because sparse "X" may become dense after applying the transform. +# +get_CG_Steihaug_point = + function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w, + Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG) + return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary) + { + trust_delta_sq = trust_delta ^ 2; + size_CG = nrow (g); + z = matrix (0.0, rows = size_CG, cols = 1); + neg_log_l_change = 0.0; + reached_trust_boundary = 0; + g_reg = g + lambda * beta; + r_CG = g_reg; + p_CG = -r_CG; + rr_CG = sum(r_CG * r_CG); + eps_CG = rr_CG * min (0.25, sqrt (rr_CG)); + converged_CG = 0; + if (rr_CG < eps_CG) { + converged_CG = 1; + } + + max_iteration_CG = max_iter_CG; + if (max_iteration_CG <= 0) { + max_iteration_CG = size_CG; + } + i_CG = 0; + while (converged_CG == 0) + { + i_CG = i_CG + 1; + ssX_p_CG = diag (scale_X) %*% p_CG; + ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG; + temp_CG = t(X) %*% (w * (X %*% ssX_p_CG)); + q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ]; + pq_CG = sum (p_CG * q_CG); + if (pq_CG <= 0) { + pp_CG = sum (p_CG * p_CG); + if (pp_CG > 0) { + [z, neg_log_l_change] = + get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); + reached_trust_boundary = 1; + } else { + neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); + } + converged_CG = 1; + } + if (converged_CG == 0) { + alpha_CG = rr_CG / pq_CG; + new_z = z + alpha_CG * p_CG; + if (sum(new_z * new_z) >= trust_delta_sq) { + pp_CG = sum (p_CG * p_CG); + [z, neg_log_l_change] = + get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); + reached_trust_boundary = 1; + converged_CG = 1; + } + if (converged_CG == 0) { + z = new_z; + old_rr_CG = rr_CG; + r_CG = r_CG + alpha_CG * q_CG; + rr_CG = sum(r_CG * r_CG); + if (i_CG == max_iteration_CG | rr_CG < eps_CG) { + neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); + reached_trust_boundary = 0; + converged_CG = 1; + } + if (converged_CG == 0) { + p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG; + } } } } } + + +# An auxiliary function used twice inside the CG-STEIHAUG loop: +get_trust_boundary_point = + function (Matrix[double] g, Matrix[double] z, Matrix[double] p, + Matrix[double] q, Matrix[double] r, double pp, double pq, + double trust_delta_sq) + return (Matrix[double] new_z, double f_change) + { + zz = sum (z * z); pz = sum (p * z); + sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq)); + tau_1 = (- pz + sq_root_d) / pp; + tau_2 = (- pz - sq_root_d) / pp; + zq = sum (z * q); gp = sum (g * p); + f_extra = 0.5 * sum (z * (r + g)); + f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1; + f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2; + ind1 = as.integer(f_change_1 < f_change_2); + ind2 = as.integer(f_change_1 >= f_change_2); + new_z = z + ((ind1 * tau_1 + ind2 * tau_2) * p); + f_change = ind1 * f_change_1 + ind2 * f_change_2; + } + + +# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1 +# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale +# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X). +straightenX = + function (Matrix[double] X, double eps, int max_iter_CG) + return (Matrix[double] w) + { + w_X = t(colSums(X)); + lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X); + eps_LS = eps * nrow(X); + + # BEGIN LEAST SQUARES + + r_LS = - w_X; + z_LS = matrix (0.0, rows = ncol(X), cols = 1); + p_LS = - r_LS; + norm_r2_LS = sum (r_LS ^ 2); + i_LS = 0; + while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS) + { + q_LS = t(X) %*% X %*% p_LS; + q_LS = q_LS + lambda_LS * p_LS; + alpha_LS = norm_r2_LS / sum (p_LS * q_LS); + z_LS = z_LS + alpha_LS * p_LS; + old_norm_r2_LS = norm_r2_LS; + r_LS = r_LS + alpha_LS * q_LS; + norm_r2_LS = sum (r_LS ^ 2); + p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS; + i_LS = i_LS + 1; + } + + # END LEAST SQUARES + + w = (nrow(X) / sum (w_X * z_LS)) * z_LS; + } + + From 78018f9b09bd80e7c47f24fac83e4c1955f13f7a Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 26 Jun 2026 01:16:07 +0200 Subject: [PATCH 02/12] rename builtin StepGLM.dml to stepglm.dml --- scripts/builtin/{StepGLM.dml => stepglm.dml} | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) rename scripts/builtin/{StepGLM.dml => stepglm.dml} (99%) diff --git a/scripts/builtin/StepGLM.dml b/scripts/builtin/stepglm.dml similarity index 99% rename from scripts/builtin/StepGLM.dml rename to scripts/builtin/stepglm.dml index 3bbaf361650..cd2d881ecba 100644 --- a/scripts/builtin/StepGLM.dml +++ b/scripts/builtin/stepglm.dml @@ -88,12 +88,11 @@ stepGLM = function ( Int moi = 200, Int mii = 0, Double thr = 0.01, -) - return ( - Double AIC, - Matrix[Double] B, - Matrix[Double] S, - Matrix[Double] O, +) return ( + Double AIC, + Matrix[Double] B, + Matrix[Double] S, + Matrix[Double] O, ) { intercept_status = icpt; From 739c5d283a0c82622e42bced0c43079c3e1ce153 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 26 Jun 2026 01:18:10 +0200 Subject: [PATCH 03/12] rename builtin stepglm.dml to stepGLM.dml --- scripts/builtin/{stepglm.dml => stepGLM.dml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/builtin/{stepglm.dml => stepGLM.dml} (100%) diff --git a/scripts/builtin/stepglm.dml b/scripts/builtin/stepGLM.dml similarity index 100% rename from scripts/builtin/stepglm.dml rename to scripts/builtin/stepGLM.dml From 9dc956bd0b6728a43d08e6eb8563490484755cdc Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 26 Jun 2026 05:44:12 +0200 Subject: [PATCH 04/12] fix syntax errors in stepGLM --- scripts/algorithms/TestBuiltinStepGLM.dml | 21 + scripts/builtin/stepGLM.dml | 1102 ++++++++++----------- 2 files changed, 570 insertions(+), 553 deletions(-) create mode 100644 scripts/algorithms/TestBuiltinStepGLM.dml diff --git a/scripts/algorithms/TestBuiltinStepGLM.dml b/scripts/algorithms/TestBuiltinStepGLM.dml new file mode 100644 index 00000000000..ab64960b35a --- /dev/null +++ b/scripts/algorithms/TestBuiltinStepGLM.dml @@ -0,0 +1,21 @@ +source("scripts/builtin/stepGLM.dml") as stepGLM; + +N = 1000; +P = 10; +X = rand(rows=N, cols=P, min=-1.0, max=1.0, pdf="uniform", seed=123); + +beta_true = matrix(0, rows=P, cols=1); +beta_true[2,1] = 3.5; +beta_true[5,1] = -2.0; +beta_true[8,1] = 1.5; + +Z = X %*% beta_true; +P_y = 1.0 / (1.0 + exp(-Z)); +Y = (rand(rows=N, cols=1, min=0.0, max=1.0, seed=456) < P_y) * 1.0; + +[AIC, B, S, O] = stepGLM::stepGLM(X=X, Y=Y, link=2, yneg=0.0, icpt=0, tol=1e-6, disp=0.0, moi=200, mii=0, thr=0.01); + +print("Optimal AIC: " + AIC); +print("Selected Feature Indices:\n" + toString(S)); +print("Estimated Coefficients:\n" + toString(B)); + diff --git a/scripts/builtin/stepGLM.dml b/scripts/builtin/stepGLM.dml index cd2d881ecba..c8be7e582b1 100644 --- a/scripts/builtin/stepGLM.dml +++ b/scripts/builtin/stepGLM.dml @@ -87,12 +87,12 @@ stepGLM = function ( Double disp = 0.0, Int moi = 200, Int mii = 0, - Double thr = 0.01, + Double thr = 0.01 ) return ( Double AIC, Matrix[Double] B, Matrix[Double] S, - Matrix[Double] O, + String O ) { intercept_status = icpt; @@ -134,18 +134,18 @@ stepGLM = function ( if (intercept_status == 0) { # Compute AIC of an empty model with no features and no intercept (all Ys are zero) - [AIC_best, _beta, _S, _O] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered); + [AIC_best, ignore_B1, ignore_S1, ignore_O1] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered); } else { # compute AIC of an empty model with only intercept (all Ys are constant) all_ones = matrix (1, rows = num_records, cols = 1); - [AIC_best, _beta, _S, _O] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered); + [AIC_best, ignore_beta2, ignore_S2, ignore_O2] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered); } #print ("Best AIC without any features: " + AIC_best); # First pass to examine single features AICs = matrix (AIC_best, rows = 1, cols = num_features); parfor (i in 1:num_features) { - [AIC_1, _beta, _S, _O] = glm_fit (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered); + [AIC_1, ignore_beta3, ignore_S3, ignore_O3] = glm_fit (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered); AICs[1,i] = AIC_1; } @@ -163,11 +163,11 @@ stepGLM = function ( #print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!"); if (intercept_status == 0) { # Compute AIC of an empty model with no features and no intercept (all Ys are zero) - [AIC_best, _beta, _S, _O] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered); + [AIC_best, ignore_beta4, ignore_S4, ignore_O4] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered); } else { # compute AIC of an empty model with only intercept (all Ys are constant) ###all_ones = matrix (1, rows = num_records, cols = 1); - [AIC_best, _beta, _S, _O] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered); + [AIC_best, ignore_beta5, ignore_S5, ignore_O5] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered); } }; @@ -182,9 +182,9 @@ stepGLM = function ( if (as.scalar(columns_fixed[1,i]) == 0) { # Construct the feature matrix - X = cbind (X_global, X_orig[,i]); + X_loop = cbind (X_global, X_orig[,i]); - [AIC_2, _beta, _S, _O] = glm_fit (X, Y, intercept_status, num_features, columns_fixed_ordered); + [AIC_2, ignore_beta6, ignore_S6, ignore_O6] = glm_fit (X_loop, Y, intercept_status, num_features, columns_fixed_ordered); AICs[1,i] = AIC_2; } } @@ -216,8 +216,7 @@ stepGLM = function ( # run GLM with selected set of features print ("Running GLM with selected features..."); - [AIC, beta, S, O] = glm_fit (X_global, Y, intercept_status, num_features, columns_fixed_ordered); - return (AIC, beta, S, O) + [AIC, B, S, O] = glm_fit (X_global, Y, intercept_status, num_features, columns_fixed_ordered); } @@ -233,13 +232,13 @@ glm_fit = function ( Double disp = 0.0, Double tol = 0.000001, Int moi = 200, - Int mii = 0, + Int mii = 0 ) return ( Double AIC, Matrix[Double] beta, Matrix[Double] S, - String O, + String O ) { @@ -563,189 +562,186 @@ glm_fit = function ( str = append (str, "DEVIANCE_SCALED," + deviance); O = str + do_return = 0; # Prepare the output matrix - print ("Writing the output matrix..."); if (intercept_status == 0 & num_features == 1) { if (p == num_records) { beta_out_tmp = matrix (0, rows = num_features_orig + 1, cols = 1); beta_out_tmp[num_features_orig + 1,] = beta_out; - beta_out = beta_out_tmp; - return AIC, beta_out, S, O; + beta = beta_out_tmp; + do_return = 1; } else if (sum (X) == 0){ - beta_out = matrix (0, rows = num_features_orig, cols = 1); - return AIC, beta_out, S, O; + beta = matrix (0, rows = num_features_orig, cols = 1); + do_return = 1; } } + if (do_return != 0) { + no_selected = ncol (Selected); + max_selected = max (Selected); + last = max_selected + 1; - no_selected = ncol (Selected); - max_selected = max (Selected); - last = max_selected + 1; - - if (intercept_status != 0) { + if (intercept_status != 0) { Selected_ext = cbind (Selected, as.matrix (last)); P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); if (intercept_status == 2) { - P1_ssX_beta = P1 * ssX_beta; - P2_ssX_beta = colSums (P1_ssX_beta); - P1_beta = P1 * beta; - P2_beta = colSums (P1_beta); + P1_ssX_beta = P1 * ssX_beta; + P2_ssX_beta = colSums (P1_ssX_beta); + P1_beta = P1 * beta; + P2_beta = colSums (P1_beta); - if (max_selected < num_features_orig) { + if (max_selected < num_features_orig) { - P2_ssX_beta = cbind (P2_ssX_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); - P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); + P2_ssX_beta = cbind (P2_ssX_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); + P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); - P2_ssX_beta[1, num_features_orig+1] = P2_ssX_beta[1, max_selected + 1]; - P2_ssX_beta[1, max_selected + 1] = 0; + P2_ssX_beta[1, num_features_orig+1] = P2_ssX_beta[1, max_selected + 1]; + P2_ssX_beta[1, max_selected + 1] = 0; - P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1]; - P2_beta[1, max_selected + 1] = 0; + P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1]; + P2_beta[1, max_selected + 1] = 0; - } - beta_out = cbind (t(P2_ssX_beta), t(P2_beta)); + } + beta = cbind (t(P2_ssX_beta), t(P2_beta)); } else { - P1_beta = P1 * beta; - P2_beta = colSums (P1_beta); - - if (max_selected < num_features_orig) { - P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); - P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1] ; - P2_beta[1, max_selected + 1] = 0; - } - beta_out = t(P2_beta); + P1_beta = P1 * beta; + P2_beta = colSums (P1_beta); + if (max_selected < num_features_orig) { + P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); + P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1] ; + P2_beta[1, max_selected + 1] = 0; + } + beta = t(P2_beta); } - } else { - + } else { P1 = table (seq (1, no_selected), t(Selected)); P1_beta = P1 * beta; P2_beta = colSums (P1_beta); if (max_selected < num_features_orig) { - P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); + P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); } - beta_out = t(P2_beta); + beta = t(P2_beta); + } } - - return AIC, beta_out, S, O; } glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG) - return (Matrix[double] beta, double saturated_log_l, int isNaN) - { - saturated_log_l = 0.0; - isNaN = 0; - y_corr = Y [, 1]; - if (dist_type == 2) { - n_corr = rowSums (Y); - is_n_zero = (n_corr == 0); - y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero; - } - linear_terms = y_corr; - if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION - if (link_power == 0) { + return (Matrix[double] beta, double saturated_log_l, int isNaN) + { + saturated_log_l = 0.0; + isNaN = 0; + y_corr = Y [, 1]; + if (dist_type == 2) { + n_corr = rowSums (Y); + is_n_zero = (n_corr == 0); + y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero; + } + linear_terms = y_corr; + if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION + if (link_power == 0) { + if (sum (y_corr < 0) == 0) { + is_zero_y_corr = (y_corr == 0); + linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { isNaN = 1; } + } else { if (link_power == 1.0) { + linear_terms = y_corr; + } else { if (link_power == -1.0) { + linear_terms = 1.0 / y_corr; + } else { if (link_power == 0.5) { + if (sum (y_corr < 0) == 0) { + linear_terms = sqrt (y_corr); + } else { isNaN = 1; } + } else { if (link_power > 0) { if (sum (y_corr < 0) == 0) { - is_zero_y_corr = (y_corr == 0); - linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + is_zero_y_corr = (y_corr == 0); + linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; } else { isNaN = 1; } - } else { if (link_power == 1.0) { - linear_terms = y_corr; - } else { if (link_power == -1.0) { - linear_terms = 1.0 / y_corr; - } else { if (link_power == 0.5) { - if (sum (y_corr < 0) == 0) { - linear_terms = sqrt (y_corr); - } else { isNaN = 1; } - } else { if (link_power > 0) { - if (sum (y_corr < 0) == 0) { - is_zero_y_corr = (y_corr == 0); - linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; - } else { isNaN = 1; } - } else { - if (sum (y_corr <= 0) == 0) { - linear_terms = y_corr ^ link_power; - } else { isNaN = 1; } - }}}}} - } - if (dist_type == 2 & link_type >= 1 & link_type <= 5) - { # BINOMIAL/BERNOULLI DISTRIBUTION - if (link_type == 1 & link_power == 0) { # Binomial.log - if (sum (y_corr < 0) == 0) { - is_zero_y_corr = (y_corr == 0); - linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { + if (sum (y_corr <= 0) == 0) { + linear_terms = y_corr ^ link_power; } else { isNaN = 1; } - } else { if (link_type == 1 & link_power > 0) { # Binomial.power_nonlog pos - if (sum (y_corr < 0) == 0) { - is_zero_y_corr = (y_corr == 0); - linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; - } else { isNaN = 1; } - } else { if (link_type == 1) { # Binomial.power_nonlog neg - if (sum (y_corr <= 0) == 0) { - linear_terms = y_corr ^ link_power; - } else { isNaN = 1; } - } else { - is_zero_y_corr = (y_corr <= 0); - is_one_y_corr = (y_corr >= 1.0); - y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr); - if (link_type == 2) { # Binomial.logit - linear_terms = log (y_corr / (1.0 - y_corr)) - + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - } else { if (link_type == 3) { # Binomial.probit - y_below_half = y_corr + (1.0 - 2.0 * y_corr) * (y_corr > 0.5); - t = sqrt (- 2.0 * log (y_below_half)); - approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308))); - linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * (y_corr > 0.5)) - + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - } else { if (link_type == 4) { # Binomial.cloglog - linear_terms = log (- log (1.0 - y_corr)) - - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr) - + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - } else { if (link_type == 5) { # Binomial.cauchit - linear_terms = tan ((y_corr - 0.5) * pi) - + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - }} }}}}} - } + }}}}} + } + if (dist_type == 2 & link_type >= 1 & link_type <= 5) + { # BINOMIAL/BERNOULLI DISTRIBUTION + if (link_type == 1 & link_power == 0) { # Binomial.log + if (sum (y_corr < 0) == 0) { + is_zero_y_corr = (y_corr == 0); + linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { isNaN = 1; } + } else { if (link_type == 1 & link_power > 0) { # Binomial.power_nonlog pos + if (sum (y_corr < 0) == 0) { + is_zero_y_corr = (y_corr == 0); + linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; + } else { isNaN = 1; } + } else { if (link_type == 1) { # Binomial.power_nonlog neg + if (sum (y_corr <= 0) == 0) { + linear_terms = y_corr ^ link_power; + } else { isNaN = 1; } + } else { + is_zero_y_corr = (y_corr <= 0); + is_one_y_corr = (y_corr >= 1.0); + y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr); + if (link_type == 2) { # Binomial.logit + linear_terms = log (y_corr / (1.0 - y_corr)) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { if (link_type == 3) { # Binomial.probit + y_below_half = y_corr + (1.0 - 2.0 * y_corr) * (y_corr > 0.5); + t = sqrt (- 2.0 * log (y_below_half)); + approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308))); + linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * (y_corr > 0.5)) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { if (link_type == 4) { # Binomial.cloglog + linear_terms = log (- log (1.0 - y_corr)) + - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { if (link_type == 5) { # Binomial.cauchit + linear_terms = tan ((y_corr - 0.5) * pi) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + }} }}}}} + } - if (isNaN == 0) { - [saturated_log_l, isNaN] = - glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power); - } + if (isNaN == 0) { + [saturated_log_l, isNaN] = + glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power); + } - if ((dist_type == 1 & link_type == 1 & link_power == 0) | - (dist_type == 2 & link_type >= 2)) - { - desired_eta = 0.0; - } else { if (link_type == 1 & link_power == 0) { - desired_eta = log (0.5); - } else { if (link_type == 1) { - desired_eta = 0.5 ^ link_power; - } else { - desired_eta = 0.5; - }}} + if ((dist_type == 1 & link_type == 1 & link_power == 0) | + (dist_type == 2 & link_type >= 2)) + { + desired_eta = 0.0; + } else { if (link_type == 1 & link_power == 0) { + desired_eta = log (0.5); + } else { if (link_type == 1) { + desired_eta = 0.5 ^ link_power; + } else { + desired_eta = 0.5; + }}} - beta = matrix (0.0, rows = ncol(X), cols = 1); + beta = matrix (0.0, rows = ncol(X), cols = 1); - if (desired_eta != 0) { - if (icept_status == 1 | icept_status == 2) { - beta [nrow(beta), 1] = desired_eta; - } else { - # We want: avg (X %*% ssX_transform %*% beta) = desired_eta - # Note that "ssX_transform" is trivial here, hence ignored + if (desired_eta != 0) { + if (icept_status == 1 | icept_status == 2) { + beta [nrow(beta), 1] = desired_eta; + } else { + # We want: avg (X %*% ssX_transform %*% beta) = desired_eta + # Note that "ssX_transform" is trivial here, hence ignored - beta = straightenX (X, 0.000001, max_iter_CG); - beta = beta * desired_eta; - } } } + beta = straightenX (X, 0.000001, max_iter_CG); + beta = beta * desired_eta; + } } } glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power) - return (Matrix[double] g_Y, Matrix[double] w) + return (Matrix[double] g_Y, Matrix[double] w) # ORIGINALLY we returned more meaningful vectors, namely: # Matrix[double] y_residual : y - y_mean, i.e. y observed - y predicted # Matrix[double] link_gradient : derivative of the link function @@ -754,312 +750,312 @@ glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y, # and skip over the "meaningful intermediaries". Now we output these two variables: # g_Y = y_residual / (var_function * link_gradient); # w = 1.0 / (var_function * link_gradient ^ 2); - { - num_records = nrow (linear_terms); - zeros_r = matrix (0.0, rows = num_records, cols = 1); - ones_r = 1 + zeros_r; - g_Y = zeros_r; - w = zeros_r; - - # Some constants - - one_over_sqrt_two_pi = 0.39894228040143267793994605993438; - ones_2 = matrix (1.0, rows = 1, cols = 2); - p_one_m_one = ones_2; - p_one_m_one [1, 2] = -1.0; - m_one_p_one = ones_2; - m_one_p_one [1, 1] = -1.0; - zero_one = ones_2; - zero_one [1, 1] = 0.0; - one_zero = ones_2; - one_zero [1, 2] = 0.0; - flip_pos = matrix (0, rows = 2, cols = 2); - flip_neg = flip_pos; - flip_pos [1, 2] = 1; - flip_pos [2, 1] = 1; - flip_neg [1, 2] = -1; - flip_neg [2, 1] = 1; - - if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION - y_mean = zeros_r; - if (link_power == 0) { - y_mean = exp (linear_terms); - y_mean_pow = y_mean ^ (1 - var_power); - w = y_mean_pow * y_mean; - g_Y = y_mean_pow * (Y - y_mean); - } else { if (link_power == 1.0) { - y_mean = linear_terms; - w = y_mean ^ (- var_power); - g_Y = w * (Y - y_mean); - } else { - y_mean = linear_terms ^ (1.0 / link_power); - c1 = (1 - var_power) / link_power - 1; - c2 = (2 - var_power) / link_power - 2; - g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power; - w = (linear_terms ^ c2) / (link_power ^ 2); - } }} - if (dist_type == 2 & link_type >= 1 & link_type <= 5) - { # BINOMIAL/BERNOULLI DISTRIBUTION - if (link_type == 1) { # BINOMIAL.POWER LINKS - if (link_power == 0) { # Binomial.log - vec1 = 1 / (exp (- linear_terms) - 1); - g_Y = Y [, 1] - Y [, 2] * vec1; - w = rowSums (Y) * vec1; - } else { # Binomial.nonlog - vec1 = zeros_r; - if (link_power == 0.5) { - vec1 = 1 / (1 - linear_terms ^ 2); - } else { if (sum (linear_terms < 0) == 0) { - vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power)); - } else {isNaN = 1;}} - # We want a "zero-protected" version of - # vec2 = Y [, 1] / linear_terms; - is_y_0 = (Y [, 1] == 0); - vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0; - g_Y = (vec2 - Y [, 2] * vec1 * linear_terms) / link_power; - w = rowSums (Y) * vec1 / link_power ^ 2; - } - } else { - is_LT_pos_infinite = (linear_terms == Inf); - is_LT_neg_infinite = (linear_terms == -Inf); - is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; - finite_linear_terms = replace (target = linear_terms, pattern = Inf, replacement = 0); - finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0); - if (link_type == 2) { # Binomial.logit - Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; - Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); - Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; - g_Y = rowSums (Y * (Y_prob %*% flip_neg)); ### = y_residual; - w = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob); ### = y_variance; - } else { if (link_type == 3) { # Binomial.probit - is_lt_pos = (linear_terms >= 0); - t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) - pt_gp = t_gp * ( 0.254829592 - + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, - + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 - + t_gp * (-1.453152027 - + t_gp * 1.061405429)))); - the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0); - vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp); - vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5); - w = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1; - g_Y = one_over_sqrt_two_pi * vec2 / vec1; - } else { if (link_type == 4) { # Binomial.cloglog - the_exp = exp (linear_terms) - the_exp_exp = exp (- the_exp); - is_too_small = ((10000000 + the_exp) == 10000000); - the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2); - g_Y = (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio; - w = the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio; - } else { if (link_type == 5) { # Binomial.cauchit - Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi; - Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; - y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1]; - var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2]; - link_gradient_normalized = (1 + linear_terms ^ 2) * pi; - g_Y = rowSums (Y) * y_residual / (var_function * link_gradient_normalized); - w = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2); - }}}} - } + { + num_records = nrow (linear_terms); + zeros_r = matrix (0.0, rows = num_records, cols = 1); + ones_r = 1 + zeros_r; + g_Y = zeros_r; + w = zeros_r; + + # Some constants + + one_over_sqrt_two_pi = 0.39894228040143267793994605993438; + ones_2 = matrix (1.0, rows = 1, cols = 2); + p_one_m_one = ones_2; + p_one_m_one [1, 2] = -1.0; + m_one_p_one = ones_2; + m_one_p_one [1, 1] = -1.0; + zero_one = ones_2; + zero_one [1, 1] = 0.0; + one_zero = ones_2; + one_zero [1, 2] = 0.0; + flip_pos = matrix (0, rows = 2, cols = 2); + flip_neg = flip_pos; + flip_pos [1, 2] = 1; + flip_pos [2, 1] = 1; + flip_neg [1, 2] = -1; + flip_neg [2, 1] = 1; + + if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION + y_mean = zeros_r; + if (link_power == 0) { + y_mean = exp (linear_terms); + y_mean_pow = y_mean ^ (1 - var_power); + w = y_mean_pow * y_mean; + g_Y = y_mean_pow * (Y - y_mean); + } else { if (link_power == 1.0) { + y_mean = linear_terms; + w = y_mean ^ (- var_power); + g_Y = w * (Y - y_mean); + } else { + y_mean = linear_terms ^ (1.0 / link_power); + c1 = (1 - var_power) / link_power - 1; + c2 = (2 - var_power) / link_power - 2; + g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power; + w = (linear_terms ^ c2) / (link_power ^ 2); + } }} + if (dist_type == 2 & link_type >= 1 & link_type <= 5) + { # BINOMIAL/BERNOULLI DISTRIBUTION + if (link_type == 1) { # BINOMIAL.POWER LINKS + if (link_power == 0) { # Binomial.log + vec1 = 1 / (exp (- linear_terms) - 1); + g_Y = Y [, 1] - Y [, 2] * vec1; + w = rowSums (Y) * vec1; + } else { # Binomial.nonlog + vec1 = zeros_r; + if (link_power == 0.5) { + vec1 = 1 / (1 - linear_terms ^ 2); + } else { if (sum (linear_terms < 0) == 0) { + vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power)); + } else {isNaN = 1;}} + # We want a "zero-protected" version of + # vec2 = Y [, 1] / linear_terms; + is_y_0 = (Y [, 1] == 0); + vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0; + g_Y = (vec2 - Y [, 2] * vec1 * linear_terms) / link_power; + w = rowSums (Y) * vec1 / link_power ^ 2; } + } else { + is_LT_pos_infinite = (linear_terms == Inf); + is_LT_neg_infinite = (linear_terms == -Inf); + is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; + finite_linear_terms = replace (target = linear_terms, pattern = Inf, replacement = 0); + finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0); + if (link_type == 2) { # Binomial.logit + Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; + Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); + Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; + g_Y = rowSums (Y * (Y_prob %*% flip_neg)); ### = y_residual; + w = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob); ### = y_variance; + } else { if (link_type == 3) { # Binomial.probit + is_lt_pos = (linear_terms >= 0); + t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) + pt_gp = t_gp * ( 0.254829592 + + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, + + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 + + t_gp * (-1.453152027 + + t_gp * 1.061405429)))); + the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0); + vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp); + vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5); + w = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1; + g_Y = one_over_sqrt_two_pi * vec2 / vec1; + } else { if (link_type == 4) { # Binomial.cloglog + the_exp = exp (linear_terms) + the_exp_exp = exp (- the_exp); + is_too_small = ((10000000 + the_exp) == 10000000); + the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2); + g_Y = (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio; + w = the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio; + } else { if (link_type == 5) { # Binomial.cauchit + Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi; + Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; + y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1]; + var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2]; + link_gradient_normalized = (1 + linear_terms ^ 2) * pi; + g_Y = rowSums (Y) * y_residual / (var_function * link_gradient_normalized); + w = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2); + }}}} + } } + } glm_log_likelihood_part = function (Matrix[double] linear_terms, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power) - return (double log_l, int isNaN) - { - isNaN = 0; - log_l = 0.0; - num_records = nrow (Y); - zeros_r = matrix (0.0, rows = num_records, cols = 1); - - if (dist_type == 1 & link_type == 1) - { # POWER DISTRIBUTION - b_cumulant = zeros_r; - natural_parameters = zeros_r; - is_natural_parameter_log_zero = zeros_r; - if (var_power == 1.0 & link_power == 0) { # Poisson.log - b_cumulant = exp (linear_terms); - is_natural_parameter_log_zero = (linear_terms == -Inf); - natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0); - } else { if (var_power == 1.0 & link_power == 1.0) { # Poisson.id - if (sum (linear_terms < 0) == 0) { - b_cumulant = linear_terms; - is_natural_parameter_log_zero = (linear_terms == 0); - natural_parameters = log (linear_terms + is_natural_parameter_log_zero); + return (double log_l, int isNaN) + { + isNaN = 0; + log_l = 0.0; + num_records = nrow (Y); + zeros_r = matrix (0.0, rows = num_records, cols = 1); + + if (dist_type == 1 & link_type == 1) + { # POWER DISTRIBUTION + b_cumulant = zeros_r; + natural_parameters = zeros_r; + is_natural_parameter_log_zero = zeros_r; + if (var_power == 1.0 & link_power == 0) { # Poisson.log + b_cumulant = exp (linear_terms); + is_natural_parameter_log_zero = (linear_terms == -Inf); + natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0); + } else { if (var_power == 1.0 & link_power == 1.0) { # Poisson.id + if (sum (linear_terms < 0) == 0) { + b_cumulant = linear_terms; + is_natural_parameter_log_zero = (linear_terms == 0); + natural_parameters = log (linear_terms + is_natural_parameter_log_zero); + } else {isNaN = 1;} + } else { if (var_power == 1.0 & link_power == 0.5) { # Poisson.sqrt + if (sum (linear_terms < 0) == 0) { + b_cumulant = linear_terms ^ 2; + is_natural_parameter_log_zero = (linear_terms == 0); + natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero); + } else {isNaN = 1;} + } else { if (var_power == 1.0 & link_power > 0) { # Poisson.power_nonlog, pos + if (sum (linear_terms < 0) == 0) { + is_natural_parameter_log_zero = (linear_terms == 0); + b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero; + natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power; + } else {isNaN = 1;} + } else { if (var_power == 1.0) { # Poisson.power_nonlog, neg + if (sum (linear_terms <= 0) == 0) { + b_cumulant = linear_terms ^ (1.0 / link_power); + natural_parameters = log (linear_terms) / link_power; + } else {isNaN = 1;} + } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse + if (sum (linear_terms <= 0) == 0) { + b_cumulant = - log (linear_terms); + natural_parameters = - linear_terms; + } else {isNaN = 1;} + } else { if (var_power == 2.0 & link_power == 1.0) { # Gamma.id + if (sum (linear_terms <= 0) == 0) { + b_cumulant = log (linear_terms); + natural_parameters = - 1.0 / linear_terms; } else {isNaN = 1;} - } else { if (var_power == 1.0 & link_power == 0.5) { # Poisson.sqrt - if (sum (linear_terms < 0) == 0) { - b_cumulant = linear_terms ^ 2; - is_natural_parameter_log_zero = (linear_terms == 0); - natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero); + } else { if (var_power == 2.0 & link_power == 0) { # Gamma.log + b_cumulant = linear_terms; + natural_parameters = - exp (- linear_terms); + } else { if (var_power == 2.0) { # Gamma.power_nonlog + if (sum (linear_terms <= 0) == 0) { + b_cumulant = log (linear_terms) / link_power; + natural_parameters = - linear_terms ^ (- 1.0 / link_power); } else {isNaN = 1;} - } else { if (var_power == 1.0 & link_power > 0) { # Poisson.power_nonlog, pos - if (sum (linear_terms < 0) == 0) { - is_natural_parameter_log_zero = (linear_terms == 0); - b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero; - natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power; - } else {isNaN = 1;} - } else { if (var_power == 1.0) { # Poisson.power_nonlog, neg - if (sum (linear_terms <= 0) == 0) { - b_cumulant = linear_terms ^ (1.0 / link_power); - natural_parameters = log (linear_terms) / link_power; - } else {isNaN = 1;} - } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse - if (sum (linear_terms <= 0) == 0) { - b_cumulant = - log (linear_terms); - natural_parameters = - linear_terms; - } else {isNaN = 1;} - } else { if (var_power == 2.0 & link_power == 1.0) { # Gamma.id - if (sum (linear_terms <= 0) == 0) { - b_cumulant = log (linear_terms); - natural_parameters = - 1.0 / linear_terms; - } else {isNaN = 1;} - } else { if (var_power == 2.0 & link_power == 0) { # Gamma.log - b_cumulant = linear_terms; - natural_parameters = - exp (- linear_terms); - } else { if (var_power == 2.0) { # Gamma.power_nonlog - if (sum (linear_terms <= 0) == 0) { - b_cumulant = log (linear_terms) / link_power; - natural_parameters = - linear_terms ^ (- 1.0 / link_power); - } else {isNaN = 1;} - } else { if (link_power == 0) { # PowerDist.log - natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power); - b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power); - } else { # PowerDist.power_nonlog - if (-2 * link_power == 1.0 - var_power) { - natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power); - } else { if (-1 * link_power == 1.0 - var_power) { - natural_parameters = 1.0 / linear_terms / (1.0 - var_power); - } else { if ( link_power == 1.0 - var_power) { - natural_parameters = linear_terms / (1.0 - var_power); - } else { if ( 2 * link_power == 1.0 - var_power) { - natural_parameters = linear_terms ^ 2 / (1.0 - var_power); - } else { - if (sum (linear_terms <= 0) == 0) { - power = (1.0 - var_power) / link_power; - natural_parameters = (linear_terms ^ power) / (1.0 - var_power); - } else {isNaN = 1;} - }}}} - if (-2 * link_power == 2.0 - var_power) { - b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power); - } else { if (-1 * link_power == 2.0 - var_power) { - b_cumulant = 1.0 / linear_terms / (2.0 - var_power); - } else { if ( link_power == 2.0 - var_power) { - b_cumulant = linear_terms / (2.0 - var_power); - } else { if ( 2 * link_power == 2.0 - var_power) { - b_cumulant = linear_terms ^ 2 / (2.0 - var_power); - } else { - if (sum (linear_terms <= 0) == 0) { - power = (2.0 - var_power) / link_power; - b_cumulant = (linear_terms ^ power) / (2.0 - var_power); - } else {isNaN = 1;} - }}}} - }}}}} }}}}} - if (sum (is_natural_parameter_log_zero * abs (Y)) > 0) { - log_l = -Inf; - isNaN = 1; - } - if (isNaN == 0) - { - log_l = sum (Y * natural_parameters - b_cumulant); - if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { - log_l = -Inf; - isNaN = 1; - } } } - - if (dist_type == 2 & link_type >= 1 & link_type <= 5) - { # BINOMIAL/BERNOULLI DISTRIBUTION - - [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power); - - if (isNaN == 0) { - does_prob_contradict = (Y_prob <= 0); - if (sum (does_prob_contradict * abs (Y)) == 0) { - log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict)); - if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { - isNaN = 1; - } - } else { - log_l = -Inf; - isNaN = 1; - } } } + } else { if (link_power == 0) { # PowerDist.log + natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power); + b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power); + } else { # PowerDist.power_nonlog + if (-2 * link_power == 1.0 - var_power) { + natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power); + } else { if (-1 * link_power == 1.0 - var_power) { + natural_parameters = 1.0 / linear_terms / (1.0 - var_power); + } else { if ( link_power == 1.0 - var_power) { + natural_parameters = linear_terms / (1.0 - var_power); + } else { if ( 2 * link_power == 1.0 - var_power) { + natural_parameters = linear_terms ^ 2 / (1.0 - var_power); + } else { + if (sum (linear_terms <= 0) == 0) { + power = (1.0 - var_power) / link_power; + natural_parameters = (linear_terms ^ power) / (1.0 - var_power); + } else {isNaN = 1;} + }}}} + if (-2 * link_power == 2.0 - var_power) { + b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power); + } else { if (-1 * link_power == 2.0 - var_power) { + b_cumulant = 1.0 / linear_terms / (2.0 - var_power); + } else { if ( link_power == 2.0 - var_power) { + b_cumulant = linear_terms / (2.0 - var_power); + } else { if ( 2 * link_power == 2.0 - var_power) { + b_cumulant = linear_terms ^ 2 / (2.0 - var_power); + } else { + if (sum (linear_terms <= 0) == 0) { + power = (2.0 - var_power) / link_power; + b_cumulant = (linear_terms ^ power) / (2.0 - var_power); + } else {isNaN = 1;} + }}}} + }}}}} }}}}} + if (sum (is_natural_parameter_log_zero * abs (Y)) > 0) { + log_l = -Inf; + isNaN = 1; + } + if (isNaN == 0) + { + log_l = sum (Y * natural_parameters - b_cumulant); + if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { + log_l = -Inf; + isNaN = 1; + } } } - if (isNaN == 1) { - log_l = - Inf; - } + if (dist_type == 2 & link_type >= 1 & link_type <= 5) + { # BINOMIAL/BERNOULLI DISTRIBUTION + + [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power); + + if (isNaN == 0) { + does_prob_contradict = (Y_prob <= 0); + if (sum (does_prob_contradict * abs (Y)) == 0) { + log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict)); + if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { + isNaN = 1; + } + } else { + log_l = -Inf; + isNaN = 1; + } } } + + if (isNaN == 1) { + log_l = - Inf; } + } binomial_probability_two_column = - function (Matrix[double] linear_terms, int link_type, double link_power) - return (Matrix[double] Y_prob, int isNaN) - { - isNaN = 0; - num_records = nrow (linear_terms); - - # Define some auxiliary matrices - - ones_2 = matrix (1.0, rows = 1, cols = 2); - p_one_m_one = ones_2; - p_one_m_one [1, 2] = -1.0; - m_one_p_one = ones_2; - m_one_p_one [1, 1] = -1.0; - zero_one = ones_2; - zero_one [1, 1] = 0.0; - one_zero = ones_2; - one_zero [1, 2] = 0.0; - - zeros_r = matrix (0.0, rows = num_records, cols = 1); - ones_r = 1.0 + zeros_r; - - # Begin the function body - - Y_prob = zeros_r %*% ones_2; - if (link_type == 1) { # Binomial.power - if (link_power == 0) { # Binomial.log - Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one; - } else { if (link_power == 0.5) { # Binomial.sqrt - Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one; - } else { # Binomial.power_nonlog - if (sum (linear_terms < 0) == 0) { - Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one; - } else {isNaN = 1;} - }} - } else { # Binomial.non_power - is_LT_pos_infinite = (linear_terms == Inf); - is_LT_neg_infinite = (linear_terms == -Inf); - is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; - finite_linear_terms = replace (target = linear_terms, pattern = Inf, replacement = 0); - finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0); - if (link_type == 2) { # Binomial.logit - Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; - Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); - } else { if (link_type == 3) { # Binomial.probit - lt_pos_neg = (finite_linear_terms >= 0) %*% p_one_m_one + ones_r %*% zero_one; - t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) - pt_gp = t_gp * ( 0.254829592 - + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, - + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 - + t_gp * (-1.453152027 - + t_gp * 1.061405429)))); - the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0); - Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg); - } else { if (link_type == 4) { # Binomial.cloglog - the_exp = exp (finite_linear_terms); - the_exp_exp = exp (- the_exp); - is_too_small = ((10000000 + the_exp) == 10000000); - Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2); - Y_prob [, 2] = the_exp_exp; - } else { if (link_type == 5) { # Binomial.cauchit - Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi; - } else { - isNaN = 1; - }}}} - Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; - } } + function (Matrix[double] linear_terms, int link_type, double link_power) + return (Matrix[double] Y_prob, int isNaN) + { + isNaN = 0; + num_records = nrow (linear_terms); + + # Define some auxiliary matrices + + ones_2 = matrix (1.0, rows = 1, cols = 2); + p_one_m_one = ones_2; + p_one_m_one [1, 2] = -1.0; + m_one_p_one = ones_2; + m_one_p_one [1, 1] = -1.0; + zero_one = ones_2; + zero_one [1, 1] = 0.0; + one_zero = ones_2; + one_zero [1, 2] = 0.0; + + zeros_r = matrix (0.0, rows = num_records, cols = 1); + ones_r = 1.0 + zeros_r; + + # Begin the function body + + Y_prob = zeros_r %*% ones_2; + if (link_type == 1) { # Binomial.power + if (link_power == 0) { # Binomial.log + Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one; + } else { if (link_power == 0.5) { # Binomial.sqrt + Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one; + } else { # Binomial.power_nonlog + if (sum (linear_terms < 0) == 0) { + Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one; + } else {isNaN = 1;} + }} + } else { # Binomial.non_power + is_LT_pos_infinite = (linear_terms == Inf); + is_LT_neg_infinite = (linear_terms == -Inf); + is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; + finite_linear_terms = replace (target = linear_terms, pattern = Inf, replacement = 0); + finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0); + if (link_type == 2) { # Binomial.logit + Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; + Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); + } else { if (link_type == 3) { # Binomial.probit + lt_pos_neg = (finite_linear_terms >= 0) %*% p_one_m_one + ones_r %*% zero_one; + t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) + pt_gp = t_gp * ( 0.254829592 + + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, + + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 + + t_gp * (-1.453152027 + + t_gp * 1.061405429)))); + the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0); + Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg); + } else { if (link_type == 4) { # Binomial.cloglog + the_exp = exp (finite_linear_terms); + the_exp_exp = exp (- the_exp); + is_too_small = ((10000000 + the_exp) == 10000000); + Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2); + Y_prob [, 2] = the_exp_exp; + } else { if (link_type == 5) { # Binomial.cauchit + Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi; + } else { + isNaN = 1; + }}}} + Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; + } } # THE CG-STEIHAUG PROCEDURE SCRIPT @@ -1072,130 +1068,130 @@ binomial_probability_two_column = # is given separately because sparse "X" may become dense after applying the transform. # get_CG_Steihaug_point = - function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w, - Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG) - return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary) - { - trust_delta_sq = trust_delta ^ 2; - size_CG = nrow (g); - z = matrix (0.0, rows = size_CG, cols = 1); - neg_log_l_change = 0.0; - reached_trust_boundary = 0; - g_reg = g + lambda * beta; - r_CG = g_reg; - p_CG = -r_CG; + function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w, + Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG) + return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary) + { + trust_delta_sq = trust_delta ^ 2; + size_CG = nrow (g); + z = matrix (0.0, rows = size_CG, cols = 1); + neg_log_l_change = 0.0; + reached_trust_boundary = 0; + g_reg = g + lambda * beta; + r_CG = g_reg; + p_CG = -r_CG; + rr_CG = sum(r_CG * r_CG); + eps_CG = rr_CG * min (0.25, sqrt (rr_CG)); + converged_CG = 0; + if (rr_CG < eps_CG) { + converged_CG = 1; + } + + max_iteration_CG = max_iter_CG; + if (max_iteration_CG <= 0) { + max_iteration_CG = size_CG; + } + i_CG = 0; + while (converged_CG == 0) + { + i_CG = i_CG + 1; + ssX_p_CG = diag (scale_X) %*% p_CG; + ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG; + temp_CG = t(X) %*% (w * (X %*% ssX_p_CG)); + q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ]; + pq_CG = sum (p_CG * q_CG); + if (pq_CG <= 0) { + pp_CG = sum (p_CG * p_CG); + if (pp_CG > 0) { + [z, neg_log_l_change] = + get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); + reached_trust_boundary = 1; + } else { + neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); + } + converged_CG = 1; + } + if (converged_CG == 0) { + alpha_CG = rr_CG / pq_CG; + new_z = z + alpha_CG * p_CG; + if (sum(new_z * new_z) >= trust_delta_sq) { + pp_CG = sum (p_CG * p_CG); + [z, neg_log_l_change] = + get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); + reached_trust_boundary = 1; + converged_CG = 1; + } + if (converged_CG == 0) { + z = new_z; + old_rr_CG = rr_CG; + r_CG = r_CG + alpha_CG * q_CG; rr_CG = sum(r_CG * r_CG); - eps_CG = rr_CG * min (0.25, sqrt (rr_CG)); - converged_CG = 0; - if (rr_CG < eps_CG) { - converged_CG = 1; - } - - max_iteration_CG = max_iter_CG; - if (max_iteration_CG <= 0) { - max_iteration_CG = size_CG; + if (i_CG == max_iteration_CG | rr_CG < eps_CG) { + neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); + reached_trust_boundary = 0; + converged_CG = 1; } - i_CG = 0; - while (converged_CG == 0) - { - i_CG = i_CG + 1; - ssX_p_CG = diag (scale_X) %*% p_CG; - ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG; - temp_CG = t(X) %*% (w * (X %*% ssX_p_CG)); - q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ]; - pq_CG = sum (p_CG * q_CG); - if (pq_CG <= 0) { - pp_CG = sum (p_CG * p_CG); - if (pp_CG > 0) { - [z, neg_log_l_change] = - get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); - reached_trust_boundary = 1; - } else { - neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); - } - converged_CG = 1; - } - if (converged_CG == 0) { - alpha_CG = rr_CG / pq_CG; - new_z = z + alpha_CG * p_CG; - if (sum(new_z * new_z) >= trust_delta_sq) { - pp_CG = sum (p_CG * p_CG); - [z, neg_log_l_change] = - get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); - reached_trust_boundary = 1; - converged_CG = 1; - } - if (converged_CG == 0) { - z = new_z; - old_rr_CG = rr_CG; - r_CG = r_CG + alpha_CG * q_CG; - rr_CG = sum(r_CG * r_CG); - if (i_CG == max_iteration_CG | rr_CG < eps_CG) { - neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); - reached_trust_boundary = 0; - converged_CG = 1; - } - if (converged_CG == 0) { - p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG; - } } } } } + if (converged_CG == 0) { + p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG; + } } } } } # An auxiliary function used twice inside the CG-STEIHAUG loop: get_trust_boundary_point = - function (Matrix[double] g, Matrix[double] z, Matrix[double] p, - Matrix[double] q, Matrix[double] r, double pp, double pq, - double trust_delta_sq) - return (Matrix[double] new_z, double f_change) - { - zz = sum (z * z); pz = sum (p * z); - sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq)); - tau_1 = (- pz + sq_root_d) / pp; - tau_2 = (- pz - sq_root_d) / pp; - zq = sum (z * q); gp = sum (g * p); - f_extra = 0.5 * sum (z * (r + g)); - f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1; - f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2; - ind1 = as.integer(f_change_1 < f_change_2); - ind2 = as.integer(f_change_1 >= f_change_2); - new_z = z + ((ind1 * tau_1 + ind2 * tau_2) * p); - f_change = ind1 * f_change_1 + ind2 * f_change_2; - } + function (Matrix[double] g, Matrix[double] z, Matrix[double] p, + Matrix[double] q, Matrix[double] r, double pp, double pq, + double trust_delta_sq) + return (Matrix[double] new_z, double f_change) + { + zz = sum (z * z); pz = sum (p * z); + sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq)); + tau_1 = (- pz + sq_root_d) / pp; + tau_2 = (- pz - sq_root_d) / pp; + zq = sum (z * q); gp = sum (g * p); + f_extra = 0.5 * sum (z * (r + g)); + f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1; + f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2; + ind1 = as.integer(f_change_1 < f_change_2); + ind2 = as.integer(f_change_1 >= f_change_2); + new_z = z + ((ind1 * tau_1 + ind2 * tau_2) * p); + f_change = ind1 * f_change_1 + ind2 * f_change_2; + } # Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1 # We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale # it to compute w = c * z_LS such that sum(X %*% w) = nrow(X). straightenX = - function (Matrix[double] X, double eps, int max_iter_CG) - return (Matrix[double] w) - { - w_X = t(colSums(X)); - lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X); - eps_LS = eps * nrow(X); - - # BEGIN LEAST SQUARES - - r_LS = - w_X; - z_LS = matrix (0.0, rows = ncol(X), cols = 1); - p_LS = - r_LS; - norm_r2_LS = sum (r_LS ^ 2); - i_LS = 0; - while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS) - { - q_LS = t(X) %*% X %*% p_LS; - q_LS = q_LS + lambda_LS * p_LS; - alpha_LS = norm_r2_LS / sum (p_LS * q_LS); - z_LS = z_LS + alpha_LS * p_LS; - old_norm_r2_LS = norm_r2_LS; - r_LS = r_LS + alpha_LS * q_LS; - norm_r2_LS = sum (r_LS ^ 2); - p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS; - i_LS = i_LS + 1; - } - - # END LEAST SQUARES - - w = (nrow(X) / sum (w_X * z_LS)) * z_LS; - } + function (Matrix[double] X, double eps, int max_iter_CG) + return (Matrix[double] w) + { + w_X = t(colSums(X)); + lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X); + eps_LS = eps * nrow(X); + + # BEGIN LEAST SQUARES + + r_LS = - w_X; + z_LS = matrix (0.0, rows = ncol(X), cols = 1); + p_LS = - r_LS; + norm_r2_LS = sum (r_LS ^ 2); + i_LS = 0; + while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS) + { + q_LS = t(X) %*% X %*% p_LS; + q_LS = q_LS + lambda_LS * p_LS; + alpha_LS = norm_r2_LS / sum (p_LS * q_LS); + z_LS = z_LS + alpha_LS * p_LS; + old_norm_r2_LS = norm_r2_LS; + r_LS = r_LS + alpha_LS * q_LS; + norm_r2_LS = sum (r_LS ^ 2); + p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS; + i_LS = i_LS + 1; + } + + # END LEAST SQUARES + + w = (nrow(X) / sum (w_X * z_LS)) * z_LS; + } From e60c2dd1ab3c1b52f947463cd6a2515db21de8c1 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 26 Jun 2026 05:48:34 +0200 Subject: [PATCH 05/12] fix glm_fit call syntax --- scripts/builtin/stepGLM.dml | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/scripts/builtin/stepGLM.dml b/scripts/builtin/stepGLM.dml index c8be7e582b1..0431b27ddde 100644 --- a/scripts/builtin/stepGLM.dml +++ b/scripts/builtin/stepGLM.dml @@ -134,18 +134,18 @@ stepGLM = function ( if (intercept_status == 0) { # Compute AIC of an empty model with no features and no intercept (all Ys are zero) - [AIC_best, ignore_B1, ignore_S1, ignore_O1] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered); + [AIC_best, ignore_B1, ignore_S1, ignore_O1] = glm_fit (X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } else { # compute AIC of an empty model with only intercept (all Ys are constant) all_ones = matrix (1, rows = num_records, cols = 1); - [AIC_best, ignore_beta2, ignore_S2, ignore_O2] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered); + [AIC_best, ignore_beta2, ignore_S2, ignore_O2] = glm_fit (X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } #print ("Best AIC without any features: " + AIC_best); # First pass to examine single features AICs = matrix (AIC_best, rows = 1, cols = num_features); parfor (i in 1:num_features) { - [AIC_1, ignore_beta3, ignore_S3, ignore_O3] = glm_fit (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered); + [AIC_1, ignore_beta3, ignore_S3, ignore_O3] = glm_fit (X=X_orig[,i], Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); AICs[1,i] = AIC_1; } @@ -163,11 +163,11 @@ stepGLM = function ( #print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!"); if (intercept_status == 0) { # Compute AIC of an empty model with no features and no intercept (all Ys are zero) - [AIC_best, ignore_beta4, ignore_S4, ignore_O4] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered); + [AIC_best, ignore_beta4, ignore_S4, ignore_O4] = glm_fit (X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } else { # compute AIC of an empty model with only intercept (all Ys are constant) ###all_ones = matrix (1, rows = num_records, cols = 1); - [AIC_best, ignore_beta5, ignore_S5, ignore_O5] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered); + [AIC_best, ignore_beta5, ignore_S5, ignore_O5] = glm_fit (X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } }; @@ -184,7 +184,7 @@ stepGLM = function ( # Construct the feature matrix X_loop = cbind (X_global, X_orig[,i]); - [AIC_2, ignore_beta6, ignore_S6, ignore_O6] = glm_fit (X_loop, Y, intercept_status, num_features, columns_fixed_ordered); + [AIC_2, ignore_beta6, ignore_S6, ignore_O6] = glm_fit (X=X_loop, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); AICs[1,i] = AIC_2; } } @@ -216,7 +216,7 @@ stepGLM = function ( # run GLM with selected set of features print ("Running GLM with selected features..."); - [AIC, B, S, O] = glm_fit (X_global, Y, intercept_status, num_features, columns_fixed_ordered); + [AIC, B, S, O] = glm_fit (X=X_global, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } @@ -834,7 +834,7 @@ glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y, + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 + t_gp * (-1.453152027 - + t_gp * 1.061405429)))); + + t_gp * 1.061405429)))); the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0); vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp); vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5); @@ -1040,7 +1040,7 @@ binomial_probability_two_column = + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 + t_gp * (-1.453152027 - + t_gp * 1.061405429)))); + + t_gp * 1.061405429)))); the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0); Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg); } else { if (link_type == 4) { # Binomial.cloglog @@ -1193,5 +1193,3 @@ straightenX = w = (nrow(X) / sum (w_X * z_LS)) * z_LS; } - - From 9de039d912e842bdeaec7b8a2231550893e6258a Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 26 Jun 2026 06:20:55 +0200 Subject: [PATCH 06/12] remove fit_glm --- scripts/builtin/stepGLM.dml | 973 ------------------------------------ 1 file changed, 973 deletions(-) diff --git a/scripts/builtin/stepGLM.dml b/scripts/builtin/stepGLM.dml index 0431b27ddde..3ec1cffcb7f 100644 --- a/scripts/builtin/stepGLM.dml +++ b/scripts/builtin/stepGLM.dml @@ -220,976 +220,3 @@ stepGLM = function ( } -################### UDFS USED IN THIS SCRIPT ################## - -glm_fit = function ( - Matrix[Double] X, - Matrix[Double] Y, - Int intercept_status, - Double num_features_orig, - Matrix[Double] Selected, - Int link = 2, - Double disp = 0.0, - Double tol = 0.000001, - Int moi = 200, - Int mii = 0 -) - return ( - Double AIC, - Matrix[Double] beta, - Matrix[Double] S, - String O - ) - { - - # distribution family code: 1 = Power, 2 = Bernoulli/Binomial; currently only Bernouli distribution family is supported! - distribution_type = 2; # $dfam = 2; - variance_as_power_of_the_mean = 0.0; # $vpow = 0.0; - # link function code: 0 = canonical (depends on distribution), 1 = Power, 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit; - # currently only log (link = 1), logit (link = 2), probit (link = 3), and cloglog (link = 4) are supported! - link_type = link; - link_as_power_of_the_mean = 0.0; # $lpow = 0.0; - - dispersion = disp; - eps = tol; - max_iteration_IRLS = moi; - max_iteration_CG = mii; - - variance_as_power_of_the_mean = as.double (variance_as_power_of_the_mean); - link_as_power_of_the_mean = as.double (link_as_power_of_the_mean); - - dispersion = as.double (dispersion); - eps = as.double (eps); - - # Default values for output statistics: - regularization = 0.0; - termination_code = 0.0; - min_beta = NaN; - i_min_beta = NaN; - max_beta = NaN; - i_max_beta = NaN; - intercept_value = NaN; - dispersion = NaN; - estimated_dispersion = NaN; - deviance_nodisp = NaN; - deviance = NaN; - - ##### INITIALIZE THE PARAMETERS ##### - - num_records = nrow (X); - num_features = ncol (X); - zeros_r = matrix (0, rows = num_records, cols = 1); - ones_r = 1 + zeros_r; - - # Introduce the intercept, shift and rescale the columns of X if needed - - if (intercept_status == 1 | intercept_status == 2) { # add the intercept column - X = cbind (X, ones_r); - num_features = ncol (X); - } - - scale_lambda = matrix (1, rows = num_features, cols = 1); - if (intercept_status == 1 | intercept_status == 2) { - scale_lambda [num_features, 1] = 0; - } - - if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1 - # Important assumption: X [, num_features] = ones_r - avg_X_cols = t(colSums(X)) / num_records; - var_X_cols = (t(colSums (X ^ 2)) - num_records * (avg_X_cols ^ 2)) / (num_records - 1); - is_unsafe = (var_X_cols <= 0); - scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); - scale_X [num_features, 1] = 1; - shift_X = - avg_X_cols * scale_X; - shift_X [num_features, 1] = 0; - rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2); - } else { - scale_X = matrix (1, rows = num_features, cols = 1); - shift_X = matrix (0, rows = num_features, cols = 1); - rowSums_X_sq = rowSums (X ^ 2); - } - - # Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2) - # with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale. - # The transform is then associatively applied to the other side of the expression, - # and is rewritten via "scale_X" and "shift_X" as follows: - # - # ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: - # ssX_A = diag (scale_X) %*% A; - # ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A; - # - # tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: - # tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ]; - - # Initialize other input-dependent parameters - - lambda = scale_lambda * regularization; - if (max_iteration_CG == 0) { - max_iteration_CG = num_features; - } - - # Set up the canonical link, if requested [Then we have: Var(mu) * (d link / d mu) = const] - - if (link_type == 0) { - if (distribution_type == 1) { - link_type = 1; - link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean; - } else { - if (distribution_type == 2) { - link_type = 2; - } - } - } - - # For power distributions and/or links, we use two constants, - # "variance as power of the mean" and "link_as_power_of_the_mean", - # to specify the variance and the link as arbitrary powers of the - # mean. However, the variance-powers of 1.0 (Poisson family) and - # 2.0 (Gamma family) have to be treated as special cases, because - # these values integrate into logarithms. The link-power of 0.0 - # is also special as it represents the logarithm link. - - num_response_columns = ncol (Y); - is_supported = 0; - if (num_response_columns == 2 & distribution_type == 2 & link_type >= 1 & link_type <= 4) { # BERNOULLI DISTRIBUTION - is_supported = 1; - } - if (num_response_columns == 1 & distribution_type == 2) { - print ("Error: Bernoulli response matrix has not been converted into two-column format."); - } - - if (is_supported != 1) { - stop ("Response matrix with " + num_response_columns + " columns, distribution family (" + distribution_type + ", " + variance_as_power_of_the_mean - + ") and link family (" + link_type + ", " + link_as_power_of_the_mean + ") are NOT supported together."); - } - - ##### INITIALIZE THE BETAS ##### - - [beta, saturated_log_l, isNaN] = - glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG); - - # print(" --- saturated logLik " + saturated_log_l); - - if (isNaN != 0) { - stop ("Input matrices X and/or Y are out of range!"); - } - - ##### START OF THE MAIN PART ##### - - sum_X_sq = sum (rowSums_X_sq); - trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq)); - ### max_trust_delta = trust_delta * 10000.0; - log_l = 0.0; - deviance_nodisp = 0.0; - new_deviance_nodisp = 0.0; - isNaN_log_l = 2; - newbeta = beta; - g = matrix (0.0, rows = num_features, cols = 1); - g_norm = sqrt (sum ((g + lambda * beta) ^ 2)); - accept_new_beta = 1; - reached_trust_boundary = 0; - neg_log_l_change_predicted = 0.0; - i_IRLS = 0; - - # print ("BEGIN IRLS ITERATIONS..."); - - ssX_newbeta = diag (scale_X) %*% newbeta; - ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta; - all_linear_terms = X %*% ssX_newbeta; - - [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part - (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); - - if (isNaN_new_log_l == 0) { - new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l); - new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2); - } - - while (termination_code == 0) { - accept_new_beta = 1; - - if (i_IRLS > 0) { - if (isNaN_log_l == 0) { - accept_new_beta = 0; - } - - # Decide whether to accept a new iteration point and update the trust region - # See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. by Nocedal and Wright - - rho = (- new_log_l + log_l) / neg_log_l_change_predicted; - if (rho < 0.25 | isNaN_new_log_l == 1) { - trust_delta = 0.25 * trust_delta; - } - if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) { - trust_delta = 2 * trust_delta; - - ### if (trust_delta > max_trust_delta) { - ### trust_delta = max_trust_delta; - ### } - } - if (rho > 0.1 & isNaN_new_log_l == 0) { - accept_new_beta = 1; - } - } - - if (accept_new_beta == 1) { - beta = newbeta; log_l = new_log_l; deviance_nodisp = new_deviance_nodisp; isNaN_log_l = isNaN_new_log_l; - - [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); - - # We introduced these variables to avoid roundoff errors: - # g_Y = y_residual / (y_var * link_grad); - # w = 1.0 / (y_var * link_grad * link_grad); - - gXY = - t(X) %*% g_Y; - g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ]; - g_norm = sqrt (sum ((g + lambda * beta) ^ 2)); - } - - [z, neg_log_l_change_predicted, num_CG_iters, reached_trust_boundary] = - get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG); - - newbeta = beta + z; - - ssX_newbeta = diag (scale_X) %*% newbeta; - ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta; - all_linear_terms = X %*% ssX_newbeta; - - [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part - (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); - - if (isNaN_new_log_l == 0) { - new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l); - new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2); - } - - log_l_change = new_log_l - log_l; # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps - - if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 & - (2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) ) { - termination_code = 1; - } - rho = - log_l_change / neg_log_l_change_predicted; - z_norm = sqrt (sum (z * z)); - - i_IRLS = i_IRLS + 1; - - if (i_IRLS == max_iteration_IRLS) { - termination_code = 2; - } - } - - beta = newbeta; - log_l = new_log_l; - deviance_nodisp = new_deviance_nodisp; - - #---------------------------- last part - - if (termination_code != 1) { - print ("One of the runs of GLM did not converged in " + i_IRLS + " steps!"); - } - - ##### COMPUTE AIC ##### - - if (distribution_type == 2 & link_type >= 1 & link_type <= 4) { - AIC = -2 * log_l; - if (sum (X) != 0) { - AIC = AIC + 2 * num_features; - } - } else { - stop ("Currently only the Bernoulli distribution family the following link functions are supported: log, logit, probit, and cloglog!"); - } - - # Output which features give the best AIC and are being used for linear regression - S = Selected; - - ssX_beta = diag (scale_X) %*% beta; - ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta; - if (intercept_status == 2) { - beta_out = cbind (ssX_beta, beta); - } else { - beta_out = ssX_beta; - } - - if (intercept_status == 0 & num_features == 1) { - p = sum (X == 1); - if (p == num_records) { - beta_out = beta_out[1,]; - } - } - - - if (intercept_status == 1 | intercept_status == 2) { - intercept_value = as.scalar (beta_out [num_features, 1]); - beta_noicept = beta_out [1 : (num_features - 1), 1]; - } else { - beta_noicept = beta_out [1 : num_features, 1]; - } - min_beta = min (beta_noicept); - max_beta = max (beta_noicept); - tmp_i_min_beta = rowIndexMin (t(beta_noicept)) - i_min_beta = as.scalar (tmp_i_min_beta [1, 1]); - tmp_i_max_beta = rowIndexMax (t(beta_noicept)) - i_max_beta = as.scalar (tmp_i_max_beta [1, 1]); - - ##### OVER-DISPERSION PART ##### - - all_linear_terms = X %*% ssX_beta; - [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); - - pearson_residual_sq = g_Y ^ 2 / w; - pearson_residual_sq = replace (target = pearson_residual_sq, pattern = NaN, replacement = 0); - # pearson_residual_sq = (y_residual ^ 2) / y_var; - - if (num_records > num_features) { - estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features); - } - if (dispersion <= 0) { - dispersion = estimated_dispersion; - } - deviance = deviance_nodisp / dispersion; - - ##### END OF THE MAIN PART ##### - - str = "BETA_MIN," + min_beta; - str = append (str, "BETA_MIN_INDEX," + i_min_beta); - str = append (str, "BETA_MAX," + max_beta); - str = append (str, "BETA_MAX_INDEX," + i_max_beta); - str = append (str, "INTERCEPT," + intercept_value); - str = append (str, "DISPERSION," + dispersion); - str = append (str, "DISPERSION_EST," + estimated_dispersion); - str = append (str, "DEVIANCE_UNSCALED," + deviance_nodisp); - str = append (str, "DEVIANCE_SCALED," + deviance); - O = str - - do_return = 0; - # Prepare the output matrix - if (intercept_status == 0 & num_features == 1) { - if (p == num_records) { - beta_out_tmp = matrix (0, rows = num_features_orig + 1, cols = 1); - beta_out_tmp[num_features_orig + 1,] = beta_out; - beta = beta_out_tmp; - do_return = 1; - } else if (sum (X) == 0){ - beta = matrix (0, rows = num_features_orig, cols = 1); - do_return = 1; - } - } - if (do_return != 0) { - no_selected = ncol (Selected); - max_selected = max (Selected); - last = max_selected + 1; - - if (intercept_status != 0) { - - Selected_ext = cbind (Selected, as.matrix (last)); - P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); - - if (intercept_status == 2) { - - P1_ssX_beta = P1 * ssX_beta; - P2_ssX_beta = colSums (P1_ssX_beta); - P1_beta = P1 * beta; - P2_beta = colSums (P1_beta); - - if (max_selected < num_features_orig) { - - P2_ssX_beta = cbind (P2_ssX_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); - P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); - - P2_ssX_beta[1, num_features_orig+1] = P2_ssX_beta[1, max_selected + 1]; - P2_ssX_beta[1, max_selected + 1] = 0; - - P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1]; - P2_beta[1, max_selected + 1] = 0; - - } - beta = cbind (t(P2_ssX_beta), t(P2_beta)); - - } else { - - P1_beta = P1 * beta; - P2_beta = colSums (P1_beta); - - if (max_selected < num_features_orig) { - P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); - P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1] ; - P2_beta[1, max_selected + 1] = 0; - } - beta = t(P2_beta); - } - } else { - P1 = table (seq (1, no_selected), t(Selected)); - P1_beta = P1 * beta; - P2_beta = colSums (P1_beta); - - if (max_selected < num_features_orig) { - P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected))); - } - - beta = t(P2_beta); - } - } - } - -glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG) - return (Matrix[double] beta, double saturated_log_l, int isNaN) - { - saturated_log_l = 0.0; - isNaN = 0; - y_corr = Y [, 1]; - if (dist_type == 2) { - n_corr = rowSums (Y); - is_n_zero = (n_corr == 0); - y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero; - } - linear_terms = y_corr; - if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION - if (link_power == 0) { - if (sum (y_corr < 0) == 0) { - is_zero_y_corr = (y_corr == 0); - linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - } else { isNaN = 1; } - } else { if (link_power == 1.0) { - linear_terms = y_corr; - } else { if (link_power == -1.0) { - linear_terms = 1.0 / y_corr; - } else { if (link_power == 0.5) { - if (sum (y_corr < 0) == 0) { - linear_terms = sqrt (y_corr); - } else { isNaN = 1; } - } else { if (link_power > 0) { - if (sum (y_corr < 0) == 0) { - is_zero_y_corr = (y_corr == 0); - linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; - } else { isNaN = 1; } - } else { - if (sum (y_corr <= 0) == 0) { - linear_terms = y_corr ^ link_power; - } else { isNaN = 1; } - }}}}} - } - if (dist_type == 2 & link_type >= 1 & link_type <= 5) - { # BINOMIAL/BERNOULLI DISTRIBUTION - if (link_type == 1 & link_power == 0) { # Binomial.log - if (sum (y_corr < 0) == 0) { - is_zero_y_corr = (y_corr == 0); - linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - } else { isNaN = 1; } - } else { if (link_type == 1 & link_power > 0) { # Binomial.power_nonlog pos - if (sum (y_corr < 0) == 0) { - is_zero_y_corr = (y_corr == 0); - linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; - } else { isNaN = 1; } - } else { if (link_type == 1) { # Binomial.power_nonlog neg - if (sum (y_corr <= 0) == 0) { - linear_terms = y_corr ^ link_power; - } else { isNaN = 1; } - } else { - is_zero_y_corr = (y_corr <= 0); - is_one_y_corr = (y_corr >= 1.0); - y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr); - if (link_type == 2) { # Binomial.logit - linear_terms = log (y_corr / (1.0 - y_corr)) - + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - } else { if (link_type == 3) { # Binomial.probit - y_below_half = y_corr + (1.0 - 2.0 * y_corr) * (y_corr > 0.5); - t = sqrt (- 2.0 * log (y_below_half)); - approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308))); - linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * (y_corr > 0.5)) - + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - } else { if (link_type == 4) { # Binomial.cloglog - linear_terms = log (- log (1.0 - y_corr)) - - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr) - + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - } else { if (link_type == 5) { # Binomial.cauchit - linear_terms = tan ((y_corr - 0.5) * pi) - + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - }} }}}}} - } - - if (isNaN == 0) { - [saturated_log_l, isNaN] = - glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power); - } - - if ((dist_type == 1 & link_type == 1 & link_power == 0) | - (dist_type == 2 & link_type >= 2)) - { - desired_eta = 0.0; - } else { if (link_type == 1 & link_power == 0) { - desired_eta = log (0.5); - } else { if (link_type == 1) { - desired_eta = 0.5 ^ link_power; - } else { - desired_eta = 0.5; - }}} - - beta = matrix (0.0, rows = ncol(X), cols = 1); - - if (desired_eta != 0) { - if (icept_status == 1 | icept_status == 2) { - beta [nrow(beta), 1] = desired_eta; - } else { - # We want: avg (X %*% ssX_transform %*% beta) = desired_eta - # Note that "ssX_transform" is trivial here, hence ignored - - beta = straightenX (X, 0.000001, max_iter_CG); - beta = beta * desired_eta; - } } } - - -glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y, - int dist_type, double var_power, int link_type, double link_power) - return (Matrix[double] g_Y, Matrix[double] w) -# ORIGINALLY we returned more meaningful vectors, namely: -# Matrix[double] y_residual : y - y_mean, i.e. y observed - y predicted -# Matrix[double] link_gradient : derivative of the link function -# Matrix[double] var_function : variance without dispersion, i.e. the V(mu) function -# BUT, this caused roundoff errors, so we had to compute "directly useful" vectors -# and skip over the "meaningful intermediaries". Now we output these two variables: -# g_Y = y_residual / (var_function * link_gradient); -# w = 1.0 / (var_function * link_gradient ^ 2); - { - num_records = nrow (linear_terms); - zeros_r = matrix (0.0, rows = num_records, cols = 1); - ones_r = 1 + zeros_r; - g_Y = zeros_r; - w = zeros_r; - - # Some constants - - one_over_sqrt_two_pi = 0.39894228040143267793994605993438; - ones_2 = matrix (1.0, rows = 1, cols = 2); - p_one_m_one = ones_2; - p_one_m_one [1, 2] = -1.0; - m_one_p_one = ones_2; - m_one_p_one [1, 1] = -1.0; - zero_one = ones_2; - zero_one [1, 1] = 0.0; - one_zero = ones_2; - one_zero [1, 2] = 0.0; - flip_pos = matrix (0, rows = 2, cols = 2); - flip_neg = flip_pos; - flip_pos [1, 2] = 1; - flip_pos [2, 1] = 1; - flip_neg [1, 2] = -1; - flip_neg [2, 1] = 1; - - if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION - y_mean = zeros_r; - if (link_power == 0) { - y_mean = exp (linear_terms); - y_mean_pow = y_mean ^ (1 - var_power); - w = y_mean_pow * y_mean; - g_Y = y_mean_pow * (Y - y_mean); - } else { if (link_power == 1.0) { - y_mean = linear_terms; - w = y_mean ^ (- var_power); - g_Y = w * (Y - y_mean); - } else { - y_mean = linear_terms ^ (1.0 / link_power); - c1 = (1 - var_power) / link_power - 1; - c2 = (2 - var_power) / link_power - 2; - g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power; - w = (linear_terms ^ c2) / (link_power ^ 2); - } }} - if (dist_type == 2 & link_type >= 1 & link_type <= 5) - { # BINOMIAL/BERNOULLI DISTRIBUTION - if (link_type == 1) { # BINOMIAL.POWER LINKS - if (link_power == 0) { # Binomial.log - vec1 = 1 / (exp (- linear_terms) - 1); - g_Y = Y [, 1] - Y [, 2] * vec1; - w = rowSums (Y) * vec1; - } else { # Binomial.nonlog - vec1 = zeros_r; - if (link_power == 0.5) { - vec1 = 1 / (1 - linear_terms ^ 2); - } else { if (sum (linear_terms < 0) == 0) { - vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power)); - } else {isNaN = 1;}} - # We want a "zero-protected" version of - # vec2 = Y [, 1] / linear_terms; - is_y_0 = (Y [, 1] == 0); - vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0; - g_Y = (vec2 - Y [, 2] * vec1 * linear_terms) / link_power; - w = rowSums (Y) * vec1 / link_power ^ 2; - } - } else { - is_LT_pos_infinite = (linear_terms == Inf); - is_LT_neg_infinite = (linear_terms == -Inf); - is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; - finite_linear_terms = replace (target = linear_terms, pattern = Inf, replacement = 0); - finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0); - if (link_type == 2) { # Binomial.logit - Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; - Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); - Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; - g_Y = rowSums (Y * (Y_prob %*% flip_neg)); ### = y_residual; - w = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob); ### = y_variance; - } else { if (link_type == 3) { # Binomial.probit - is_lt_pos = (linear_terms >= 0); - t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) - pt_gp = t_gp * ( 0.254829592 - + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, - + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 - + t_gp * (-1.453152027 - + t_gp * 1.061405429)))); - the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0); - vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp); - vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5); - w = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1; - g_Y = one_over_sqrt_two_pi * vec2 / vec1; - } else { if (link_type == 4) { # Binomial.cloglog - the_exp = exp (linear_terms) - the_exp_exp = exp (- the_exp); - is_too_small = ((10000000 + the_exp) == 10000000); - the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2); - g_Y = (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio; - w = the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio; - } else { if (link_type == 5) { # Binomial.cauchit - Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi; - Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; - y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1]; - var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2]; - link_gradient_normalized = (1 + linear_terms ^ 2) * pi; - g_Y = rowSums (Y) * y_residual / (var_function * link_gradient_normalized); - w = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2); - }}}} - } - } - } - - -glm_log_likelihood_part = function (Matrix[double] linear_terms, Matrix[double] Y, - int dist_type, double var_power, int link_type, double link_power) - return (double log_l, int isNaN) - { - isNaN = 0; - log_l = 0.0; - num_records = nrow (Y); - zeros_r = matrix (0.0, rows = num_records, cols = 1); - - if (dist_type == 1 & link_type == 1) - { # POWER DISTRIBUTION - b_cumulant = zeros_r; - natural_parameters = zeros_r; - is_natural_parameter_log_zero = zeros_r; - if (var_power == 1.0 & link_power == 0) { # Poisson.log - b_cumulant = exp (linear_terms); - is_natural_parameter_log_zero = (linear_terms == -Inf); - natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0); - } else { if (var_power == 1.0 & link_power == 1.0) { # Poisson.id - if (sum (linear_terms < 0) == 0) { - b_cumulant = linear_terms; - is_natural_parameter_log_zero = (linear_terms == 0); - natural_parameters = log (linear_terms + is_natural_parameter_log_zero); - } else {isNaN = 1;} - } else { if (var_power == 1.0 & link_power == 0.5) { # Poisson.sqrt - if (sum (linear_terms < 0) == 0) { - b_cumulant = linear_terms ^ 2; - is_natural_parameter_log_zero = (linear_terms == 0); - natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero); - } else {isNaN = 1;} - } else { if (var_power == 1.0 & link_power > 0) { # Poisson.power_nonlog, pos - if (sum (linear_terms < 0) == 0) { - is_natural_parameter_log_zero = (linear_terms == 0); - b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero; - natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power; - } else {isNaN = 1;} - } else { if (var_power == 1.0) { # Poisson.power_nonlog, neg - if (sum (linear_terms <= 0) == 0) { - b_cumulant = linear_terms ^ (1.0 / link_power); - natural_parameters = log (linear_terms) / link_power; - } else {isNaN = 1;} - } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse - if (sum (linear_terms <= 0) == 0) { - b_cumulant = - log (linear_terms); - natural_parameters = - linear_terms; - } else {isNaN = 1;} - } else { if (var_power == 2.0 & link_power == 1.0) { # Gamma.id - if (sum (linear_terms <= 0) == 0) { - b_cumulant = log (linear_terms); - natural_parameters = - 1.0 / linear_terms; - } else {isNaN = 1;} - } else { if (var_power == 2.0 & link_power == 0) { # Gamma.log - b_cumulant = linear_terms; - natural_parameters = - exp (- linear_terms); - } else { if (var_power == 2.0) { # Gamma.power_nonlog - if (sum (linear_terms <= 0) == 0) { - b_cumulant = log (linear_terms) / link_power; - natural_parameters = - linear_terms ^ (- 1.0 / link_power); - } else {isNaN = 1;} - } else { if (link_power == 0) { # PowerDist.log - natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power); - b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power); - } else { # PowerDist.power_nonlog - if (-2 * link_power == 1.0 - var_power) { - natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power); - } else { if (-1 * link_power == 1.0 - var_power) { - natural_parameters = 1.0 / linear_terms / (1.0 - var_power); - } else { if ( link_power == 1.0 - var_power) { - natural_parameters = linear_terms / (1.0 - var_power); - } else { if ( 2 * link_power == 1.0 - var_power) { - natural_parameters = linear_terms ^ 2 / (1.0 - var_power); - } else { - if (sum (linear_terms <= 0) == 0) { - power = (1.0 - var_power) / link_power; - natural_parameters = (linear_terms ^ power) / (1.0 - var_power); - } else {isNaN = 1;} - }}}} - if (-2 * link_power == 2.0 - var_power) { - b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power); - } else { if (-1 * link_power == 2.0 - var_power) { - b_cumulant = 1.0 / linear_terms / (2.0 - var_power); - } else { if ( link_power == 2.0 - var_power) { - b_cumulant = linear_terms / (2.0 - var_power); - } else { if ( 2 * link_power == 2.0 - var_power) { - b_cumulant = linear_terms ^ 2 / (2.0 - var_power); - } else { - if (sum (linear_terms <= 0) == 0) { - power = (2.0 - var_power) / link_power; - b_cumulant = (linear_terms ^ power) / (2.0 - var_power); - } else {isNaN = 1;} - }}}} - }}}}} }}}}} - if (sum (is_natural_parameter_log_zero * abs (Y)) > 0) { - log_l = -Inf; - isNaN = 1; - } - if (isNaN == 0) - { - log_l = sum (Y * natural_parameters - b_cumulant); - if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { - log_l = -Inf; - isNaN = 1; - } } } - - if (dist_type == 2 & link_type >= 1 & link_type <= 5) - { # BINOMIAL/BERNOULLI DISTRIBUTION - - [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power); - - if (isNaN == 0) { - does_prob_contradict = (Y_prob <= 0); - if (sum (does_prob_contradict * abs (Y)) == 0) { - log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict)); - if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { - isNaN = 1; - } - } else { - log_l = -Inf; - isNaN = 1; - } } } - - if (isNaN == 1) { - log_l = - Inf; - } - } - - - -binomial_probability_two_column = - function (Matrix[double] linear_terms, int link_type, double link_power) - return (Matrix[double] Y_prob, int isNaN) - { - isNaN = 0; - num_records = nrow (linear_terms); - - # Define some auxiliary matrices - - ones_2 = matrix (1.0, rows = 1, cols = 2); - p_one_m_one = ones_2; - p_one_m_one [1, 2] = -1.0; - m_one_p_one = ones_2; - m_one_p_one [1, 1] = -1.0; - zero_one = ones_2; - zero_one [1, 1] = 0.0; - one_zero = ones_2; - one_zero [1, 2] = 0.0; - - zeros_r = matrix (0.0, rows = num_records, cols = 1); - ones_r = 1.0 + zeros_r; - - # Begin the function body - - Y_prob = zeros_r %*% ones_2; - if (link_type == 1) { # Binomial.power - if (link_power == 0) { # Binomial.log - Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one; - } else { if (link_power == 0.5) { # Binomial.sqrt - Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one; - } else { # Binomial.power_nonlog - if (sum (linear_terms < 0) == 0) { - Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one; - } else {isNaN = 1;} - }} - } else { # Binomial.non_power - is_LT_pos_infinite = (linear_terms == Inf); - is_LT_neg_infinite = (linear_terms == -Inf); - is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; - finite_linear_terms = replace (target = linear_terms, pattern = Inf, replacement = 0); - finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0); - if (link_type == 2) { # Binomial.logit - Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; - Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); - } else { if (link_type == 3) { # Binomial.probit - lt_pos_neg = (finite_linear_terms >= 0) %*% p_one_m_one + ones_r %*% zero_one; - t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) - pt_gp = t_gp * ( 0.254829592 - + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, - + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 - + t_gp * (-1.453152027 - + t_gp * 1.061405429)))); - the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0); - Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg); - } else { if (link_type == 4) { # Binomial.cloglog - the_exp = exp (finite_linear_terms); - the_exp_exp = exp (- the_exp); - is_too_small = ((10000000 + the_exp) == 10000000); - Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2); - Y_prob [, 2] = the_exp_exp; - } else { if (link_type == 5) { # Binomial.cauchit - Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi; - } else { - isNaN = 1; - }}}} - Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; - } } - - -# THE CG-STEIHAUG PROCEDURE SCRIPT - -# Apply Conjugate Gradient - Steihaug algorithm in order to approximately minimize -# 0.5 z^T (X^T diag(w) X + diag (lambda)) z + (g + lambda * beta)^T z -# under constraint: ||z|| <= trust_delta. -# See Alg. 7.2 on p. 171 of "Numerical Optimization" 2nd ed. by Nocedal and Wright -# IN THE ABOVE, "X" IS UNDERSTOOD TO BE "X %*% (SHIFT/SCALE TRANSFORM)"; this transform -# is given separately because sparse "X" may become dense after applying the transform. -# -get_CG_Steihaug_point = - function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w, - Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG) - return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary) - { - trust_delta_sq = trust_delta ^ 2; - size_CG = nrow (g); - z = matrix (0.0, rows = size_CG, cols = 1); - neg_log_l_change = 0.0; - reached_trust_boundary = 0; - g_reg = g + lambda * beta; - r_CG = g_reg; - p_CG = -r_CG; - rr_CG = sum(r_CG * r_CG); - eps_CG = rr_CG * min (0.25, sqrt (rr_CG)); - converged_CG = 0; - if (rr_CG < eps_CG) { - converged_CG = 1; - } - - max_iteration_CG = max_iter_CG; - if (max_iteration_CG <= 0) { - max_iteration_CG = size_CG; - } - i_CG = 0; - while (converged_CG == 0) - { - i_CG = i_CG + 1; - ssX_p_CG = diag (scale_X) %*% p_CG; - ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG; - temp_CG = t(X) %*% (w * (X %*% ssX_p_CG)); - q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ]; - pq_CG = sum (p_CG * q_CG); - if (pq_CG <= 0) { - pp_CG = sum (p_CG * p_CG); - if (pp_CG > 0) { - [z, neg_log_l_change] = - get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); - reached_trust_boundary = 1; - } else { - neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); - } - converged_CG = 1; - } - if (converged_CG == 0) { - alpha_CG = rr_CG / pq_CG; - new_z = z + alpha_CG * p_CG; - if (sum(new_z * new_z) >= trust_delta_sq) { - pp_CG = sum (p_CG * p_CG); - [z, neg_log_l_change] = - get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); - reached_trust_boundary = 1; - converged_CG = 1; - } - if (converged_CG == 0) { - z = new_z; - old_rr_CG = rr_CG; - r_CG = r_CG + alpha_CG * q_CG; - rr_CG = sum(r_CG * r_CG); - if (i_CG == max_iteration_CG | rr_CG < eps_CG) { - neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); - reached_trust_boundary = 0; - converged_CG = 1; - } - if (converged_CG == 0) { - p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG; - } } } } } - - -# An auxiliary function used twice inside the CG-STEIHAUG loop: -get_trust_boundary_point = - function (Matrix[double] g, Matrix[double] z, Matrix[double] p, - Matrix[double] q, Matrix[double] r, double pp, double pq, - double trust_delta_sq) - return (Matrix[double] new_z, double f_change) - { - zz = sum (z * z); pz = sum (p * z); - sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq)); - tau_1 = (- pz + sq_root_d) / pp; - tau_2 = (- pz - sq_root_d) / pp; - zq = sum (z * q); gp = sum (g * p); - f_extra = 0.5 * sum (z * (r + g)); - f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1; - f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2; - ind1 = as.integer(f_change_1 < f_change_2); - ind2 = as.integer(f_change_1 >= f_change_2); - new_z = z + ((ind1 * tau_1 + ind2 * tau_2) * p); - f_change = ind1 * f_change_1 + ind2 * f_change_2; - } - - -# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1 -# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale -# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X). -straightenX = - function (Matrix[double] X, double eps, int max_iter_CG) - return (Matrix[double] w) - { - w_X = t(colSums(X)); - lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X); - eps_LS = eps * nrow(X); - - # BEGIN LEAST SQUARES - - r_LS = - w_X; - z_LS = matrix (0.0, rows = ncol(X), cols = 1); - p_LS = - r_LS; - norm_r2_LS = sum (r_LS ^ 2); - i_LS = 0; - while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS) - { - q_LS = t(X) %*% X %*% p_LS; - q_LS = q_LS + lambda_LS * p_LS; - alpha_LS = norm_r2_LS / sum (p_LS * q_LS); - z_LS = z_LS + alpha_LS * p_LS; - old_norm_r2_LS = norm_r2_LS; - r_LS = r_LS + alpha_LS * q_LS; - norm_r2_LS = sum (r_LS ^ 2); - p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS; - i_LS = i_LS + 1; - } - - # END LEAST SQUARES - - w = (nrow(X) / sum (w_X * z_LS)) * z_LS; - } From 482349c03a6ba48197748dd43b74cc8cf40cc49f Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 26 Jun 2026 07:12:46 +0200 Subject: [PATCH 07/12] replace glm_fit with m_glm --- scripts/builtin/stepGLM.dml | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/scripts/builtin/stepGLM.dml b/scripts/builtin/stepGLM.dml index 3ec1cffcb7f..f7bd72e6b51 100644 --- a/scripts/builtin/stepGLM.dml +++ b/scripts/builtin/stepGLM.dml @@ -47,7 +47,6 @@ # # B Matrix --- Estimated regression parameters (betas) # S Matrix --- The selected features ordered as computed by the algorithm -# O String --- The statistics # --------------------------------------------------------------------------------------------- # THE StepGLM SCRIPT CURRENTLY SUPPORTS BERNOULLI DISTRIBUTION FAMILY AND THE FOLLOWING LINK FUNCTIONS ONLY! @@ -76,6 +75,7 @@ # DEVIANCE_SCALED Deviance from the saturated model, scaled by the DISPERSION value # ------------------------------------------------------------------------------------------- +source("./scripts/builtin/glm.dlm") as glm; stepGLM = function ( Matrix[Double] X, @@ -92,16 +92,12 @@ stepGLM = function ( Double AIC, Matrix[Double] B, Matrix[Double] S, - String O ) { intercept_status = icpt; bernoulli_No_label = yneg; distribution_type = 2; - # currently only the forward selection strategy in supported: start from one feature and iteratively add - # features until AIC improves - dir = "forward"; if (distribution_type == 2 & ncol(Y) == 1) { is_Y_negative = (Y == bernoulli_No_label); @@ -121,10 +117,6 @@ stepGLM = function ( # BEGIN STEPWISE GENERALIZED LINEAR MODELS - if (dir != "forward") { - stop ("Currently only forward selection strategy is supported!"); - } - continue = TRUE; columns_fixed = matrix (0, rows = 1, cols = num_features); columns_fixed_ordered = matrix (0, rows = 1, cols = 1); @@ -134,18 +126,18 @@ stepGLM = function ( if (intercept_status == 0) { # Compute AIC of an empty model with no features and no intercept (all Ys are zero) - [AIC_best, ignore_B1, ignore_S1, ignore_O1] = glm_fit (X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC_best, ignore_B1, ignore_S1, ignore_O1] = glm::m_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } else { # compute AIC of an empty model with only intercept (all Ys are constant) all_ones = matrix (1, rows = num_records, cols = 1); - [AIC_best, ignore_beta2, ignore_S2, ignore_O2] = glm_fit (X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC_best, ignore_beta2, ignore_S2, ignore_O2] = glm::m_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } #print ("Best AIC without any features: " + AIC_best); # First pass to examine single features AICs = matrix (AIC_best, rows = 1, cols = num_features); parfor (i in 1:num_features) { - [AIC_1, ignore_beta3, ignore_S3, ignore_O3] = glm_fit (X=X_orig[,i], Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC_1, ignore_beta3, ignore_S3, ignore_O3] = glm::m_glm(X=X_orig[,i], Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); AICs[1,i] = AIC_1; } @@ -163,11 +155,11 @@ stepGLM = function ( #print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!"); if (intercept_status == 0) { # Compute AIC of an empty model with no features and no intercept (all Ys are zero) - [AIC_best, ignore_beta4, ignore_S4, ignore_O4] = glm_fit (X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC_best, ignore_beta4, ignore_S4, ignore_O4] = glm::m_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } else { # compute AIC of an empty model with only intercept (all Ys are constant) ###all_ones = matrix (1, rows = num_records, cols = 1); - [AIC_best, ignore_beta5, ignore_S5, ignore_O5] = glm_fit (X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC_best, ignore_beta5, ignore_S5, ignore_O5] = glm::m_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } }; @@ -184,7 +176,7 @@ stepGLM = function ( # Construct the feature matrix X_loop = cbind (X_global, X_orig[,i]); - [AIC_2, ignore_beta6, ignore_S6, ignore_O6] = glm_fit (X=X_loop, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC_2, ignore_beta6, ignore_S6, ignore_O6] = glm::m_glm(X=X_loop, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); AICs[1,i] = AIC_2; } } @@ -216,7 +208,7 @@ stepGLM = function ( # run GLM with selected set of features print ("Running GLM with selected features..."); - [AIC, B, S, O] = glm_fit (X=X_global, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC, B, S, O] = glm::m_glm(X=X_global, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } From 8e24e9c4a1aabebbd5c0f6994aef7c57f6b49579 Mon Sep 17 00:00:00 2001 From: bruno Date: Sun, 28 Jun 2026 03:01:15 +0200 Subject: [PATCH 08/12] add missing glm_fit replacment --- scripts/algorithms/TestBuiltinStepGLM.dml | 19 +++- scripts/builtin/stepGLM.dml | 118 ++++++++++++++++------ 2 files changed, 106 insertions(+), 31 deletions(-) diff --git a/scripts/algorithms/TestBuiltinStepGLM.dml b/scripts/algorithms/TestBuiltinStepGLM.dml index ab64960b35a..ffe1a1f64db 100644 --- a/scripts/algorithms/TestBuiltinStepGLM.dml +++ b/scripts/algorithms/TestBuiltinStepGLM.dml @@ -13,9 +13,26 @@ Z = X %*% beta_true; P_y = 1.0 / (1.0 + exp(-Z)); Y = (rand(rows=N, cols=1, min=0.0, max=1.0, seed=456) < P_y) * 1.0; -[AIC, B, S, O] = stepGLM::stepGLM(X=X, Y=Y, link=2, yneg=0.0, icpt=0, tol=1e-6, disp=0.0, moi=200, mii=0, thr=0.01); +[AIC, B, S] = stepGLM::stepGLM(X=X, Y=Y, link=2, yneg=0.0, icpt=0, tol=1e-6, disp=0.0, moi=200, mii=0, thr=0.01); print("Optimal AIC: " + AIC); print("Selected Feature Indices:\n" + toString(S)); print("Estimated Coefficients:\n" + toString(B)); +epsilon = 0.5; +beta_est = matrix(0, rows=P, cols=1); +for (i in 1:nrow(B)) { + idx = as.scalar(S[1, i]); + if (nrow(S) > 1) { idx = as.scalar(S[i, 1]); } + beta_est[idx, 1] = B[i, 1]; +} + +if (nrow(B) != 3 | sum(beta_est != 0 & beta_true == 0) > 0 | sum(beta_est == 0 & beta_true != 0) > 0) { + stop("Test failed: Inexact feature support recovery."); +} + +if (max(abs(beta_est - beta_true)) > epsilon) { + stop("Test failed: Parameter estimates exceed tolerance bound epsilon = " + epsilon + "."); +} + +print("Automated convergence and estimation validation passed."); diff --git a/scripts/builtin/stepGLM.dml b/scripts/builtin/stepGLM.dml index f7bd72e6b51..d4f894a52ac 100644 --- a/scripts/builtin/stepGLM.dml +++ b/scripts/builtin/stepGLM.dml @@ -45,37 +45,18 @@ # OUTPUT: Matrix beta, whose size depends on icpt: # icpt=0: ncol(X) x 1; icpt=1: (ncol(X) + 1) x 1; icpt=2: (ncol(X) + 1) x 2 # +# AIC Double --- AIC value # B Matrix --- Estimated regression parameters (betas) # S Matrix --- The selected features ordered as computed by the algorithm # --------------------------------------------------------------------------------------------- # THE StepGLM SCRIPT CURRENTLY SUPPORTS BERNOULLI DISTRIBUTION FAMILY AND THE FOLLOWING LINK FUNCTIONS ONLY! -# - LOG +# - LOG # - LOGIT # - PROBIT # - CLOGLOG -# In addition, in the last run of GLM some statistics are provided in CSV format, one comma-separated name-value -# pair per each line, as follows: -# -# NAME MEANING -# ------------------------------------------------------------------------------------------- -# TERMINATION_CODE A positive integer indicating success/failure as follows: -# 1 = Converged successfully; 2 = Maximum number of iterations reached; -# 3 = Input (X, Y) out of range; 4 = Distribution/link is not supported -# BETA_MIN Smallest beta value (regression coefficient), excluding the intercept -# BETA_MIN_INDEX Column index for the smallest beta value -# BETA_MAX Largest beta value (regression coefficient), excluding the intercept -# BETA_MAX_INDEX Column index for the largest beta value -# INTERCEPT Intercept value, or NaN if there is no intercept (if icpt=0) -# DISPERSION Dispersion used to scale deviance, provided as "disp" input parameter -# or estimated (same as DISPERSION_EST) if the "disp" parameter is <= 0 -# DISPERSION_EST Dispersion estimated from the dataset -# DEVIANCE_UNSCALED Deviance from the saturated model, assuming dispersion == 1.0 -# DEVIANCE_SCALED Deviance from the saturated model, scaled by the DISPERSION value -# ------------------------------------------------------------------------------------------- - -source("./scripts/builtin/glm.dlm") as glm; +source("./scripts/builtin/glm.dml") as glm; stepGLM = function ( Matrix[Double] X, @@ -91,7 +72,7 @@ stepGLM = function ( ) return ( Double AIC, Matrix[Double] B, - Matrix[Double] S, + Matrix[Double] S ) { intercept_status = icpt; @@ -126,18 +107,18 @@ stepGLM = function ( if (intercept_status == 0) { # Compute AIC of an empty model with no features and no intercept (all Ys are zero) - [AIC_best, ignore_B1, ignore_S1, ignore_O1] = glm::m_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC_best, ignore_B1, ignore_S1] = internal_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } else { # compute AIC of an empty model with only intercept (all Ys are constant) all_ones = matrix (1, rows = num_records, cols = 1); - [AIC_best, ignore_beta2, ignore_S2, ignore_O2] = glm::m_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC_best, ignore_beta2, ignore_S2] = internal_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } #print ("Best AIC without any features: " + AIC_best); # First pass to examine single features AICs = matrix (AIC_best, rows = 1, cols = num_features); parfor (i in 1:num_features) { - [AIC_1, ignore_beta3, ignore_S3, ignore_O3] = glm::m_glm(X=X_orig[,i], Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC_1, ignore_beta3, ignore_S3] = internal_glm(X=X_orig[,i], Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); AICs[1,i] = AIC_1; } @@ -155,11 +136,11 @@ stepGLM = function ( #print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!"); if (intercept_status == 0) { # Compute AIC of an empty model with no features and no intercept (all Ys are zero) - [AIC_best, ignore_beta4, ignore_S4, ignore_O4] = glm::m_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC_best, ignore_beta4, ignore_S4] = internal_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } else { # compute AIC of an empty model with only intercept (all Ys are constant) ###all_ones = matrix (1, rows = num_records, cols = 1); - [AIC_best, ignore_beta5, ignore_S5, ignore_O5] = glm::m_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC_best, ignore_beta5, ignore_S5] = internal_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); } }; @@ -176,7 +157,7 @@ stepGLM = function ( # Construct the feature matrix X_loop = cbind (X_global, X_orig[,i]); - [AIC_2, ignore_beta6, ignore_S6, ignore_O6] = glm::m_glm(X=X_loop, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC_2, ignore_beta6, ignore_S6] = internal_glm(X=X_loop, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); AICs[1,i] = AIC_2; } } @@ -208,7 +189,84 @@ stepGLM = function ( # run GLM with selected set of features print ("Running GLM with selected features..."); - [AIC, B, S, O] = glm::m_glm(X=X_global, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + [AIC, B, S] = internal_glm(X=X_global, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii); + } + +compute_AIC = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] B, Int link, Int icpt) + return (Double aic) +{ + # Isolate unscaled parameters; accounts for m_glm returning 2 columns when icpt=2 + beta = B[, 1]; + + if (icpt > 0) { + ones = matrix(1, rows=nrow(X), cols=1); + X_design = cbind(X, ones); + } else { + X_design = X; + } + + eta = X_design %*% beta; + + # Map inverse link functions + if (link == 1) { + # log, see https://en.wikipedia.org/wiki/Generalized_linear_model#Link_function + mu = exp(eta); + } else if (link == 2) { + # logit, see https://en.wikipedia.org/wiki/Generalized_linear_model#Link_function + mu = 1 / (1 + exp(-eta)); + } else if (link == 3) { + # probit approximation, page 1487 in "Qualitative Response Models: A Survey (1981)" by Takeshi Amemiya + mu = 1 / (1 + exp(-1.6 * eta)); + } else if (link == 4) { + # cloglog, see https://search.r-project.org/CRAN/refmans/VGAM/html/clogloglink.html + mu = 1 - exp(-exp(eta)); + } else { + stop("Unsupported link function code: " + link); + mu = eta; } + # Constrain to prevent numerical discontinuity in log(0) + mu = max(mu, 1e-15); + mu = min(mu, 1 - 1e-15); + + Y_yes = Y[, 1]; + Y_neg = Y[, 2]; + + LL = sum(Y_yes * log(mu) + Y_neg * log(1 - mu)); + k = nrow(beta); + + aic = 2 * k - 2 * LL; +} + +internal_glm = function ( + Matrix[Double] X, + Matrix[Double] Y, + Int intercept_status, + Double num_features_orig, + Matrix[Double] Selected, + Int link, + Double disp, + Double tol, + Int moi, + Int mii +) return ( + Double AIC, + Matrix[Double] B, + Matrix[Double] S +) { + # bernoulli family parameters, see table in ./scripts/builtin/m_glm.dml for details. + new_link = link; + new_lpow = 1.0; + + if (link == 1) { + new_link = 1; + new_lpow = 0.0; + } + B = glm::m_glm(X=X, Y=Y, dfam=2, vpow=0.0, link=new_link, lpow=new_lpow, yneg=0.0, + icpt=intercept_status, disp=disp, reg=0.0, tol=tol, + moi=moi, mii=mii, verbose=FALSE); + AIC = compute_AIC(X, Y, B, link, intercept_status); + + S = Selected; +} From ca5429b3f3c99c1011118c728a031a7f79563470 Mon Sep 17 00:00:00 2001 From: bruno Date: Sun, 28 Jun 2026 05:52:08 +0200 Subject: [PATCH 09/12] edit test script --- scripts/algorithms/TestBuiltinStepGLM.dml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/algorithms/TestBuiltinStepGLM.dml b/scripts/algorithms/TestBuiltinStepGLM.dml index ffe1a1f64db..f22bb07d107 100644 --- a/scripts/algorithms/TestBuiltinStepGLM.dml +++ b/scripts/algorithms/TestBuiltinStepGLM.dml @@ -15,24 +15,27 @@ Y = (rand(rows=N, cols=1, min=0.0, max=1.0, seed=456) < P_y) * 1.0; [AIC, B, S] = stepGLM::stepGLM(X=X, Y=Y, link=2, yneg=0.0, icpt=0, tol=1e-6, disp=0.0, moi=200, mii=0, thr=0.01); +print("\n\n\n\n\n\n\nTest Results:"); print("Optimal AIC: " + AIC); print("Selected Feature Indices:\n" + toString(S)); print("Estimated Coefficients:\n" + toString(B)); -epsilon = 0.5; beta_est = matrix(0, rows=P, cols=1); for (i in 1:nrow(B)) { idx = as.scalar(S[1, i]); - if (nrow(S) > 1) { idx = as.scalar(S[i, 1]); } beta_est[idx, 1] = B[i, 1]; } +# Case 01 if (nrow(B) != 3 | sum(beta_est != 0 & beta_true == 0) > 0 | sum(beta_est == 0 & beta_true != 0) > 0) { stop("Test failed: Inexact feature support recovery."); } +print("passed test 1") +# Case 02 +epsilon = 0.5; if (max(abs(beta_est - beta_true)) > epsilon) { stop("Test failed: Parameter estimates exceed tolerance bound epsilon = " + epsilon + "."); } +print("passed test 2") -print("Automated convergence and estimation validation passed."); From 3321316255de955baf085d353a69ade2cee71026 Mon Sep 17 00:00:00 2001 From: bruno Date: Mon, 29 Jun 2026 15:45:21 +0200 Subject: [PATCH 10/12] add license to TestBuiltinStepGLM.dml --- scripts/algorithms/TestBuiltinStepGLM.dml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/algorithms/TestBuiltinStepGLM.dml b/scripts/algorithms/TestBuiltinStepGLM.dml index f22bb07d107..9924728b992 100644 --- a/scripts/algorithms/TestBuiltinStepGLM.dml +++ b/scripts/algorithms/TestBuiltinStepGLM.dml @@ -1,3 +1,25 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + source("scripts/builtin/stepGLM.dml") as stepGLM; N = 1000; From e13e83af0272959df105283b9f6cb3bd7cba4d6f Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 1 Jul 2026 15:31:40 +0200 Subject: [PATCH 11/12] rename stepGLM to m_stepGLM; register stepGLM in Builtins.java --- scripts/algorithms/TestBuiltinStepGLM.dml | 2 +- scripts/builtin/stepGLM.dml | 2 +- src/main/java/org/apache/sysds/common/Builtins.java | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/algorithms/TestBuiltinStepGLM.dml b/scripts/algorithms/TestBuiltinStepGLM.dml index 9924728b992..52d393bd29f 100644 --- a/scripts/algorithms/TestBuiltinStepGLM.dml +++ b/scripts/algorithms/TestBuiltinStepGLM.dml @@ -35,7 +35,7 @@ Z = X %*% beta_true; P_y = 1.0 / (1.0 + exp(-Z)); Y = (rand(rows=N, cols=1, min=0.0, max=1.0, seed=456) < P_y) * 1.0; -[AIC, B, S] = stepGLM::stepGLM(X=X, Y=Y, link=2, yneg=0.0, icpt=0, tol=1e-6, disp=0.0, moi=200, mii=0, thr=0.01); +[AIC, B, S] = stepGLM::m_stepGLM(X=X, Y=Y, link=2, yneg=0.0, icpt=0, tol=1e-6, disp=0.0, moi=200, mii=0, thr=0.01); print("\n\n\n\n\n\n\nTest Results:"); print("Optimal AIC: " + AIC); diff --git a/scripts/builtin/stepGLM.dml b/scripts/builtin/stepGLM.dml index d4f894a52ac..00230ca455c 100644 --- a/scripts/builtin/stepGLM.dml +++ b/scripts/builtin/stepGLM.dml @@ -58,7 +58,7 @@ source("./scripts/builtin/glm.dml") as glm; -stepGLM = function ( +m_stepGLM = function ( Matrix[Double] X, Matrix[Double] Y, Int link = 2, diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index e21c539d6d8..62145124d82 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -333,6 +333,7 @@ public enum Builtins { STATSNA("statsNA", true), STRATSTATS("stratstats", true), STEPLM("steplm",true, ReturnType.MULTI_RETURN), + STEPGLM("stepGLM",true, ReturnType.MULTI_RETURN), STFT("stft", false, ReturnType.MULTI_RETURN), SQRT("sqrt", false), SQRT_MATRIX("sqrtMatrix", true), From c3410638cf2748de20b429a34a58674f39fddd4e Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 1 Jul 2026 16:40:11 +0200 Subject: [PATCH 12/12] add BuiltinSTEPGlmTest.java --- .../builtin/part2/BuiltinSTEPGlmTest.java | 68 +++++++++++++++++++ .../scripts/functions/builtin/stepGLM.dml | 4 ++ 2 files changed, 72 insertions(+) create mode 100644 src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinSTEPGlmTest.java rename scripts/algorithms/TestBuiltinStepGLM.dml => src/test/scripts/functions/builtin/stepGLM.dml (97%) diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinSTEPGlmTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinSTEPGlmTest.java new file mode 100644 index 00000000000..7d95dc6ec28 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinSTEPGlmTest.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.builtin.part2; + +import org.junit.Test; +import org.apache.sysds.common.Types.ExecMode; +import org.apache.sysds.common.Types.ExecType; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; + +public class BuiltinSTEPGlmTest extends AutomatedTestBase +{ + private final static String TEST_NAME = "stepGLM"; + private final static String TEST_DIR = "functions/builtin/"; + private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinSTEPGlmTest.class.getSimpleName() + "/"; + + @Override + public void setUp() { + addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{})); + } + + @Test + public void testLmMatrixDenseCPlm() { + runSTEPGlmTest(ExecType.CP); + } + + @Test + public void testLmMatrixSparseSPlm() { + runSTEPGlmTest(ExecType.SPARK); + } + + private void runSTEPGlmTest(ExecType instType) { + ExecMode platformOld = setExecMode(instType); + + try { + loadTestConfiguration(getTestConfiguration(TEST_NAME)); + + String HOME = SCRIPT_DIR + TEST_DIR; + + // Pointing to the generated validation DML script + fullDMLScriptName = HOME + TEST_NAME + ".dml"; + programArgs = new String[]{}; + + // runTest executes the script; fails if the DML script invokes stop() + runTest(true, false, null, -1); + } + finally { + rtplatform = platformOld; + } + } +} diff --git a/scripts/algorithms/TestBuiltinStepGLM.dml b/src/test/scripts/functions/builtin/stepGLM.dml similarity index 97% rename from scripts/algorithms/TestBuiltinStepGLM.dml rename to src/test/scripts/functions/builtin/stepGLM.dml index 52d393bd29f..95a4dfb9908 100644 --- a/scripts/algorithms/TestBuiltinStepGLM.dml +++ b/src/test/scripts/functions/builtin/stepGLM.dml @@ -61,3 +61,7 @@ if (max(abs(beta_est - beta_true)) > epsilon) { } print("passed test 2") + + +#stop("!!!Sucess!!!") # uncomment for letting the test fail +