From 274f0c82d31b0e7c406ed06e15b771f34aca42ca Mon Sep 17 00:00:00 2001
From: bruno <bruno.ignaz.rothschild@gmail.com>
Date: Sun, 24 May 2026 10:35:02 +0200
Subject: [PATCH 01/12] refactor StepGLM.dml from a script into a function and
 resolve a few nested blocks

---
 scripts/builtin/StepGLM.dml | 1202 +++++++++++++++++++++++++++++++++++
 1 file changed, 1202 insertions(+)
 create mode 100644 scripts/builtin/StepGLM.dml

diff --git a/scripts/builtin/StepGLM.dml b/scripts/builtin/StepGLM.dml
new file mode 100644
index 00000000000..3bbaf361650
--- /dev/null
+++ b/scripts/builtin/StepGLM.dml
@@ -0,0 +1,1202 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#  
+# THIS SCRIPT CHOOSES A GLM REGRESSION MODEL IN A STEPWISE ALGIRITHM USING AIC
+# EACH GLM REGRESSION IS SOLVED USING NEWTON/FISHER SCORING WITH TRUST REGIONS
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME  TYPE     DEFAULT   MEANING
+# ---------------------------------------------------------------------------------------------
+# X     Matrix    ---      Matrix X of feature vectors
+# Y     Matrix    ---      Response Matrix Y with 1 column
+# link  Int       2        Link function code: 1 = log, 2 = Logit, 3 = Probit, 4 = Cloglog
+# yneg  Double    0.0      Response value for Bernoulli "No" label, usually 0.0 or -1.0
+# icpt  Int       0        Intercept presence, X columns shifting and rescaling:
+#                          0 = no intercept, no shifting, no rescaling;
+#                          1 = add intercept, but neither shift nor rescale X;
+#                          2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# tol   Double    0.000001 Tolerance (epsilon)
+# disp  Double    0.0      (Over-)dispersion value, or 0.0 to estimate it from data
+# moi   Int       200      Maximum number of outer (Newton / Fisher Scoring) iterations
+# mii   Int       0        Maximum number of inner (Conjugate Gradient) iterations, 0 = no maximum
+# thr   Double    0.01     Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr
+#                          no further features are being checked and the algorithm stops 
+# ---------------------------------------------------------------------------------------------
+# OUTPUT: Matrix beta, whose size depends on icpt:
+#     icpt=0: ncol(X) x 1;  icpt=1: (ncol(X) + 1) x 1;  icpt=2: (ncol(X) + 1) x 2
+#
+# B     Matrix    ---      Estimated regression parameters (betas)
+# S     Matrix    ---      The selected features ordered as computed by the algorithm
+# O     String    ---      The statistics
+# ---------------------------------------------------------------------------------------------
+
+# THE StepGLM SCRIPT CURRENTLY SUPPORTS BERNOULLI DISTRIBUTION FAMILY AND THE FOLLOWING LINK FUNCTIONS ONLY!
+# 	- LOG
+#	- LOGIT
+#	- PROBIT
+#	- CLOGLOG
+
+# In addition, in the last run of GLM some statistics are provided in CSV format, one comma-separated name-value
+# pair per each line, as follows:
+#
+# NAME                  MEANING
+# -------------------------------------------------------------------------------------------
+# TERMINATION_CODE      A positive integer indicating success/failure as follows:
+#                       1 = Converged successfully; 2 = Maximum number of iterations reached; 
+#                       3 = Input (X, Y) out of range; 4 = Distribution/link is not supported
+# BETA_MIN              Smallest beta value (regression coefficient), excluding the intercept
+# BETA_MIN_INDEX        Column index for the smallest beta value
+# BETA_MAX              Largest beta value (regression coefficient), excluding the intercept
+# BETA_MAX_INDEX        Column index for the largest beta value
+# INTERCEPT             Intercept value, or NaN if there is no intercept (if icpt=0)
+# DISPERSION            Dispersion used to scale deviance, provided as "disp" input parameter
+#                       or estimated (same as DISPERSION_EST) if the "disp" parameter is <= 0
+# DISPERSION_EST        Dispersion estimated from the dataset
+# DEVIANCE_UNSCALED     Deviance from the saturated model, assuming dispersion == 1.0
+# DEVIANCE_SCALED       Deviance from the saturated model, scaled by the DISPERSION value
+# -------------------------------------------------------------------------------------------
+
+
+stepGLM = function (
+                    Matrix[Double] X, 
+                    Matrix[Double] Y, 
+                    Int link = 2,
+                    Double yneg = 0.0,
+                    Int icpt = 0,
+                    Double tol = 0.000001,
+                    Double disp = 0.0,
+                    Int moi = 200,
+                    Int mii = 0,
+                    Double thr = 0.01,
+) 
+    return (
+            Double AIC,
+            Matrix[Double] B, 
+            Matrix[Double] S, 
+            Matrix[Double] O,
+        )
+    {
+        intercept_status = icpt;
+        bernoulli_No_label = yneg;
+        distribution_type = 2;
+
+        # currently only the forward selection strategy in supported: start from one feature and iteratively add 
+        # features until AIC improves
+        dir = "forward";
+
+        if (distribution_type == 2 & ncol(Y) == 1) {
+            is_Y_negative = (Y == bernoulli_No_label);
+            Y = cbind (1 - is_Y_negative, is_Y_negative);
+            count_Y_negative = sum (is_Y_negative);
+            if (count_Y_negative == 0) {
+                stop ("StepGLM Input Error: all Y-values encode Bernoulli YES-label, none encode NO-label");
+            }
+            if (count_Y_negative == nrow(Y)) {
+                stop ("StepGLM Input Error: all Y-values encode Bernoulli NO-label, none encode YES-label");
+            }
+        }
+
+        X_orig = X;
+        num_records = nrow (X_orig);
+        num_features = ncol (X_orig);
+
+        # BEGIN STEPWISE GENERALIZED LINEAR MODELS 
+
+        if (dir != "forward") {  
+            stop ("Currently only forward selection strategy is supported!");
+        }
+
+        continue = TRUE;
+        columns_fixed = matrix (0, rows = 1, cols = num_features);
+        columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
+
+        # X_global stores the best model found at each step 
+        X_global = matrix (0, rows = num_records, cols = 1);
+
+        if (intercept_status == 0) {
+            # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
+            [AIC_best, _beta, _S, _O] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered);
+        } else {
+            # compute AIC of an empty model with only intercept (all Ys are constant)
+            all_ones = matrix (1, rows = num_records, cols = 1);
+            [AIC_best, _beta, _S, _O] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered);
+        }
+        #print ("Best AIC without any features: " + AIC_best);
+
+        # First pass to examine single features
+        AICs = matrix (AIC_best, rows = 1, cols = num_features);
+        parfor (i in 1:num_features) { 	
+            [AIC_1, _beta, _S, _O] = glm_fit (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered);
+            AICs[1,i] = AIC_1;
+        }
+
+        # Determine the best AIC 
+        column_best = 0;	
+        for (k in 1:num_features) {
+            AIC_cur = as.scalar (AICs[1,k]);
+            if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) {
+                column_best = k;
+                AIC_best = as.scalar(AICs[1,k]);
+            }
+        }
+
+        if (column_best == 0) {
+            #print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
+            if (intercept_status == 0) {
+                # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
+                [AIC_best, _beta, _S, _O] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered);
+            } else {
+                # compute AIC of an empty model with only intercept (all Ys are constant)
+                ###all_ones = matrix (1, rows = num_records, cols = 1);
+                [AIC_best, _beta, _S, _O] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered);
+            }
+        };
+
+        # print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);	
+        columns_fixed[1,column_best] = 1;
+        columns_fixed_ordered[1,1] = column_best;
+        X_global = X_orig[,column_best];		
+
+        while (continue) {
+            # Subsequent passes over the features
+            parfor (i in 1:num_features) { 
+                if (as.scalar(columns_fixed[1,i]) == 0) {	
+
+                    # Construct the feature matrix
+                    X = cbind (X_global, X_orig[,i]);
+
+                    [AIC_2, _beta, _S, _O] = glm_fit (X, Y, intercept_status, num_features, columns_fixed_ordered);
+                    AICs[1,i] = AIC_2;
+                }		
+            }
+
+            # Determine the best AIC
+            for (k in 1:num_features) {
+                AIC_cur = as.scalar (AICs[1,k]);
+                if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) {
+                    column_best = k;
+                    AIC_best = as.scalar(AICs[1,k]);
+                }
+            }
+
+            # cbind best found features (i.e., columns) to X_global
+            if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found
+                #print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
+                columns_fixed[1,column_best] = 1;
+                columns_fixed_ordered = cbind (columns_fixed_ordered, as.matrix(column_best));
+                if (ncol(columns_fixed_ordered) == num_features) { # all features examined
+                    X_global = cbind (X_global, X_orig[,column_best]);
+                    continue = FALSE;
+                } else {
+                    X_global = cbind (X_global, X_orig[,column_best]);
+                }
+            } else {
+                continue = FALSE;
+            }
+        }
+
+        # run GLM with selected set of features
+        print ("Running GLM with selected features...");
+        [AIC, beta, S, O] = glm_fit (X_global, Y, intercept_status, num_features, columns_fixed_ordered);
+        return (AIC, beta, S, O)
+    }
+
+
+################### UDFS USED IN THIS SCRIPT ##################
+
+glm_fit = function (
+                    Matrix[Double] X, 
+                    Matrix[Double] Y, 
+                    Int intercept_status, 
+                    Double num_features_orig, 
+                    Matrix[Double] Selected, 
+                    Int link = 2,
+                    Double disp = 0.0,
+                    Double tol = 0.000001,
+                    Int moi = 200,
+                    Int mii = 0,
+) 
+    return (
+            Double AIC, 
+            Matrix[Double] beta,
+            Matrix[Double] S,
+            String O,
+        ) 
+    {
+
+        # distribution family code: 1 = Power, 2 = Bernoulli/Binomial; currently only Bernouli distribution family is supported!		
+        distribution_type = 2;                		# $dfam = 2;
+        variance_as_power_of_the_mean = 0.0;  		# $vpow = 0.0;
+        # link function code: 0 = canonical (depends on distribution), 1 = Power, 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit;
+        # currently only log (link = 1), logit (link = 2), probit (link = 3), and cloglog (link = 4) are supported!
+        link_type = link;
+        link_as_power_of_the_mean = 0.0;      		# $lpow = 0.0;
+
+        dispersion = disp;
+        eps = tol;
+        max_iteration_IRLS = moi;
+        max_iteration_CG = mii;
+
+        variance_as_power_of_the_mean = as.double (variance_as_power_of_the_mean);
+        link_as_power_of_the_mean = as.double (link_as_power_of_the_mean);
+
+        dispersion = as.double (dispersion);
+        eps = as.double (eps);              
+
+        # Default values for output statistics:
+        regularization = 0.0;
+        termination_code     = 0.0;
+        min_beta             = NaN;
+        i_min_beta           = NaN;
+        max_beta             = NaN;
+        i_max_beta           = NaN;
+        intercept_value      = NaN;
+        dispersion           = NaN;
+        estimated_dispersion = NaN;
+        deviance_nodisp      = NaN;
+        deviance             = NaN;
+
+        #####   INITIALIZE THE PARAMETERS   #####
+
+        num_records  = nrow (X);
+        num_features = ncol (X);
+        zeros_r = matrix (0, rows = num_records, cols = 1);
+        ones_r = 1 + zeros_r;
+
+        # Introduce the intercept, shift and rescale the columns of X if needed
+
+        if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
+            X = cbind (X, ones_r);
+            num_features = ncol (X);
+        }
+
+        scale_lambda = matrix (1, rows = num_features, cols = 1);
+        if (intercept_status == 1 | intercept_status == 2) {
+            scale_lambda [num_features, 1] = 0;
+        }
+
+        if (intercept_status == 2) {  # scale-&-shift X columns to mean 0, variance 1
+            # Important assumption: X [, num_features] = ones_r
+            avg_X_cols = t(colSums(X)) / num_records;
+            var_X_cols = (t(colSums (X ^ 2)) - num_records * (avg_X_cols ^ 2)) / (num_records - 1);
+            is_unsafe = (var_X_cols <= 0);
+            scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+            scale_X [num_features, 1] = 1;
+            shift_X = - avg_X_cols * scale_X;
+            shift_X [num_features, 1] = 0;
+            rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
+        } else {
+            scale_X = matrix (1, rows = num_features, cols = 1);
+            shift_X = matrix (0, rows = num_features, cols = 1);
+            rowSums_X_sq = rowSums (X ^ 2);
+        }	
+
+        # Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
+        # with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
+        # The transform is then associatively applied to the other side of the expression,
+        # and is rewritten via "scale_X" and "shift_X" as follows:
+        #
+        # ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A    --- is rewritten as:
+        # ssX_A  = diag (scale_X) %*% A;
+        # ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A;
+        #
+        # tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A   --- is rewritten as:
+        # tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ];
+
+        # Initialize other input-dependent parameters
+
+        lambda = scale_lambda * regularization;
+        if (max_iteration_CG == 0) {
+            max_iteration_CG = num_features;
+        }
+
+        # Set up the canonical link, if requested [Then we have: Var(mu) * (d link / d mu) = const]
+
+        if (link_type == 0) {
+            if (distribution_type == 1) {
+                link_type = 1;
+                link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean;
+            } else { 
+                if (distribution_type == 2) {
+                    link_type = 2;
+                }   
+            }   
+        }
+
+        # For power distributions and/or links, we use two constants,
+        # "variance as power of the mean" and "link_as_power_of_the_mean",
+        # to specify the variance and the link as arbitrary powers of the
+        # mean.  However, the variance-powers of 1.0 (Poisson family) and
+        # 2.0 (Gamma family) have to be treated as special cases, because
+        # these values integrate into logarithms.  The link-power of 0.0
+        # is also special as it represents the logarithm link.
+
+        num_response_columns = ncol (Y);
+        is_supported = 0;              
+        if (num_response_columns == 2 & distribution_type == 2 & link_type >= 1 & link_type <= 4) { # BERNOULLI DISTRIBUTION
+            is_supported = 1;			  
+        }
+        if (num_response_columns == 1 & distribution_type == 2) {
+            print ("Error: Bernoulli response matrix has not been converted into two-column format.");
+        }
+
+        if (is_supported != 1) {
+            stop ("Response matrix with " + num_response_columns + " columns, distribution family (" + distribution_type + ", " + variance_as_power_of_the_mean
+                  + ") and link family (" + link_type + ", " + link_as_power_of_the_mean + ") are NOT supported together.");
+        }
+
+        #####   INITIALIZE THE BETAS   #####
+
+        [beta, saturated_log_l, isNaN] = 
+            glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG);
+
+        # print(" --- saturated logLik " + saturated_log_l);
+
+        if (isNaN != 0) {
+            stop ("Input matrices X and/or Y are out of range!"); 
+        }
+
+        #####  START OF THE MAIN PART  #####
+
+        sum_X_sq = sum (rowSums_X_sq);
+        trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq));
+        ###  max_trust_delta = trust_delta * 10000.0;
+        log_l = 0.0;
+        deviance_nodisp = 0.0;
+        new_deviance_nodisp = 0.0;
+        isNaN_log_l = 2;
+        newbeta = beta;
+        g = matrix (0.0, rows = num_features, cols = 1);
+        g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
+        accept_new_beta = 1;
+        reached_trust_boundary = 0;
+        neg_log_l_change_predicted = 0.0;
+        i_IRLS = 0;
+
+        # print ("BEGIN IRLS ITERATIONS...");
+
+        ssX_newbeta = diag (scale_X) %*% newbeta;
+        ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
+        all_linear_terms = X %*% ssX_newbeta;
+
+        [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
+        (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+
+        if (isNaN_new_log_l == 0) {
+            new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
+            new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
+        }
+
+        while (termination_code == 0) {
+            accept_new_beta = 1;
+
+            if (i_IRLS > 0) {
+                if (isNaN_log_l == 0) {
+                    accept_new_beta = 0;
+                }
+
+                # Decide whether to accept a new iteration point and update the trust region
+                # See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. by Nocedal and Wright
+
+                rho = (- new_log_l + log_l) / neg_log_l_change_predicted;
+                if (rho < 0.25 | isNaN_new_log_l == 1) {
+                    trust_delta = 0.25 * trust_delta;
+                }
+                if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) {
+                    trust_delta = 2 * trust_delta;
+
+                    ### if (trust_delta > max_trust_delta) {
+                    ###     trust_delta = max_trust_delta;
+                    ### }
+                }
+                if (rho > 0.1 & isNaN_new_log_l == 0) {
+                    accept_new_beta = 1;
+                }
+            }
+
+            if (accept_new_beta == 1) {
+                beta = newbeta;  log_l = new_log_l;  deviance_nodisp = new_deviance_nodisp;  isNaN_log_l = isNaN_new_log_l;
+
+                [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+
+                # We introduced these variables to avoid roundoff errors:
+                #     g_Y = y_residual / (y_var * link_grad);
+                #     w   = 1.0 / (y_var * link_grad * link_grad);
+
+                gXY = - t(X) %*% g_Y;
+                g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ];
+                g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
+            }
+
+            [z, neg_log_l_change_predicted, num_CG_iters, reached_trust_boundary] = 
+                get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG);
+
+            newbeta = beta + z;
+
+            ssX_newbeta = diag (scale_X) %*% newbeta;
+            ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
+            all_linear_terms = X %*% ssX_newbeta;
+
+            [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
+            (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+
+            if (isNaN_new_log_l == 0) {
+                new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
+                new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
+            }
+
+            log_l_change = new_log_l - log_l;               # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps
+
+            if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 & 
+                (2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) ) {
+                termination_code = 1;
+            }
+            rho = - log_l_change / neg_log_l_change_predicted;
+            z_norm = sqrt (sum (z * z));
+
+            i_IRLS = i_IRLS + 1;                        
+
+            if (i_IRLS == max_iteration_IRLS) {
+                termination_code = 2;
+            }
+        }
+
+        beta = newbeta;
+        log_l = new_log_l;
+        deviance_nodisp = new_deviance_nodisp;
+
+        #---------------------------- last part
+
+        if (termination_code != 1) {
+            print ("One of the runs of GLM did not converged in " + i_IRLS + " steps!");
+        }
+
+        ##### COMPUTE AIC ##### 
+
+        if (distribution_type == 2 & link_type >= 1 & link_type <= 4) {			
+            AIC = -2 * log_l;
+            if (sum (X) != 0) {
+                AIC = AIC + 2 * num_features;	
+            }
+        } else {
+            stop ("Currently only the Bernoulli distribution family the following link functions are supported: log, logit, probit, and cloglog!");
+        }
+
+        # Output which features give the best AIC and are being used for linear regression 
+        S = Selected;
+
+        ssX_beta = diag (scale_X) %*% beta;
+        ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta;
+        if (intercept_status == 2) {
+            beta_out = cbind (ssX_beta, beta);
+        } else {
+            beta_out = ssX_beta;
+        }
+
+        if (intercept_status == 0 & num_features == 1) {
+            p = sum (X == 1);
+            if (p == num_records) {
+                beta_out = beta_out[1,];
+            }					
+        } 
+
+
+        if (intercept_status == 1 | intercept_status == 2) {
+            intercept_value = as.scalar (beta_out [num_features, 1]);
+            beta_noicept = beta_out [1 : (num_features - 1), 1];
+        } else {
+            beta_noicept = beta_out [1 : num_features, 1];
+        }
+        min_beta = min (beta_noicept);
+        max_beta = max (beta_noicept);
+        tmp_i_min_beta = rowIndexMin (t(beta_noicept))
+        i_min_beta = as.scalar (tmp_i_min_beta [1, 1]);
+        tmp_i_max_beta = rowIndexMax (t(beta_noicept))
+        i_max_beta = as.scalar (tmp_i_max_beta [1, 1]);
+
+        #####  OVER-DISPERSION PART  #####
+
+        all_linear_terms = X %*% ssX_beta;
+        [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+
+        pearson_residual_sq = g_Y ^ 2 / w;
+        pearson_residual_sq = replace (target = pearson_residual_sq, pattern = NaN, replacement = 0);
+        # pearson_residual_sq = (y_residual ^ 2) / y_var;
+
+        if (num_records > num_features) {
+            estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features);
+        }
+        if (dispersion <= 0) {
+            dispersion = estimated_dispersion;
+        }
+        deviance = deviance_nodisp / dispersion;
+
+        #####  END OF THE MAIN PART  #####
+
+        str = "BETA_MIN," + min_beta;
+        str = append (str, "BETA_MIN_INDEX," + i_min_beta);
+        str = append (str, "BETA_MAX," + max_beta);
+        str = append (str, "BETA_MAX_INDEX," + i_max_beta);
+        str = append (str, "INTERCEPT," + intercept_value);
+        str = append (str, "DISPERSION," + dispersion);
+        str = append (str, "DISPERSION_EST," + estimated_dispersion);
+        str = append (str, "DEVIANCE_UNSCALED," + deviance_nodisp);
+        str = append (str, "DEVIANCE_SCALED," + deviance);
+        O = str
+
+        # Prepare the output matrix
+        print ("Writing the output matrix...");
+        if (intercept_status == 0 & num_features == 1) { 
+            if (p == num_records) {
+                beta_out_tmp = matrix (0, rows = num_features_orig + 1, cols = 1); 
+                beta_out_tmp[num_features_orig + 1,] = beta_out;
+                beta_out = beta_out_tmp;
+                return AIC, beta_out, S, O;
+            } else if (sum (X) == 0){
+                beta_out = matrix (0, rows = num_features_orig, cols = 1);
+                return AIC, beta_out, S, O;
+            }
+        }
+
+        no_selected = ncol (Selected);
+        max_selected = max (Selected);
+        last = max_selected + 1;	
+
+        if (intercept_status != 0) {
+
+            Selected_ext = cbind (Selected, as.matrix (last));			
+            P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); 
+
+            if (intercept_status == 2) {
+
+                P1_ssX_beta = P1 * ssX_beta;
+                P2_ssX_beta = colSums (P1_ssX_beta);
+                P1_beta = P1 * beta;
+                P2_beta = colSums (P1_beta);
+
+                if (max_selected < num_features_orig) {
+
+                    P2_ssX_beta = cbind (P2_ssX_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
+                    P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
+
+                    P2_ssX_beta[1, num_features_orig+1] = P2_ssX_beta[1, max_selected + 1]; 
+                    P2_ssX_beta[1, max_selected + 1] = 0;
+
+                    P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1]; 
+                    P2_beta[1, max_selected + 1] = 0;
+
+                }
+                beta_out = cbind (t(P2_ssX_beta), t(P2_beta));
+
+            } else {
+
+                P1_beta = P1 * beta;
+                P2_beta = colSums (P1_beta);
+
+                if (max_selected < num_features_orig) {
+                    P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
+                    P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1] ; 
+                    P2_beta[1, max_selected + 1] = 0;
+                }
+                beta_out = t(P2_beta);
+
+            }
+        } else {
+
+            P1 = table (seq (1, no_selected), t(Selected)); 
+            P1_beta = P1 * beta;
+            P2_beta = colSums (P1_beta);	
+
+            if (max_selected < num_features_orig) {
+                P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
+            }		
+
+            beta_out = t(P2_beta);	
+        }
+
+        return AIC, beta_out, S, O;
+    }
+
+glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG)
+    return (Matrix[double] beta, double saturated_log_l, int isNaN)
+    {
+        saturated_log_l = 0.0;
+        isNaN = 0;
+        y_corr = Y [, 1];
+        if (dist_type == 2) {
+            n_corr = rowSums (Y);
+            is_n_zero = (n_corr == 0);
+            y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero;    
+        }
+        linear_terms = y_corr;
+        if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
+            if          (link_power ==  0) {
+                if (sum (y_corr < 0) == 0) {
+                    is_zero_y_corr = (y_corr == 0);
+                    linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+                } else { isNaN = 1; }
+                } else { if (link_power ==  1.0) {
+                    linear_terms = y_corr;
+                    } else { if (link_power == -1.0) {
+                        linear_terms = 1.0 / y_corr;
+                        } else { if (link_power ==  0.5) {
+                            if (sum (y_corr < 0) == 0) {
+                                linear_terms = sqrt (y_corr);
+                            } else { isNaN = 1; }
+                            } else { if (link_power >   0) {
+                                if (sum (y_corr < 0) == 0) {
+                                    is_zero_y_corr = (y_corr == 0);
+                                    linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
+                                } else { isNaN = 1; }
+                            } else {
+                                if (sum (y_corr <= 0) == 0) {
+                                    linear_terms = y_corr ^ link_power;
+                                } else { isNaN = 1; }
+            }}}}}
+        }
+        if (dist_type == 2 & link_type >= 1 & link_type <= 5)
+        { # BINOMIAL/BERNOULLI DISTRIBUTION
+            if          (link_type == 1 & link_power == 0)  { # Binomial.log
+                if (sum (y_corr < 0) == 0) {
+                    is_zero_y_corr = (y_corr == 0);
+                    linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+                } else { isNaN = 1; }
+                } else { if (link_type == 1 & link_power >  0)  { # Binomial.power_nonlog pos
+                    if (sum (y_corr < 0) == 0) {
+                        is_zero_y_corr = (y_corr == 0);
+                        linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
+                    } else { isNaN = 1; }
+                    } else { if (link_type == 1)                      { # Binomial.power_nonlog neg
+                        if (sum (y_corr <= 0) == 0) {
+                            linear_terms = y_corr ^ link_power;
+                        } else { isNaN = 1; }
+                    } else { 
+                        is_zero_y_corr = (y_corr <= 0);
+                        is_one_y_corr  = (y_corr >= 1.0);
+                        y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr);
+                        if (link_type == 2)                           { # Binomial.logit
+                            linear_terms = log (y_corr / (1.0 - y_corr)) 
+                            + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+                            } else { if (link_type == 3)                  { # Binomial.probit
+                                y_below_half = y_corr + (1.0 - 2.0 * y_corr) * (y_corr > 0.5);
+                                t = sqrt (- 2.0 * log (y_below_half));
+                                approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308)));
+                                linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * (y_corr > 0.5))
+                                + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+                                } else { if (link_type == 4)                  { # Binomial.cloglog
+                                    linear_terms = log (- log (1.0 - y_corr))
+                                    - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr)
+                                    + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+                                    } else { if (link_type == 5)                  { # Binomial.cauchit
+                                        linear_terms = tan ((y_corr - 0.5) * pi)
+                                        + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+            }}  }}}}}
+        }
+
+        if (isNaN == 0) {
+            [saturated_log_l, isNaN] = 
+                glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power);
+        }
+
+        if ((dist_type == 1 & link_type == 1 & link_power == 0) |
+            (dist_type == 2 & link_type >= 2))
+        {    
+            desired_eta = 0.0;
+            } else { if (link_type == 1 & link_power == 0) {
+                desired_eta = log (0.5);
+                } else { if (link_type == 1) {
+                    desired_eta = 0.5 ^ link_power;
+                } else {
+                    desired_eta = 0.5;
+        }}}
+
+        beta = matrix (0.0, rows = ncol(X), cols = 1);
+
+        if (desired_eta != 0) {
+            if (icept_status == 1 | icept_status == 2) {
+                beta [nrow(beta), 1] = desired_eta;
+            } else {
+                # We want: avg (X %*% ssX_transform %*% beta) = desired_eta
+                # Note that "ssX_transform" is trivial here, hence ignored
+
+                beta = straightenX (X, 0.000001, max_iter_CG);  
+                beta = beta * desired_eta;
+    }   }   }
+
+
+glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y,
+                     int dist_type, double var_power, int link_type, double link_power)
+    return (Matrix[double] g_Y, Matrix[double] w)
+# ORIGINALLY we returned more meaningful vectors, namely:
+# Matrix[double] y_residual    : y - y_mean, i.e. y observed - y predicted
+# Matrix[double] link_gradient : derivative of the link function
+# Matrix[double] var_function  : variance without dispersion, i.e. the V(mu) function
+# BUT, this caused roundoff errors, so we had to compute "directly useful" vectors
+# and skip over the "meaningful intermediaries".  Now we output these two variables:
+#     g_Y = y_residual / (var_function * link_gradient);
+#     w   = 1.0 / (var_function * link_gradient ^ 2);
+    {
+        num_records = nrow (linear_terms);
+        zeros_r = matrix (0.0, rows = num_records, cols = 1);
+        ones_r = 1 + zeros_r;
+        g_Y  = zeros_r;
+        w  = zeros_r;
+
+        # Some constants
+
+        one_over_sqrt_two_pi = 0.39894228040143267793994605993438;
+        ones_2 = matrix (1.0, rows = 1, cols = 2);
+        p_one_m_one = ones_2;
+        p_one_m_one [1, 2] = -1.0;
+        m_one_p_one = ones_2;
+        m_one_p_one [1, 1] = -1.0;
+        zero_one = ones_2;
+        zero_one [1, 1] = 0.0;
+        one_zero = ones_2;
+        one_zero [1, 2] = 0.0;
+        flip_pos = matrix (0, rows = 2, cols = 2);
+        flip_neg = flip_pos;
+        flip_pos [1, 2] = 1;
+        flip_pos [2, 1] = 1;
+        flip_neg [1, 2] = -1;
+        flip_neg [2, 1] = 1;
+
+        if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
+            y_mean = zeros_r;
+            if          (link_power ==  0) {
+                y_mean = exp (linear_terms);
+                y_mean_pow = y_mean ^ (1 - var_power);
+                w   = y_mean_pow * y_mean;
+                g_Y = y_mean_pow * (Y - y_mean);
+                } else { if (link_power ==  1.0) {
+                    y_mean = linear_terms;
+                    w   = y_mean ^ (- var_power);
+                    g_Y = w * (Y - y_mean);
+                } else {
+                    y_mean = linear_terms ^ (1.0 / link_power);
+                    c1  = (1 - var_power) / link_power - 1;
+                    c2  = (2 - var_power) / link_power - 2;
+                    g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power;
+                    w   = (linear_terms ^ c2) / (link_power ^ 2);
+        }   }}
+        if (dist_type == 2 & link_type >= 1 & link_type <= 5)
+        { # BINOMIAL/BERNOULLI DISTRIBUTION
+            if (link_type == 1) { # BINOMIAL.POWER LINKS
+                if (link_power == 0)  { # Binomial.log
+                    vec1 = 1 / (exp (- linear_terms) - 1);
+                    g_Y = Y [, 1] - Y [, 2] * vec1;
+                    w   = rowSums (Y) * vec1;
+                } else {                  # Binomial.nonlog
+                    vec1 = zeros_r;
+                    if (link_power == 0.5)  {
+                        vec1 = 1 / (1 - linear_terms ^ 2);
+                        } else { if (sum (linear_terms < 0) == 0) {
+                            vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power));
+                    } else {isNaN = 1;}}
+                    # We want a "zero-protected" version of
+                    #     vec2 = Y [, 1] / linear_terms;
+                    is_y_0 = (Y [, 1] == 0);
+                    vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0;
+                    g_Y =  (vec2 - Y [, 2] * vec1 * linear_terms) / link_power;
+                    w   =  rowSums (Y) * vec1 / link_power ^ 2;
+                }
+            } else {
+                is_LT_pos_infinite = (linear_terms == Inf);
+                is_LT_neg_infinite = (linear_terms == -Inf);
+                is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
+                finite_linear_terms = replace (target =        linear_terms, pattern =  Inf, replacement = 0);
+                finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0);
+                if (link_type == 2)                           { # Binomial.logit
+                    Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
+                    Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
+                    Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
+                    g_Y = rowSums (Y * (Y_prob %*% flip_neg));           ### = y_residual;
+                    w   = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob);  ### = y_variance;
+                    } else { if (link_type == 3)                  { # Binomial.probit
+                        is_lt_pos = (linear_terms >= 0);
+                        t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
+                        pt_gp = t_gp * ( 0.254829592 
+                                        + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
+                                                  + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
+                                                            + t_gp * (-1.453152027 
+                                                                      + t_gp *   1.061405429))));
+                        the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0);
+                        vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp);
+                        vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5);
+                        w   = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1;
+                        g_Y = one_over_sqrt_two_pi * vec2 / vec1;
+                        } else { if (link_type == 4)                  { # Binomial.cloglog
+                            the_exp = exp (linear_terms)
+                            the_exp_exp = exp (- the_exp);
+                            is_too_small = ((10000000 + the_exp) == 10000000);
+                            the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2);
+                            g_Y =  (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio;
+                            w   =  the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio;
+                            } else { if (link_type == 5)                  { # Binomial.cauchit
+                                Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi;
+                                Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
+                                y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1];
+                                var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2];
+                                link_gradient_normalized = (1 + linear_terms ^ 2) * pi;
+                                g_Y =  rowSums (Y) * y_residual / (var_function * link_gradient_normalized);
+                                w   = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2);
+                }}}}   
+            }
+        }
+    }
+
+
+glm_log_likelihood_part = function (Matrix[double] linear_terms, Matrix[double] Y,
+                                    int dist_type, double var_power, int link_type, double link_power)
+    return (double log_l, int isNaN)
+    {
+        isNaN = 0;
+        log_l = 0.0;
+        num_records = nrow (Y);
+        zeros_r = matrix (0.0, rows = num_records, cols = 1);
+
+        if (dist_type == 1 & link_type == 1)
+        { # POWER DISTRIBUTION
+            b_cumulant = zeros_r;
+            natural_parameters = zeros_r;
+            is_natural_parameter_log_zero = zeros_r;
+            if          (var_power == 1.0 & link_power == 0)  { # Poisson.log
+                b_cumulant = exp (linear_terms);
+                is_natural_parameter_log_zero = (linear_terms == -Inf);
+                natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0);
+                } else { if (var_power == 1.0 & link_power == 1.0)  { # Poisson.id
+                    if (sum (linear_terms < 0) == 0)  {
+                        b_cumulant = linear_terms;
+                        is_natural_parameter_log_zero = (linear_terms == 0);
+                        natural_parameters = log (linear_terms + is_natural_parameter_log_zero);
+                    } else {isNaN = 1;}
+                    } else { if (var_power == 1.0 & link_power == 0.5)  { # Poisson.sqrt
+                        if (sum (linear_terms < 0) == 0)  {
+                            b_cumulant = linear_terms ^ 2;
+                            is_natural_parameter_log_zero = (linear_terms == 0);
+                            natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero);
+                        } else {isNaN = 1;}
+                        } else { if (var_power == 1.0 & link_power  > 0)  { # Poisson.power_nonlog, pos
+                            if (sum (linear_terms < 0) == 0)  {
+                                is_natural_parameter_log_zero = (linear_terms == 0);
+                                b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
+                                natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
+                            } else {isNaN = 1;}
+                            } else { if (var_power == 1.0)                      { # Poisson.power_nonlog, neg
+                                if (sum (linear_terms <= 0) == 0) {
+                                    b_cumulant = linear_terms ^ (1.0 / link_power);
+                                    natural_parameters = log (linear_terms) / link_power;
+                                } else {isNaN = 1;}
+                                } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse
+                                    if (sum (linear_terms <= 0) == 0) {
+                                        b_cumulant = - log (linear_terms);
+                                        natural_parameters = - linear_terms;
+                                    } else {isNaN = 1;}
+                                    } else { if (var_power == 2.0 & link_power ==  1.0) { # Gamma.id
+                                        if (sum (linear_terms <= 0) == 0) {
+                                            b_cumulant = log (linear_terms);
+                                            natural_parameters = - 1.0 / linear_terms;
+                                        } else {isNaN = 1;}
+                                        } else { if (var_power == 2.0 & link_power ==  0) { # Gamma.log
+                                            b_cumulant = linear_terms;
+                                            natural_parameters = - exp (- linear_terms);
+                                            } else { if (var_power == 2.0)                      { # Gamma.power_nonlog
+                                                if (sum (linear_terms <= 0) == 0) {
+                                                    b_cumulant = log (linear_terms) / link_power;
+                                                    natural_parameters = - linear_terms ^ (- 1.0 / link_power);
+                                                } else {isNaN = 1;}
+                                                } else { if                    (link_power ==  0) { # PowerDist.log
+                                                    natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
+                                                    b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
+                                                } else {                                              # PowerDist.power_nonlog
+                                                    if          (-2 * link_power == 1.0 - var_power) {
+                                                        natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power);
+                                                        } else { if (-1 * link_power == 1.0 - var_power) {
+                                                            natural_parameters = 1.0 / linear_terms / (1.0 - var_power);
+                                                            } else { if (     link_power == 1.0 - var_power) {
+                                                                natural_parameters = linear_terms / (1.0 - var_power);
+                                                                } else { if ( 2 * link_power == 1.0 - var_power) {
+                                                                    natural_parameters = linear_terms ^ 2 / (1.0 - var_power);
+                                                                } else {
+                                                                    if (sum (linear_terms <= 0) == 0) {
+                                                                        power = (1.0 - var_power) / link_power;
+                                                                        natural_parameters = (linear_terms ^ power) / (1.0 - var_power);
+                                                                    } else {isNaN = 1;}
+                                                    }}}}
+                                                    if          (-2 * link_power == 2.0 - var_power) {
+                                                        b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power);
+                                                        } else { if (-1 * link_power == 2.0 - var_power) {
+                                                            b_cumulant = 1.0 / linear_terms / (2.0 - var_power);
+                                                            } else { if (     link_power == 2.0 - var_power) {
+                                                                b_cumulant = linear_terms / (2.0 - var_power);
+                                                                } else { if ( 2 * link_power == 2.0 - var_power) {
+                                                                    b_cumulant = linear_terms ^ 2 / (2.0 - var_power);
+                                                                } else {
+                                                                    if (sum (linear_terms <= 0) == 0) {
+                                                                        power = (2.0 - var_power) / link_power;
+                                                                        b_cumulant = (linear_terms ^ power) / (2.0 - var_power);
+                                                                    } else {isNaN = 1;}
+                                                    }}}}
+            }}}}} }}}}}
+            if (sum (is_natural_parameter_log_zero * abs (Y)) > 0) {
+                log_l = -Inf;
+                isNaN = 1;
+            }
+            if (isNaN == 0)
+            {
+                log_l = sum (Y * natural_parameters - b_cumulant);
+                if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
+                    log_l = -Inf;
+                    isNaN = 1;
+        }   }   }
+
+        if (dist_type == 2 & link_type >= 1 & link_type <= 5)
+        { # BINOMIAL/BERNOULLI DISTRIBUTION
+
+            [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power);
+
+            if (isNaN == 0) {            
+                does_prob_contradict = (Y_prob <= 0);
+                if (sum (does_prob_contradict * abs (Y)) == 0) {
+                    log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict));
+                    if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
+                        isNaN = 1;
+                    }
+                } else {
+                    log_l = -Inf;
+                    isNaN = 1;
+        }   }   }
+
+        if (isNaN == 1) {
+            log_l = - Inf; 
+        }
+    }
+
+
+
+binomial_probability_two_column =
+    function (Matrix[double] linear_terms, int link_type, double link_power)
+        return   (Matrix[double] Y_prob, int isNaN)
+        {
+            isNaN = 0;
+            num_records = nrow (linear_terms);
+
+            # Define some auxiliary matrices
+
+            ones_2 = matrix (1.0, rows = 1, cols = 2);
+            p_one_m_one = ones_2;
+            p_one_m_one [1, 2] = -1.0;
+            m_one_p_one = ones_2;
+            m_one_p_one [1, 1] = -1.0;
+            zero_one = ones_2;
+            zero_one [1, 1] = 0.0;
+            one_zero = ones_2;
+            one_zero [1, 2] = 0.0;
+
+            zeros_r = matrix (0.0, rows = num_records, cols = 1);
+            ones_r = 1.0 + zeros_r;
+
+            # Begin the function body
+
+            Y_prob = zeros_r %*% ones_2;
+            if (link_type == 1) { # Binomial.power
+                if          (link_power == 0) { # Binomial.log
+                    Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one;    
+                    } else { if (link_power == 0.5) { # Binomial.sqrt
+                        Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one;    
+                    } else {                          # Binomial.power_nonlog
+                        if (sum (linear_terms < 0) == 0) {
+                            Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one;    
+                        } else {isNaN = 1;}
+                }}
+            } else {              # Binomial.non_power
+                is_LT_pos_infinite = (linear_terms == Inf);
+                is_LT_neg_infinite = (linear_terms == -Inf);
+                is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
+                finite_linear_terms = replace (target =        linear_terms, pattern =  Inf, replacement = 0);
+                finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0);
+                if (link_type == 2)             { # Binomial.logit
+                    Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
+                    Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
+                    } else { if (link_type == 3)    { # Binomial.probit
+                        lt_pos_neg = (finite_linear_terms >= 0) %*% p_one_m_one + ones_r %*% zero_one;
+                        t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
+                        pt_gp = t_gp * ( 0.254829592 
+                                        + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
+                                                  + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
+                                                            + t_gp * (-1.453152027 
+                                                                      + t_gp *   1.061405429))));
+                        the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0);
+                        Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg);
+                        } else { if (link_type == 4)    { # Binomial.cloglog
+                            the_exp = exp (finite_linear_terms);
+                            the_exp_exp = exp (- the_exp);
+                            is_too_small = ((10000000 + the_exp) == 10000000);
+                            Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2);
+                            Y_prob [, 2] = the_exp_exp;
+                            } else { if (link_type == 5)    { # Binomial.cauchit
+                                Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi;
+                            } else {
+                                isNaN = 1;
+                }}}}
+                Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
+        }   }            
+
+
+# THE CG-STEIHAUG PROCEDURE SCRIPT
+
+# Apply Conjugate Gradient - Steihaug algorithm in order to approximately minimize
+# 0.5 z^T (X^T diag(w) X + diag (lambda)) z + (g + lambda * beta)^T z
+# under constraint:  ||z|| <= trust_delta.
+# See Alg. 7.2 on p. 171 of "Numerical Optimization" 2nd ed. by Nocedal and Wright
+# IN THE ABOVE, "X" IS UNDERSTOOD TO BE "X %*% (SHIFT/SCALE TRANSFORM)"; this transform
+# is given separately because sparse "X" may become dense after applying the transform.
+#
+get_CG_Steihaug_point =
+    function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w,
+              Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG)
+        return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary)
+        {
+            trust_delta_sq = trust_delta ^ 2;
+            size_CG = nrow (g);
+            z = matrix (0.0, rows = size_CG, cols = 1);
+            neg_log_l_change = 0.0;
+            reached_trust_boundary = 0;
+            g_reg = g + lambda * beta;
+            r_CG = g_reg;
+            p_CG = -r_CG;
+            rr_CG = sum(r_CG * r_CG);
+            eps_CG = rr_CG * min (0.25, sqrt (rr_CG));
+            converged_CG = 0;
+            if (rr_CG < eps_CG) {
+                converged_CG = 1;
+            }
+
+            max_iteration_CG = max_iter_CG;
+            if (max_iteration_CG <= 0) {
+                max_iteration_CG = size_CG;
+            }
+            i_CG = 0;
+            while (converged_CG == 0)
+            {
+                i_CG = i_CG + 1;
+                ssX_p_CG = diag (scale_X) %*% p_CG;
+                ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG;
+                temp_CG = t(X) %*% (w * (X %*% ssX_p_CG));
+                q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ];
+                pq_CG = sum (p_CG * q_CG);
+                if (pq_CG <= 0) {
+                    pp_CG = sum (p_CG * p_CG);  
+                    if (pp_CG > 0) {
+                        [z, neg_log_l_change] = 
+                            get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
+                        reached_trust_boundary = 1;
+                    } else {
+                        neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
+                    }
+                    converged_CG = 1;
+                }
+                if (converged_CG == 0) {
+                    alpha_CG = rr_CG / pq_CG;
+                    new_z = z + alpha_CG * p_CG;
+                    if (sum(new_z * new_z) >= trust_delta_sq) {
+                        pp_CG = sum (p_CG * p_CG);  
+                        [z, neg_log_l_change] = 
+                            get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
+                        reached_trust_boundary = 1;
+                        converged_CG = 1;
+                    }
+                    if (converged_CG == 0) {
+                        z = new_z;
+                        old_rr_CG = rr_CG;
+                        r_CG = r_CG + alpha_CG * q_CG;
+                        rr_CG = sum(r_CG * r_CG);
+                        if (i_CG == max_iteration_CG | rr_CG < eps_CG) {
+                            neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
+                            reached_trust_boundary = 0;
+                            converged_CG = 1;
+                        }
+                        if (converged_CG == 0) {
+                            p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG;
+        }   }   }   }   }
+
+
+# An auxiliary function used twice inside the CG-STEIHAUG loop:
+get_trust_boundary_point = 
+    function (Matrix[double] g, Matrix[double] z, Matrix[double] p, 
+              Matrix[double] q, Matrix[double] r, double pp, double pq, 
+              double trust_delta_sq)
+        return (Matrix[double] new_z, double f_change)
+        {
+            zz = sum (z * z);  pz = sum (p * z);
+            sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq));
+            tau_1 = (- pz + sq_root_d) / pp;
+            tau_2 = (- pz - sq_root_d) / pp;
+            zq = sum (z * q);  gp = sum (g * p);
+            f_extra = 0.5 * sum (z * (r + g));
+            f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1;
+            f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2;
+            ind1 = as.integer(f_change_1 < f_change_2);
+            ind2 = as.integer(f_change_1 >= f_change_2);
+            new_z = z + ((ind1 * tau_1 + ind2 * tau_2) * p);
+            f_change = ind1 * f_change_1 + ind2 * f_change_2;
+        }
+
+
+# Computes vector w such that  ||X %*% w - 1|| -> MIN  given  avg(X %*% w) = 1
+# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
+# it to compute  w = c * z_LS  such that  sum(X %*% w) = nrow(X).
+straightenX =
+    function (Matrix[double] X, double eps, int max_iter_CG)
+        return   (Matrix[double] w)
+        {
+            w_X = t(colSums(X));
+            lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
+            eps_LS = eps * nrow(X);
+
+            # BEGIN LEAST SQUARES
+
+            r_LS = - w_X;
+            z_LS = matrix (0.0, rows = ncol(X), cols = 1);
+            p_LS = - r_LS;
+            norm_r2_LS = sum (r_LS ^ 2);
+            i_LS = 0;
+            while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS)
+            {
+                q_LS = t(X) %*% X %*% p_LS;
+                q_LS = q_LS + lambda_LS * p_LS;
+                alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
+                z_LS = z_LS + alpha_LS * p_LS;
+                old_norm_r2_LS = norm_r2_LS;
+                r_LS = r_LS + alpha_LS * q_LS;
+                norm_r2_LS = sum (r_LS ^ 2);
+                p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
+                i_LS = i_LS + 1;
+            }
+
+            # END LEAST SQUARES
+
+            w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
+        }
+
+

From 78018f9b09bd80e7c47f24fac83e4c1955f13f7a Mon Sep 17 00:00:00 2001
From: bruno <bruno.ignaz.rothschild@gmail.com>
Date: Fri, 26 Jun 2026 01:16:07 +0200
Subject: [PATCH 02/12] rename builtin StepGLM.dml to stepglm.dml

---
 scripts/builtin/{StepGLM.dml => stepglm.dml} | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)
 rename scripts/builtin/{StepGLM.dml => stepglm.dml} (99%)

diff --git a/scripts/builtin/StepGLM.dml b/scripts/builtin/stepglm.dml
similarity index 99%
rename from scripts/builtin/StepGLM.dml
rename to scripts/builtin/stepglm.dml
index 3bbaf361650..cd2d881ecba 100644
--- a/scripts/builtin/StepGLM.dml
+++ b/scripts/builtin/stepglm.dml
@@ -88,12 +88,11 @@ stepGLM = function (
                     Int moi = 200,
                     Int mii = 0,
                     Double thr = 0.01,
-) 
-    return (
-            Double AIC,
-            Matrix[Double] B, 
-            Matrix[Double] S, 
-            Matrix[Double] O,
+) return (
+          Double AIC,
+          Matrix[Double] B, 
+          Matrix[Double] S, 
+          Matrix[Double] O,
         )
     {
         intercept_status = icpt;

From 739c5d283a0c82622e42bced0c43079c3e1ce153 Mon Sep 17 00:00:00 2001
From: bruno <bruno.ignaz.rothschild@gmail.com>
Date: Fri, 26 Jun 2026 01:18:10 +0200
Subject: [PATCH 03/12] rename builtin stepglm.dml to stepGLM.dml

---
 scripts/builtin/{stepglm.dml => stepGLM.dml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename scripts/builtin/{stepglm.dml => stepGLM.dml} (100%)

diff --git a/scripts/builtin/stepglm.dml b/scripts/builtin/stepGLM.dml
similarity index 100%
rename from scripts/builtin/stepglm.dml
rename to scripts/builtin/stepGLM.dml

From 9dc956bd0b6728a43d08e6eb8563490484755cdc Mon Sep 17 00:00:00 2001
From: bruno <bruno.ignaz.rothschild@gmail.com>
Date: Fri, 26 Jun 2026 05:44:12 +0200
Subject: [PATCH 04/12] fix syntax errors in stepGLM

---
 scripts/algorithms/TestBuiltinStepGLM.dml |   21 +
 scripts/builtin/stepGLM.dml               | 1102 ++++++++++-----------
 2 files changed, 570 insertions(+), 553 deletions(-)
 create mode 100644 scripts/algorithms/TestBuiltinStepGLM.dml

diff --git a/scripts/algorithms/TestBuiltinStepGLM.dml b/scripts/algorithms/TestBuiltinStepGLM.dml
new file mode 100644
index 00000000000..ab64960b35a
--- /dev/null
+++ b/scripts/algorithms/TestBuiltinStepGLM.dml
@@ -0,0 +1,21 @@
+source("scripts/builtin/stepGLM.dml") as stepGLM;
+
+N = 1000;
+P = 10;
+X = rand(rows=N, cols=P, min=-1.0, max=1.0, pdf="uniform", seed=123);
+
+beta_true = matrix(0, rows=P, cols=1);
+beta_true[2,1] = 3.5;
+beta_true[5,1] = -2.0;
+beta_true[8,1] = 1.5;
+
+Z = X %*% beta_true;
+P_y = 1.0 / (1.0 + exp(-Z));
+Y = (rand(rows=N, cols=1, min=0.0, max=1.0, seed=456) < P_y) * 1.0;
+
+[AIC, B, S, O] = stepGLM::stepGLM(X=X, Y=Y, link=2, yneg=0.0, icpt=0, tol=1e-6, disp=0.0, moi=200, mii=0, thr=0.01);
+
+print("Optimal AIC: " + AIC);
+print("Selected Feature Indices:\n" + toString(S));
+print("Estimated Coefficients:\n" + toString(B));
+
diff --git a/scripts/builtin/stepGLM.dml b/scripts/builtin/stepGLM.dml
index cd2d881ecba..c8be7e582b1 100644
--- a/scripts/builtin/stepGLM.dml
+++ b/scripts/builtin/stepGLM.dml
@@ -87,12 +87,12 @@ stepGLM = function (
                     Double disp = 0.0,
                     Int moi = 200,
                     Int mii = 0,
-                    Double thr = 0.01,
+                    Double thr = 0.01
 ) return (
           Double AIC,
           Matrix[Double] B, 
           Matrix[Double] S, 
-          Matrix[Double] O,
+          String O
         )
     {
         intercept_status = icpt;
@@ -134,18 +134,18 @@ stepGLM = function (
 
         if (intercept_status == 0) {
             # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
-            [AIC_best, _beta, _S, _O] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered);
+            [AIC_best, ignore_B1, ignore_S1, ignore_O1] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered);
         } else {
             # compute AIC of an empty model with only intercept (all Ys are constant)
             all_ones = matrix (1, rows = num_records, cols = 1);
-            [AIC_best, _beta, _S, _O] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered);
+            [AIC_best, ignore_beta2, ignore_S2, ignore_O2] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered);
         }
         #print ("Best AIC without any features: " + AIC_best);
 
         # First pass to examine single features
         AICs = matrix (AIC_best, rows = 1, cols = num_features);
         parfor (i in 1:num_features) { 	
-            [AIC_1, _beta, _S, _O] = glm_fit (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered);
+            [AIC_1, ignore_beta3, ignore_S3, ignore_O3] = glm_fit (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered);
             AICs[1,i] = AIC_1;
         }
 
@@ -163,11 +163,11 @@ stepGLM = function (
             #print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
             if (intercept_status == 0) {
                 # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
-                [AIC_best, _beta, _S, _O] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered);
+                [AIC_best, ignore_beta4, ignore_S4, ignore_O4] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered);
             } else {
                 # compute AIC of an empty model with only intercept (all Ys are constant)
                 ###all_ones = matrix (1, rows = num_records, cols = 1);
-                [AIC_best, _beta, _S, _O] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered);
+                [AIC_best, ignore_beta5, ignore_S5, ignore_O5] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered);
             }
         };
 
@@ -182,9 +182,9 @@ stepGLM = function (
                 if (as.scalar(columns_fixed[1,i]) == 0) {	
 
                     # Construct the feature matrix
-                    X = cbind (X_global, X_orig[,i]);
+                    X_loop = cbind (X_global, X_orig[,i]);
 
-                    [AIC_2, _beta, _S, _O] = glm_fit (X, Y, intercept_status, num_features, columns_fixed_ordered);
+                    [AIC_2, ignore_beta6, ignore_S6, ignore_O6] = glm_fit (X_loop, Y, intercept_status, num_features, columns_fixed_ordered);
                     AICs[1,i] = AIC_2;
                 }		
             }
@@ -216,8 +216,7 @@ stepGLM = function (
 
         # run GLM with selected set of features
         print ("Running GLM with selected features...");
-        [AIC, beta, S, O] = glm_fit (X_global, Y, intercept_status, num_features, columns_fixed_ordered);
-        return (AIC, beta, S, O)
+        [AIC, B, S, O] = glm_fit (X_global, Y, intercept_status, num_features, columns_fixed_ordered);
     }
 
 
@@ -233,13 +232,13 @@ glm_fit = function (
                     Double disp = 0.0,
                     Double tol = 0.000001,
                     Int moi = 200,
-                    Int mii = 0,
+                    Int mii = 0
 ) 
     return (
             Double AIC, 
             Matrix[Double] beta,
             Matrix[Double] S,
-            String O,
+            String O
         ) 
     {
 
@@ -563,189 +562,186 @@ glm_fit = function (
         str = append (str, "DEVIANCE_SCALED," + deviance);
         O = str
 
+        do_return = 0;
         # Prepare the output matrix
-        print ("Writing the output matrix...");
         if (intercept_status == 0 & num_features == 1) { 
             if (p == num_records) {
                 beta_out_tmp = matrix (0, rows = num_features_orig + 1, cols = 1); 
                 beta_out_tmp[num_features_orig + 1,] = beta_out;
-                beta_out = beta_out_tmp;
-                return AIC, beta_out, S, O;
+                beta = beta_out_tmp;
+                do_return = 1;
             } else if (sum (X) == 0){
-                beta_out = matrix (0, rows = num_features_orig, cols = 1);
-                return AIC, beta_out, S, O;
+                beta = matrix (0, rows = num_features_orig, cols = 1);
+                do_return = 1;
             }
         }
+        if (do_return != 0) {
+          no_selected = ncol (Selected);
+          max_selected = max (Selected);
+          last = max_selected + 1;	
 
-        no_selected = ncol (Selected);
-        max_selected = max (Selected);
-        last = max_selected + 1;	
-
-        if (intercept_status != 0) {
+          if (intercept_status != 0) {
 
             Selected_ext = cbind (Selected, as.matrix (last));			
             P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); 
 
             if (intercept_status == 2) {
 
-                P1_ssX_beta = P1 * ssX_beta;
-                P2_ssX_beta = colSums (P1_ssX_beta);
-                P1_beta = P1 * beta;
-                P2_beta = colSums (P1_beta);
+              P1_ssX_beta = P1 * ssX_beta;
+              P2_ssX_beta = colSums (P1_ssX_beta);
+              P1_beta = P1 * beta;
+              P2_beta = colSums (P1_beta);
 
-                if (max_selected < num_features_orig) {
+              if (max_selected < num_features_orig) {
 
-                    P2_ssX_beta = cbind (P2_ssX_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
-                    P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
+                P2_ssX_beta = cbind (P2_ssX_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
+                P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
 
-                    P2_ssX_beta[1, num_features_orig+1] = P2_ssX_beta[1, max_selected + 1]; 
-                    P2_ssX_beta[1, max_selected + 1] = 0;
+                P2_ssX_beta[1, num_features_orig+1] = P2_ssX_beta[1, max_selected + 1]; 
+                P2_ssX_beta[1, max_selected + 1] = 0;
 
-                    P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1]; 
-                    P2_beta[1, max_selected + 1] = 0;
+                P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1]; 
+                P2_beta[1, max_selected + 1] = 0;
 
-                }
-                beta_out = cbind (t(P2_ssX_beta), t(P2_beta));
+              }
+              beta = cbind (t(P2_ssX_beta), t(P2_beta));
 
             } else {
 
-                P1_beta = P1 * beta;
-                P2_beta = colSums (P1_beta);
-
-                if (max_selected < num_features_orig) {
-                    P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
-                    P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1] ; 
-                    P2_beta[1, max_selected + 1] = 0;
-                }
-                beta_out = t(P2_beta);
+              P1_beta = P1 * beta;
+              P2_beta = colSums (P1_beta);
 
+              if (max_selected < num_features_orig) {
+                P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
+                P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1] ; 
+                P2_beta[1, max_selected + 1] = 0;
+              }
+              beta = t(P2_beta);
             }
-        } else {
-
+          } else {
             P1 = table (seq (1, no_selected), t(Selected)); 
             P1_beta = P1 * beta;
             P2_beta = colSums (P1_beta);	
 
             if (max_selected < num_features_orig) {
-                P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
+              P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
             }		
 
-            beta_out = t(P2_beta);	
+            beta = t(P2_beta);	
+          }
         }
-
-        return AIC, beta_out, S, O;
     }
 
 glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG)
-    return (Matrix[double] beta, double saturated_log_l, int isNaN)
-    {
-        saturated_log_l = 0.0;
-        isNaN = 0;
-        y_corr = Y [, 1];
-        if (dist_type == 2) {
-            n_corr = rowSums (Y);
-            is_n_zero = (n_corr == 0);
-            y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero;    
-        }
-        linear_terms = y_corr;
-        if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
-            if          (link_power ==  0) {
+  return (Matrix[double] beta, double saturated_log_l, int isNaN)
+  {
+    saturated_log_l = 0.0;
+    isNaN = 0;
+    y_corr = Y [, 1];
+    if (dist_type == 2) {
+      n_corr = rowSums (Y);
+      is_n_zero = (n_corr == 0);
+      y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero;    
+    }
+    linear_terms = y_corr;
+    if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
+      if          (link_power ==  0) {
+        if (sum (y_corr < 0) == 0) {
+          is_zero_y_corr = (y_corr == 0);
+          linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+        } else { isNaN = 1; }
+        } else { if (link_power ==  1.0) {
+          linear_terms = y_corr;
+          } else { if (link_power == -1.0) {
+            linear_terms = 1.0 / y_corr;
+            } else { if (link_power ==  0.5) {
+              if (sum (y_corr < 0) == 0) {
+                linear_terms = sqrt (y_corr);
+              } else { isNaN = 1; }
+              } else { if (link_power >   0) {
                 if (sum (y_corr < 0) == 0) {
-                    is_zero_y_corr = (y_corr == 0);
-                    linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+                  is_zero_y_corr = (y_corr == 0);
+                  linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
                 } else { isNaN = 1; }
-                } else { if (link_power ==  1.0) {
-                    linear_terms = y_corr;
-                    } else { if (link_power == -1.0) {
-                        linear_terms = 1.0 / y_corr;
-                        } else { if (link_power ==  0.5) {
-                            if (sum (y_corr < 0) == 0) {
-                                linear_terms = sqrt (y_corr);
-                            } else { isNaN = 1; }
-                            } else { if (link_power >   0) {
-                                if (sum (y_corr < 0) == 0) {
-                                    is_zero_y_corr = (y_corr == 0);
-                                    linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
-                                } else { isNaN = 1; }
-                            } else {
-                                if (sum (y_corr <= 0) == 0) {
-                                    linear_terms = y_corr ^ link_power;
-                                } else { isNaN = 1; }
-            }}}}}
-        }
-        if (dist_type == 2 & link_type >= 1 & link_type <= 5)
-        { # BINOMIAL/BERNOULLI DISTRIBUTION
-            if          (link_type == 1 & link_power == 0)  { # Binomial.log
-                if (sum (y_corr < 0) == 0) {
-                    is_zero_y_corr = (y_corr == 0);
-                    linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+              } else {
+                if (sum (y_corr <= 0) == 0) {
+                  linear_terms = y_corr ^ link_power;
                 } else { isNaN = 1; }
-                } else { if (link_type == 1 & link_power >  0)  { # Binomial.power_nonlog pos
-                    if (sum (y_corr < 0) == 0) {
-                        is_zero_y_corr = (y_corr == 0);
-                        linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
-                    } else { isNaN = 1; }
-                    } else { if (link_type == 1)                      { # Binomial.power_nonlog neg
-                        if (sum (y_corr <= 0) == 0) {
-                            linear_terms = y_corr ^ link_power;
-                        } else { isNaN = 1; }
-                    } else { 
-                        is_zero_y_corr = (y_corr <= 0);
-                        is_one_y_corr  = (y_corr >= 1.0);
-                        y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr);
-                        if (link_type == 2)                           { # Binomial.logit
-                            linear_terms = log (y_corr / (1.0 - y_corr)) 
-                            + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-                            } else { if (link_type == 3)                  { # Binomial.probit
-                                y_below_half = y_corr + (1.0 - 2.0 * y_corr) * (y_corr > 0.5);
-                                t = sqrt (- 2.0 * log (y_below_half));
-                                approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308)));
-                                linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * (y_corr > 0.5))
-                                + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-                                } else { if (link_type == 4)                  { # Binomial.cloglog
-                                    linear_terms = log (- log (1.0 - y_corr))
-                                    - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr)
-                                    + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-                                    } else { if (link_type == 5)                  { # Binomial.cauchit
-                                        linear_terms = tan ((y_corr - 0.5) * pi)
-                                        + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-            }}  }}}}}
-        }
+      }}}}}
+    }
+    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
+    { # BINOMIAL/BERNOULLI DISTRIBUTION
+      if          (link_type == 1 & link_power == 0)  { # Binomial.log
+        if (sum (y_corr < 0) == 0) {
+          is_zero_y_corr = (y_corr == 0);
+          linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+        } else { isNaN = 1; }
+        } else { if (link_type == 1 & link_power >  0)  { # Binomial.power_nonlog pos
+          if (sum (y_corr < 0) == 0) {
+            is_zero_y_corr = (y_corr == 0);
+            linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
+          } else { isNaN = 1; }
+          } else { if (link_type == 1)                      { # Binomial.power_nonlog neg
+            if (sum (y_corr <= 0) == 0) {
+              linear_terms = y_corr ^ link_power;
+            } else { isNaN = 1; }
+          } else { 
+            is_zero_y_corr = (y_corr <= 0);
+            is_one_y_corr  = (y_corr >= 1.0);
+            y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr);
+            if (link_type == 2)                           { # Binomial.logit
+              linear_terms = log (y_corr / (1.0 - y_corr)) 
+              + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+              } else { if (link_type == 3)                  { # Binomial.probit
+                y_below_half = y_corr + (1.0 - 2.0 * y_corr) * (y_corr > 0.5);
+                t = sqrt (- 2.0 * log (y_below_half));
+                approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308)));
+                linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * (y_corr > 0.5))
+                + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+                } else { if (link_type == 4)                  { # Binomial.cloglog
+                  linear_terms = log (- log (1.0 - y_corr))
+                  - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr)
+                  + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+                  } else { if (link_type == 5)                  { # Binomial.cauchit
+                    linear_terms = tan ((y_corr - 0.5) * pi)
+                    + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+      }}  }}}}}
+    }
 
-        if (isNaN == 0) {
-            [saturated_log_l, isNaN] = 
-                glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power);
-        }
+    if (isNaN == 0) {
+      [saturated_log_l, isNaN] = 
+        glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power);
+    }
 
-        if ((dist_type == 1 & link_type == 1 & link_power == 0) |
-            (dist_type == 2 & link_type >= 2))
-        {    
-            desired_eta = 0.0;
-            } else { if (link_type == 1 & link_power == 0) {
-                desired_eta = log (0.5);
-                } else { if (link_type == 1) {
-                    desired_eta = 0.5 ^ link_power;
-                } else {
-                    desired_eta = 0.5;
-        }}}
+    if ((dist_type == 1 & link_type == 1 & link_power == 0) |
+        (dist_type == 2 & link_type >= 2))
+    {    
+      desired_eta = 0.0;
+      } else { if (link_type == 1 & link_power == 0) {
+        desired_eta = log (0.5);
+        } else { if (link_type == 1) {
+          desired_eta = 0.5 ^ link_power;
+        } else {
+          desired_eta = 0.5;
+    }}}
 
-        beta = matrix (0.0, rows = ncol(X), cols = 1);
+    beta = matrix (0.0, rows = ncol(X), cols = 1);
 
-        if (desired_eta != 0) {
-            if (icept_status == 1 | icept_status == 2) {
-                beta [nrow(beta), 1] = desired_eta;
-            } else {
-                # We want: avg (X %*% ssX_transform %*% beta) = desired_eta
-                # Note that "ssX_transform" is trivial here, hence ignored
+    if (desired_eta != 0) {
+      if (icept_status == 1 | icept_status == 2) {
+        beta [nrow(beta), 1] = desired_eta;
+      } else {
+        # We want: avg (X %*% ssX_transform %*% beta) = desired_eta
+        # Note that "ssX_transform" is trivial here, hence ignored
 
-                beta = straightenX (X, 0.000001, max_iter_CG);  
-                beta = beta * desired_eta;
-    }   }   }
+        beta = straightenX (X, 0.000001, max_iter_CG);  
+        beta = beta * desired_eta;
+  }   }   }
 
 
 glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y,
                      int dist_type, double var_power, int link_type, double link_power)
-    return (Matrix[double] g_Y, Matrix[double] w)
+  return (Matrix[double] g_Y, Matrix[double] w)
 # ORIGINALLY we returned more meaningful vectors, namely:
 # Matrix[double] y_residual    : y - y_mean, i.e. y observed - y predicted
 # Matrix[double] link_gradient : derivative of the link function
@@ -754,312 +750,312 @@ glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y,
 # and skip over the "meaningful intermediaries".  Now we output these two variables:
 #     g_Y = y_residual / (var_function * link_gradient);
 #     w   = 1.0 / (var_function * link_gradient ^ 2);
-    {
-        num_records = nrow (linear_terms);
-        zeros_r = matrix (0.0, rows = num_records, cols = 1);
-        ones_r = 1 + zeros_r;
-        g_Y  = zeros_r;
-        w  = zeros_r;
-
-        # Some constants
-
-        one_over_sqrt_two_pi = 0.39894228040143267793994605993438;
-        ones_2 = matrix (1.0, rows = 1, cols = 2);
-        p_one_m_one = ones_2;
-        p_one_m_one [1, 2] = -1.0;
-        m_one_p_one = ones_2;
-        m_one_p_one [1, 1] = -1.0;
-        zero_one = ones_2;
-        zero_one [1, 1] = 0.0;
-        one_zero = ones_2;
-        one_zero [1, 2] = 0.0;
-        flip_pos = matrix (0, rows = 2, cols = 2);
-        flip_neg = flip_pos;
-        flip_pos [1, 2] = 1;
-        flip_pos [2, 1] = 1;
-        flip_neg [1, 2] = -1;
-        flip_neg [2, 1] = 1;
-
-        if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
-            y_mean = zeros_r;
-            if          (link_power ==  0) {
-                y_mean = exp (linear_terms);
-                y_mean_pow = y_mean ^ (1 - var_power);
-                w   = y_mean_pow * y_mean;
-                g_Y = y_mean_pow * (Y - y_mean);
-                } else { if (link_power ==  1.0) {
-                    y_mean = linear_terms;
-                    w   = y_mean ^ (- var_power);
-                    g_Y = w * (Y - y_mean);
-                } else {
-                    y_mean = linear_terms ^ (1.0 / link_power);
-                    c1  = (1 - var_power) / link_power - 1;
-                    c2  = (2 - var_power) / link_power - 2;
-                    g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power;
-                    w   = (linear_terms ^ c2) / (link_power ^ 2);
-        }   }}
-        if (dist_type == 2 & link_type >= 1 & link_type <= 5)
-        { # BINOMIAL/BERNOULLI DISTRIBUTION
-            if (link_type == 1) { # BINOMIAL.POWER LINKS
-                if (link_power == 0)  { # Binomial.log
-                    vec1 = 1 / (exp (- linear_terms) - 1);
-                    g_Y = Y [, 1] - Y [, 2] * vec1;
-                    w   = rowSums (Y) * vec1;
-                } else {                  # Binomial.nonlog
-                    vec1 = zeros_r;
-                    if (link_power == 0.5)  {
-                        vec1 = 1 / (1 - linear_terms ^ 2);
-                        } else { if (sum (linear_terms < 0) == 0) {
-                            vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power));
-                    } else {isNaN = 1;}}
-                    # We want a "zero-protected" version of
-                    #     vec2 = Y [, 1] / linear_terms;
-                    is_y_0 = (Y [, 1] == 0);
-                    vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0;
-                    g_Y =  (vec2 - Y [, 2] * vec1 * linear_terms) / link_power;
-                    w   =  rowSums (Y) * vec1 / link_power ^ 2;
-                }
-            } else {
-                is_LT_pos_infinite = (linear_terms == Inf);
-                is_LT_neg_infinite = (linear_terms == -Inf);
-                is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
-                finite_linear_terms = replace (target =        linear_terms, pattern =  Inf, replacement = 0);
-                finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0);
-                if (link_type == 2)                           { # Binomial.logit
-                    Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
-                    Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
-                    Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
-                    g_Y = rowSums (Y * (Y_prob %*% flip_neg));           ### = y_residual;
-                    w   = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob);  ### = y_variance;
-                    } else { if (link_type == 3)                  { # Binomial.probit
-                        is_lt_pos = (linear_terms >= 0);
-                        t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
-                        pt_gp = t_gp * ( 0.254829592 
-                                        + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
-                                                  + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
-                                                            + t_gp * (-1.453152027 
-                                                                      + t_gp *   1.061405429))));
-                        the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0);
-                        vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp);
-                        vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5);
-                        w   = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1;
-                        g_Y = one_over_sqrt_two_pi * vec2 / vec1;
-                        } else { if (link_type == 4)                  { # Binomial.cloglog
-                            the_exp = exp (linear_terms)
-                            the_exp_exp = exp (- the_exp);
-                            is_too_small = ((10000000 + the_exp) == 10000000);
-                            the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2);
-                            g_Y =  (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio;
-                            w   =  the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio;
-                            } else { if (link_type == 5)                  { # Binomial.cauchit
-                                Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi;
-                                Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
-                                y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1];
-                                var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2];
-                                link_gradient_normalized = (1 + linear_terms ^ 2) * pi;
-                                g_Y =  rowSums (Y) * y_residual / (var_function * link_gradient_normalized);
-                                w   = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2);
-                }}}}   
-            }
+  {
+    num_records = nrow (linear_terms);
+    zeros_r = matrix (0.0, rows = num_records, cols = 1);
+    ones_r = 1 + zeros_r;
+    g_Y  = zeros_r;
+    w  = zeros_r;
+
+    # Some constants
+
+    one_over_sqrt_two_pi = 0.39894228040143267793994605993438;
+    ones_2 = matrix (1.0, rows = 1, cols = 2);
+    p_one_m_one = ones_2;
+    p_one_m_one [1, 2] = -1.0;
+    m_one_p_one = ones_2;
+    m_one_p_one [1, 1] = -1.0;
+    zero_one = ones_2;
+    zero_one [1, 1] = 0.0;
+    one_zero = ones_2;
+    one_zero [1, 2] = 0.0;
+    flip_pos = matrix (0, rows = 2, cols = 2);
+    flip_neg = flip_pos;
+    flip_pos [1, 2] = 1;
+    flip_pos [2, 1] = 1;
+    flip_neg [1, 2] = -1;
+    flip_neg [2, 1] = 1;
+
+    if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
+      y_mean = zeros_r;
+      if          (link_power ==  0) {
+        y_mean = exp (linear_terms);
+        y_mean_pow = y_mean ^ (1 - var_power);
+        w   = y_mean_pow * y_mean;
+        g_Y = y_mean_pow * (Y - y_mean);
+        } else { if (link_power ==  1.0) {
+          y_mean = linear_terms;
+          w   = y_mean ^ (- var_power);
+          g_Y = w * (Y - y_mean);
+        } else {
+          y_mean = linear_terms ^ (1.0 / link_power);
+          c1  = (1 - var_power) / link_power - 1;
+          c2  = (2 - var_power) / link_power - 2;
+          g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power;
+          w   = (linear_terms ^ c2) / (link_power ^ 2);
+    }   }}
+    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
+    { # BINOMIAL/BERNOULLI DISTRIBUTION
+      if (link_type == 1) { # BINOMIAL.POWER LINKS
+        if (link_power == 0)  { # Binomial.log
+          vec1 = 1 / (exp (- linear_terms) - 1);
+          g_Y = Y [, 1] - Y [, 2] * vec1;
+          w   = rowSums (Y) * vec1;
+        } else {                  # Binomial.nonlog
+          vec1 = zeros_r;
+          if (link_power == 0.5)  {
+            vec1 = 1 / (1 - linear_terms ^ 2);
+            } else { if (sum (linear_terms < 0) == 0) {
+              vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power));
+          } else {isNaN = 1;}}
+          # We want a "zero-protected" version of
+          #     vec2 = Y [, 1] / linear_terms;
+          is_y_0 = (Y [, 1] == 0);
+          vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0;
+          g_Y =  (vec2 - Y [, 2] * vec1 * linear_terms) / link_power;
+          w   =  rowSums (Y) * vec1 / link_power ^ 2;
         }
+      } else {
+        is_LT_pos_infinite = (linear_terms == Inf);
+        is_LT_neg_infinite = (linear_terms == -Inf);
+        is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
+        finite_linear_terms = replace (target =        linear_terms, pattern =  Inf, replacement = 0);
+        finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0);
+        if (link_type == 2)                           { # Binomial.logit
+          Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
+          Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
+          Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
+          g_Y = rowSums (Y * (Y_prob %*% flip_neg));           ### = y_residual;
+          w   = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob);  ### = y_variance;
+          } else { if (link_type == 3)                  { # Binomial.probit
+            is_lt_pos = (linear_terms >= 0);
+            t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
+            pt_gp = t_gp * ( 0.254829592 
+                            + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
+                                      + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
+                                                + t_gp * (-1.453152027 
+                                                          + t_gp *   1.061405429))));
+            the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0);
+            vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp);
+            vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5);
+            w   = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1;
+            g_Y = one_over_sqrt_two_pi * vec2 / vec1;
+            } else { if (link_type == 4)                  { # Binomial.cloglog
+              the_exp = exp (linear_terms)
+              the_exp_exp = exp (- the_exp);
+              is_too_small = ((10000000 + the_exp) == 10000000);
+              the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2);
+              g_Y =  (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio;
+              w   =  the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio;
+              } else { if (link_type == 5)                  { # Binomial.cauchit
+                Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi;
+                Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
+                y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1];
+                var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2];
+                link_gradient_normalized = (1 + linear_terms ^ 2) * pi;
+                g_Y =  rowSums (Y) * y_residual / (var_function * link_gradient_normalized);
+                w   = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2);
+        }}}}   
+      }
     }
+  }
 
 
 glm_log_likelihood_part = function (Matrix[double] linear_terms, Matrix[double] Y,
                                     int dist_type, double var_power, int link_type, double link_power)
-    return (double log_l, int isNaN)
-    {
-        isNaN = 0;
-        log_l = 0.0;
-        num_records = nrow (Y);
-        zeros_r = matrix (0.0, rows = num_records, cols = 1);
-
-        if (dist_type == 1 & link_type == 1)
-        { # POWER DISTRIBUTION
-            b_cumulant = zeros_r;
-            natural_parameters = zeros_r;
-            is_natural_parameter_log_zero = zeros_r;
-            if          (var_power == 1.0 & link_power == 0)  { # Poisson.log
-                b_cumulant = exp (linear_terms);
-                is_natural_parameter_log_zero = (linear_terms == -Inf);
-                natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0);
-                } else { if (var_power == 1.0 & link_power == 1.0)  { # Poisson.id
-                    if (sum (linear_terms < 0) == 0)  {
-                        b_cumulant = linear_terms;
-                        is_natural_parameter_log_zero = (linear_terms == 0);
-                        natural_parameters = log (linear_terms + is_natural_parameter_log_zero);
+  return (double log_l, int isNaN)
+  {
+    isNaN = 0;
+    log_l = 0.0;
+    num_records = nrow (Y);
+    zeros_r = matrix (0.0, rows = num_records, cols = 1);
+
+    if (dist_type == 1 & link_type == 1)
+    { # POWER DISTRIBUTION
+      b_cumulant = zeros_r;
+      natural_parameters = zeros_r;
+      is_natural_parameter_log_zero = zeros_r;
+      if          (var_power == 1.0 & link_power == 0)  { # Poisson.log
+        b_cumulant = exp (linear_terms);
+        is_natural_parameter_log_zero = (linear_terms == -Inf);
+        natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0);
+        } else { if (var_power == 1.0 & link_power == 1.0)  { # Poisson.id
+          if (sum (linear_terms < 0) == 0)  {
+            b_cumulant = linear_terms;
+            is_natural_parameter_log_zero = (linear_terms == 0);
+            natural_parameters = log (linear_terms + is_natural_parameter_log_zero);
+          } else {isNaN = 1;}
+          } else { if (var_power == 1.0 & link_power == 0.5)  { # Poisson.sqrt
+            if (sum (linear_terms < 0) == 0)  {
+              b_cumulant = linear_terms ^ 2;
+              is_natural_parameter_log_zero = (linear_terms == 0);
+              natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero);
+            } else {isNaN = 1;}
+            } else { if (var_power == 1.0 & link_power  > 0)  { # Poisson.power_nonlog, pos
+              if (sum (linear_terms < 0) == 0)  {
+                is_natural_parameter_log_zero = (linear_terms == 0);
+                b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
+                natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
+              } else {isNaN = 1;}
+              } else { if (var_power == 1.0)                      { # Poisson.power_nonlog, neg
+                if (sum (linear_terms <= 0) == 0) {
+                  b_cumulant = linear_terms ^ (1.0 / link_power);
+                  natural_parameters = log (linear_terms) / link_power;
+                } else {isNaN = 1;}
+                } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse
+                  if (sum (linear_terms <= 0) == 0) {
+                    b_cumulant = - log (linear_terms);
+                    natural_parameters = - linear_terms;
+                  } else {isNaN = 1;}
+                  } else { if (var_power == 2.0 & link_power ==  1.0) { # Gamma.id
+                    if (sum (linear_terms <= 0) == 0) {
+                      b_cumulant = log (linear_terms);
+                      natural_parameters = - 1.0 / linear_terms;
                     } else {isNaN = 1;}
-                    } else { if (var_power == 1.0 & link_power == 0.5)  { # Poisson.sqrt
-                        if (sum (linear_terms < 0) == 0)  {
-                            b_cumulant = linear_terms ^ 2;
-                            is_natural_parameter_log_zero = (linear_terms == 0);
-                            natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero);
+                    } else { if (var_power == 2.0 & link_power ==  0) { # Gamma.log
+                      b_cumulant = linear_terms;
+                      natural_parameters = - exp (- linear_terms);
+                      } else { if (var_power == 2.0)                      { # Gamma.power_nonlog
+                        if (sum (linear_terms <= 0) == 0) {
+                          b_cumulant = log (linear_terms) / link_power;
+                          natural_parameters = - linear_terms ^ (- 1.0 / link_power);
                         } else {isNaN = 1;}
-                        } else { if (var_power == 1.0 & link_power  > 0)  { # Poisson.power_nonlog, pos
-                            if (sum (linear_terms < 0) == 0)  {
-                                is_natural_parameter_log_zero = (linear_terms == 0);
-                                b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
-                                natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
-                            } else {isNaN = 1;}
-                            } else { if (var_power == 1.0)                      { # Poisson.power_nonlog, neg
-                                if (sum (linear_terms <= 0) == 0) {
-                                    b_cumulant = linear_terms ^ (1.0 / link_power);
-                                    natural_parameters = log (linear_terms) / link_power;
-                                } else {isNaN = 1;}
-                                } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse
-                                    if (sum (linear_terms <= 0) == 0) {
-                                        b_cumulant = - log (linear_terms);
-                                        natural_parameters = - linear_terms;
-                                    } else {isNaN = 1;}
-                                    } else { if (var_power == 2.0 & link_power ==  1.0) { # Gamma.id
-                                        if (sum (linear_terms <= 0) == 0) {
-                                            b_cumulant = log (linear_terms);
-                                            natural_parameters = - 1.0 / linear_terms;
-                                        } else {isNaN = 1;}
-                                        } else { if (var_power == 2.0 & link_power ==  0) { # Gamma.log
-                                            b_cumulant = linear_terms;
-                                            natural_parameters = - exp (- linear_terms);
-                                            } else { if (var_power == 2.0)                      { # Gamma.power_nonlog
-                                                if (sum (linear_terms <= 0) == 0) {
-                                                    b_cumulant = log (linear_terms) / link_power;
-                                                    natural_parameters = - linear_terms ^ (- 1.0 / link_power);
-                                                } else {isNaN = 1;}
-                                                } else { if                    (link_power ==  0) { # PowerDist.log
-                                                    natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
-                                                    b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
-                                                } else {                                              # PowerDist.power_nonlog
-                                                    if          (-2 * link_power == 1.0 - var_power) {
-                                                        natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power);
-                                                        } else { if (-1 * link_power == 1.0 - var_power) {
-                                                            natural_parameters = 1.0 / linear_terms / (1.0 - var_power);
-                                                            } else { if (     link_power == 1.0 - var_power) {
-                                                                natural_parameters = linear_terms / (1.0 - var_power);
-                                                                } else { if ( 2 * link_power == 1.0 - var_power) {
-                                                                    natural_parameters = linear_terms ^ 2 / (1.0 - var_power);
-                                                                } else {
-                                                                    if (sum (linear_terms <= 0) == 0) {
-                                                                        power = (1.0 - var_power) / link_power;
-                                                                        natural_parameters = (linear_terms ^ power) / (1.0 - var_power);
-                                                                    } else {isNaN = 1;}
-                                                    }}}}
-                                                    if          (-2 * link_power == 2.0 - var_power) {
-                                                        b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power);
-                                                        } else { if (-1 * link_power == 2.0 - var_power) {
-                                                            b_cumulant = 1.0 / linear_terms / (2.0 - var_power);
-                                                            } else { if (     link_power == 2.0 - var_power) {
-                                                                b_cumulant = linear_terms / (2.0 - var_power);
-                                                                } else { if ( 2 * link_power == 2.0 - var_power) {
-                                                                    b_cumulant = linear_terms ^ 2 / (2.0 - var_power);
-                                                                } else {
-                                                                    if (sum (linear_terms <= 0) == 0) {
-                                                                        power = (2.0 - var_power) / link_power;
-                                                                        b_cumulant = (linear_terms ^ power) / (2.0 - var_power);
-                                                                    } else {isNaN = 1;}
-                                                    }}}}
-            }}}}} }}}}}
-            if (sum (is_natural_parameter_log_zero * abs (Y)) > 0) {
-                log_l = -Inf;
-                isNaN = 1;
-            }
-            if (isNaN == 0)
-            {
-                log_l = sum (Y * natural_parameters - b_cumulant);
-                if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
-                    log_l = -Inf;
-                    isNaN = 1;
-        }   }   }
-
-        if (dist_type == 2 & link_type >= 1 & link_type <= 5)
-        { # BINOMIAL/BERNOULLI DISTRIBUTION
-
-            [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power);
-
-            if (isNaN == 0) {            
-                does_prob_contradict = (Y_prob <= 0);
-                if (sum (does_prob_contradict * abs (Y)) == 0) {
-                    log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict));
-                    if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
-                        isNaN = 1;
-                    }
-                } else {
-                    log_l = -Inf;
-                    isNaN = 1;
-        }   }   }
+                        } else { if                    (link_power ==  0) { # PowerDist.log
+                          natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
+                          b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
+                        } else {                                              # PowerDist.power_nonlog
+                          if          (-2 * link_power == 1.0 - var_power) {
+                            natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power);
+                            } else { if (-1 * link_power == 1.0 - var_power) {
+                              natural_parameters = 1.0 / linear_terms / (1.0 - var_power);
+                              } else { if (     link_power == 1.0 - var_power) {
+                                natural_parameters = linear_terms / (1.0 - var_power);
+                                } else { if ( 2 * link_power == 1.0 - var_power) {
+                                  natural_parameters = linear_terms ^ 2 / (1.0 - var_power);
+                                } else {
+                                  if (sum (linear_terms <= 0) == 0) {
+                                    power = (1.0 - var_power) / link_power;
+                                    natural_parameters = (linear_terms ^ power) / (1.0 - var_power);
+                                  } else {isNaN = 1;}
+                          }}}}
+                          if          (-2 * link_power == 2.0 - var_power) {
+                            b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power);
+                            } else { if (-1 * link_power == 2.0 - var_power) {
+                              b_cumulant = 1.0 / linear_terms / (2.0 - var_power);
+                              } else { if (     link_power == 2.0 - var_power) {
+                                b_cumulant = linear_terms / (2.0 - var_power);
+                                } else { if ( 2 * link_power == 2.0 - var_power) {
+                                  b_cumulant = linear_terms ^ 2 / (2.0 - var_power);
+                                } else {
+                                  if (sum (linear_terms <= 0) == 0) {
+                                    power = (2.0 - var_power) / link_power;
+                                    b_cumulant = (linear_terms ^ power) / (2.0 - var_power);
+                                  } else {isNaN = 1;}
+                          }}}}
+      }}}}} }}}}}
+      if (sum (is_natural_parameter_log_zero * abs (Y)) > 0) {
+        log_l = -Inf;
+        isNaN = 1;
+      }
+      if (isNaN == 0)
+      {
+        log_l = sum (Y * natural_parameters - b_cumulant);
+        if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
+          log_l = -Inf;
+          isNaN = 1;
+    }   }   }
 
-        if (isNaN == 1) {
-            log_l = - Inf; 
-        }
+    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
+    { # BINOMIAL/BERNOULLI DISTRIBUTION
+
+      [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power);
+
+      if (isNaN == 0) {            
+        does_prob_contradict = (Y_prob <= 0);
+        if (sum (does_prob_contradict * abs (Y)) == 0) {
+          log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict));
+          if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
+            isNaN = 1;
+          }
+        } else {
+          log_l = -Inf;
+          isNaN = 1;
+    }   }   }
+
+    if (isNaN == 1) {
+      log_l = - Inf; 
     }
+  }
 
 
 
 binomial_probability_two_column =
-    function (Matrix[double] linear_terms, int link_type, double link_power)
-        return   (Matrix[double] Y_prob, int isNaN)
-        {
-            isNaN = 0;
-            num_records = nrow (linear_terms);
-
-            # Define some auxiliary matrices
-
-            ones_2 = matrix (1.0, rows = 1, cols = 2);
-            p_one_m_one = ones_2;
-            p_one_m_one [1, 2] = -1.0;
-            m_one_p_one = ones_2;
-            m_one_p_one [1, 1] = -1.0;
-            zero_one = ones_2;
-            zero_one [1, 1] = 0.0;
-            one_zero = ones_2;
-            one_zero [1, 2] = 0.0;
-
-            zeros_r = matrix (0.0, rows = num_records, cols = 1);
-            ones_r = 1.0 + zeros_r;
-
-            # Begin the function body
-
-            Y_prob = zeros_r %*% ones_2;
-            if (link_type == 1) { # Binomial.power
-                if          (link_power == 0) { # Binomial.log
-                    Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one;    
-                    } else { if (link_power == 0.5) { # Binomial.sqrt
-                        Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one;    
-                    } else {                          # Binomial.power_nonlog
-                        if (sum (linear_terms < 0) == 0) {
-                            Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one;    
-                        } else {isNaN = 1;}
-                }}
-            } else {              # Binomial.non_power
-                is_LT_pos_infinite = (linear_terms == Inf);
-                is_LT_neg_infinite = (linear_terms == -Inf);
-                is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
-                finite_linear_terms = replace (target =        linear_terms, pattern =  Inf, replacement = 0);
-                finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0);
-                if (link_type == 2)             { # Binomial.logit
-                    Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
-                    Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
-                    } else { if (link_type == 3)    { # Binomial.probit
-                        lt_pos_neg = (finite_linear_terms >= 0) %*% p_one_m_one + ones_r %*% zero_one;
-                        t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
-                        pt_gp = t_gp * ( 0.254829592 
-                                        + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
-                                                  + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
-                                                            + t_gp * (-1.453152027 
-                                                                      + t_gp *   1.061405429))));
-                        the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0);
-                        Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg);
-                        } else { if (link_type == 4)    { # Binomial.cloglog
-                            the_exp = exp (finite_linear_terms);
-                            the_exp_exp = exp (- the_exp);
-                            is_too_small = ((10000000 + the_exp) == 10000000);
-                            Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2);
-                            Y_prob [, 2] = the_exp_exp;
-                            } else { if (link_type == 5)    { # Binomial.cauchit
-                                Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi;
-                            } else {
-                                isNaN = 1;
-                }}}}
-                Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
-        }   }            
+  function (Matrix[double] linear_terms, int link_type, double link_power)
+    return   (Matrix[double] Y_prob, int isNaN)
+    {
+      isNaN = 0;
+      num_records = nrow (linear_terms);
+
+      # Define some auxiliary matrices
+
+      ones_2 = matrix (1.0, rows = 1, cols = 2);
+      p_one_m_one = ones_2;
+      p_one_m_one [1, 2] = -1.0;
+      m_one_p_one = ones_2;
+      m_one_p_one [1, 1] = -1.0;
+      zero_one = ones_2;
+      zero_one [1, 1] = 0.0;
+      one_zero = ones_2;
+      one_zero [1, 2] = 0.0;
+
+      zeros_r = matrix (0.0, rows = num_records, cols = 1);
+      ones_r = 1.0 + zeros_r;
+
+      # Begin the function body
+
+      Y_prob = zeros_r %*% ones_2;
+      if (link_type == 1) { # Binomial.power
+        if          (link_power == 0) { # Binomial.log
+          Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one;    
+          } else { if (link_power == 0.5) { # Binomial.sqrt
+            Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one;    
+          } else {                          # Binomial.power_nonlog
+            if (sum (linear_terms < 0) == 0) {
+              Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one;    
+            } else {isNaN = 1;}
+        }}
+      } else {              # Binomial.non_power
+        is_LT_pos_infinite = (linear_terms == Inf);
+        is_LT_neg_infinite = (linear_terms == -Inf);
+        is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
+        finite_linear_terms = replace (target =        linear_terms, pattern =  Inf, replacement = 0);
+        finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0);
+        if (link_type == 2)             { # Binomial.logit
+          Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
+          Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
+          } else { if (link_type == 3)    { # Binomial.probit
+            lt_pos_neg = (finite_linear_terms >= 0) %*% p_one_m_one + ones_r %*% zero_one;
+            t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
+            pt_gp = t_gp * ( 0.254829592 
+                            + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
+                                      + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
+                                                + t_gp * (-1.453152027 
+                                                          + t_gp *   1.061405429))));
+            the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0);
+            Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg);
+            } else { if (link_type == 4)    { # Binomial.cloglog
+              the_exp = exp (finite_linear_terms);
+              the_exp_exp = exp (- the_exp);
+              is_too_small = ((10000000 + the_exp) == 10000000);
+              Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2);
+              Y_prob [, 2] = the_exp_exp;
+              } else { if (link_type == 5)    { # Binomial.cauchit
+                Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi;
+              } else {
+                isNaN = 1;
+        }}}}
+        Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
+    }   }            
 
 
 # THE CG-STEIHAUG PROCEDURE SCRIPT
@@ -1072,130 +1068,130 @@ binomial_probability_two_column =
 # is given separately because sparse "X" may become dense after applying the transform.
 #
 get_CG_Steihaug_point =
-    function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w,
-              Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG)
-        return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary)
-        {
-            trust_delta_sq = trust_delta ^ 2;
-            size_CG = nrow (g);
-            z = matrix (0.0, rows = size_CG, cols = 1);
-            neg_log_l_change = 0.0;
-            reached_trust_boundary = 0;
-            g_reg = g + lambda * beta;
-            r_CG = g_reg;
-            p_CG = -r_CG;
+  function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w,
+            Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG)
+    return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary)
+    {
+      trust_delta_sq = trust_delta ^ 2;
+      size_CG = nrow (g);
+      z = matrix (0.0, rows = size_CG, cols = 1);
+      neg_log_l_change = 0.0;
+      reached_trust_boundary = 0;
+      g_reg = g + lambda * beta;
+      r_CG = g_reg;
+      p_CG = -r_CG;
+      rr_CG = sum(r_CG * r_CG);
+      eps_CG = rr_CG * min (0.25, sqrt (rr_CG));
+      converged_CG = 0;
+      if (rr_CG < eps_CG) {
+        converged_CG = 1;
+      }
+
+      max_iteration_CG = max_iter_CG;
+      if (max_iteration_CG <= 0) {
+        max_iteration_CG = size_CG;
+      }
+      i_CG = 0;
+      while (converged_CG == 0)
+      {
+        i_CG = i_CG + 1;
+        ssX_p_CG = diag (scale_X) %*% p_CG;
+        ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG;
+        temp_CG = t(X) %*% (w * (X %*% ssX_p_CG));
+        q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ];
+        pq_CG = sum (p_CG * q_CG);
+        if (pq_CG <= 0) {
+          pp_CG = sum (p_CG * p_CG);  
+          if (pp_CG > 0) {
+            [z, neg_log_l_change] = 
+              get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
+            reached_trust_boundary = 1;
+          } else {
+            neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
+          }
+          converged_CG = 1;
+        }
+        if (converged_CG == 0) {
+          alpha_CG = rr_CG / pq_CG;
+          new_z = z + alpha_CG * p_CG;
+          if (sum(new_z * new_z) >= trust_delta_sq) {
+            pp_CG = sum (p_CG * p_CG);  
+            [z, neg_log_l_change] = 
+              get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
+            reached_trust_boundary = 1;
+            converged_CG = 1;
+          }
+          if (converged_CG == 0) {
+            z = new_z;
+            old_rr_CG = rr_CG;
+            r_CG = r_CG + alpha_CG * q_CG;
             rr_CG = sum(r_CG * r_CG);
-            eps_CG = rr_CG * min (0.25, sqrt (rr_CG));
-            converged_CG = 0;
-            if (rr_CG < eps_CG) {
-                converged_CG = 1;
-            }
-
-            max_iteration_CG = max_iter_CG;
-            if (max_iteration_CG <= 0) {
-                max_iteration_CG = size_CG;
+            if (i_CG == max_iteration_CG | rr_CG < eps_CG) {
+              neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
+              reached_trust_boundary = 0;
+              converged_CG = 1;
             }
-            i_CG = 0;
-            while (converged_CG == 0)
-            {
-                i_CG = i_CG + 1;
-                ssX_p_CG = diag (scale_X) %*% p_CG;
-                ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG;
-                temp_CG = t(X) %*% (w * (X %*% ssX_p_CG));
-                q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ];
-                pq_CG = sum (p_CG * q_CG);
-                if (pq_CG <= 0) {
-                    pp_CG = sum (p_CG * p_CG);  
-                    if (pp_CG > 0) {
-                        [z, neg_log_l_change] = 
-                            get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
-                        reached_trust_boundary = 1;
-                    } else {
-                        neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
-                    }
-                    converged_CG = 1;
-                }
-                if (converged_CG == 0) {
-                    alpha_CG = rr_CG / pq_CG;
-                    new_z = z + alpha_CG * p_CG;
-                    if (sum(new_z * new_z) >= trust_delta_sq) {
-                        pp_CG = sum (p_CG * p_CG);  
-                        [z, neg_log_l_change] = 
-                            get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
-                        reached_trust_boundary = 1;
-                        converged_CG = 1;
-                    }
-                    if (converged_CG == 0) {
-                        z = new_z;
-                        old_rr_CG = rr_CG;
-                        r_CG = r_CG + alpha_CG * q_CG;
-                        rr_CG = sum(r_CG * r_CG);
-                        if (i_CG == max_iteration_CG | rr_CG < eps_CG) {
-                            neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
-                            reached_trust_boundary = 0;
-                            converged_CG = 1;
-                        }
-                        if (converged_CG == 0) {
-                            p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG;
-        }   }   }   }   }
+            if (converged_CG == 0) {
+              p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG;
+    }   }   }   }   }
 
 
 # An auxiliary function used twice inside the CG-STEIHAUG loop:
 get_trust_boundary_point = 
-    function (Matrix[double] g, Matrix[double] z, Matrix[double] p, 
-              Matrix[double] q, Matrix[double] r, double pp, double pq, 
-              double trust_delta_sq)
-        return (Matrix[double] new_z, double f_change)
-        {
-            zz = sum (z * z);  pz = sum (p * z);
-            sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq));
-            tau_1 = (- pz + sq_root_d) / pp;
-            tau_2 = (- pz - sq_root_d) / pp;
-            zq = sum (z * q);  gp = sum (g * p);
-            f_extra = 0.5 * sum (z * (r + g));
-            f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1;
-            f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2;
-            ind1 = as.integer(f_change_1 < f_change_2);
-            ind2 = as.integer(f_change_1 >= f_change_2);
-            new_z = z + ((ind1 * tau_1 + ind2 * tau_2) * p);
-            f_change = ind1 * f_change_1 + ind2 * f_change_2;
-        }
+  function (Matrix[double] g, Matrix[double] z, Matrix[double] p, 
+            Matrix[double] q, Matrix[double] r, double pp, double pq, 
+            double trust_delta_sq)
+    return (Matrix[double] new_z, double f_change)
+    {
+      zz = sum (z * z);  pz = sum (p * z);
+      sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq));
+      tau_1 = (- pz + sq_root_d) / pp;
+      tau_2 = (- pz - sq_root_d) / pp;
+      zq = sum (z * q);  gp = sum (g * p);
+      f_extra = 0.5 * sum (z * (r + g));
+      f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1;
+      f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2;
+      ind1 = as.integer(f_change_1 < f_change_2);
+      ind2 = as.integer(f_change_1 >= f_change_2);
+      new_z = z + ((ind1 * tau_1 + ind2 * tau_2) * p);
+      f_change = ind1 * f_change_1 + ind2 * f_change_2;
+    }
 
 
 # Computes vector w such that  ||X %*% w - 1|| -> MIN  given  avg(X %*% w) = 1
 # We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
 # it to compute  w = c * z_LS  such that  sum(X %*% w) = nrow(X).
 straightenX =
-    function (Matrix[double] X, double eps, int max_iter_CG)
-        return   (Matrix[double] w)
-        {
-            w_X = t(colSums(X));
-            lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
-            eps_LS = eps * nrow(X);
-
-            # BEGIN LEAST SQUARES
-
-            r_LS = - w_X;
-            z_LS = matrix (0.0, rows = ncol(X), cols = 1);
-            p_LS = - r_LS;
-            norm_r2_LS = sum (r_LS ^ 2);
-            i_LS = 0;
-            while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS)
-            {
-                q_LS = t(X) %*% X %*% p_LS;
-                q_LS = q_LS + lambda_LS * p_LS;
-                alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
-                z_LS = z_LS + alpha_LS * p_LS;
-                old_norm_r2_LS = norm_r2_LS;
-                r_LS = r_LS + alpha_LS * q_LS;
-                norm_r2_LS = sum (r_LS ^ 2);
-                p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
-                i_LS = i_LS + 1;
-            }
-
-            # END LEAST SQUARES
-
-            w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
-        }
+  function (Matrix[double] X, double eps, int max_iter_CG)
+    return   (Matrix[double] w)
+    {
+      w_X = t(colSums(X));
+      lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
+      eps_LS = eps * nrow(X);
+
+      # BEGIN LEAST SQUARES
+
+      r_LS = - w_X;
+      z_LS = matrix (0.0, rows = ncol(X), cols = 1);
+      p_LS = - r_LS;
+      norm_r2_LS = sum (r_LS ^ 2);
+      i_LS = 0;
+      while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS)
+      {
+        q_LS = t(X) %*% X %*% p_LS;
+        q_LS = q_LS + lambda_LS * p_LS;
+        alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
+        z_LS = z_LS + alpha_LS * p_LS;
+        old_norm_r2_LS = norm_r2_LS;
+        r_LS = r_LS + alpha_LS * q_LS;
+        norm_r2_LS = sum (r_LS ^ 2);
+        p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
+        i_LS = i_LS + 1;
+      }
+
+      # END LEAST SQUARES
+
+      w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
+    }
 
 

From e60c2dd1ab3c1b52f947463cd6a2515db21de8c1 Mon Sep 17 00:00:00 2001
From: bruno <bruno.ignaz.rothschild@gmail.com>
Date: Fri, 26 Jun 2026 05:48:34 +0200
Subject: [PATCH 05/12] fix glm_fit call syntax

---
 scripts/builtin/stepGLM.dml | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/scripts/builtin/stepGLM.dml b/scripts/builtin/stepGLM.dml
index c8be7e582b1..0431b27ddde 100644
--- a/scripts/builtin/stepGLM.dml
+++ b/scripts/builtin/stepGLM.dml
@@ -134,18 +134,18 @@ stepGLM = function (
 
         if (intercept_status == 0) {
             # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
-            [AIC_best, ignore_B1, ignore_S1, ignore_O1] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered);
+            [AIC_best, ignore_B1, ignore_S1, ignore_O1] = glm_fit (X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
         } else {
             # compute AIC of an empty model with only intercept (all Ys are constant)
             all_ones = matrix (1, rows = num_records, cols = 1);
-            [AIC_best, ignore_beta2, ignore_S2, ignore_O2] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered);
+            [AIC_best, ignore_beta2, ignore_S2, ignore_O2] = glm_fit (X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
         }
         #print ("Best AIC without any features: " + AIC_best);
 
         # First pass to examine single features
         AICs = matrix (AIC_best, rows = 1, cols = num_features);
         parfor (i in 1:num_features) { 	
-            [AIC_1, ignore_beta3, ignore_S3, ignore_O3] = glm_fit (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered);
+            [AIC_1, ignore_beta3, ignore_S3, ignore_O3] = glm_fit (X=X_orig[,i], Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
             AICs[1,i] = AIC_1;
         }
 
@@ -163,11 +163,11 @@ stepGLM = function (
             #print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
             if (intercept_status == 0) {
                 # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
-                [AIC_best, ignore_beta4, ignore_S4, ignore_O4] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered);
+                [AIC_best, ignore_beta4, ignore_S4, ignore_O4] = glm_fit (X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
             } else {
                 # compute AIC of an empty model with only intercept (all Ys are constant)
                 ###all_ones = matrix (1, rows = num_records, cols = 1);
-                [AIC_best, ignore_beta5, ignore_S5, ignore_O5] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered);
+                [AIC_best, ignore_beta5, ignore_S5, ignore_O5] = glm_fit (X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
             }
         };
 
@@ -184,7 +184,7 @@ stepGLM = function (
                     # Construct the feature matrix
                     X_loop = cbind (X_global, X_orig[,i]);
 
-                    [AIC_2, ignore_beta6, ignore_S6, ignore_O6] = glm_fit (X_loop, Y, intercept_status, num_features, columns_fixed_ordered);
+                    [AIC_2, ignore_beta6, ignore_S6, ignore_O6] = glm_fit (X=X_loop, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
                     AICs[1,i] = AIC_2;
                 }		
             }
@@ -216,7 +216,7 @@ stepGLM = function (
 
         # run GLM with selected set of features
         print ("Running GLM with selected features...");
-        [AIC, B, S, O] = glm_fit (X_global, Y, intercept_status, num_features, columns_fixed_ordered);
+        [AIC, B, S, O] = glm_fit (X=X_global, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
     }
 
 
@@ -834,7 +834,7 @@ glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y,
                             + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
                                       + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
                                                 + t_gp * (-1.453152027 
-                                                          + t_gp *   1.061405429))));
+                                                          + t_gp * 1.061405429))));
             the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0);
             vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp);
             vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5);
@@ -1040,7 +1040,7 @@ binomial_probability_two_column =
                             + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
                                       + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
                                                 + t_gp * (-1.453152027 
-                                                          + t_gp *   1.061405429))));
+                                                          + t_gp * 1.061405429))));
             the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0);
             Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg);
             } else { if (link_type == 4)    { # Binomial.cloglog
@@ -1193,5 +1193,3 @@ straightenX =
 
       w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
     }
-
-

From 9de039d912e842bdeaec7b8a2231550893e6258a Mon Sep 17 00:00:00 2001
From: bruno <bruno.ignaz.rothschild@gmail.com>
Date: Fri, 26 Jun 2026 06:20:55 +0200
Subject: [PATCH 06/12] remove fit_glm

---
 scripts/builtin/stepGLM.dml | 973 ------------------------------------
 1 file changed, 973 deletions(-)

diff --git a/scripts/builtin/stepGLM.dml b/scripts/builtin/stepGLM.dml
index 0431b27ddde..3ec1cffcb7f 100644
--- a/scripts/builtin/stepGLM.dml
+++ b/scripts/builtin/stepGLM.dml
@@ -220,976 +220,3 @@ stepGLM = function (
     }
 
 
-################### UDFS USED IN THIS SCRIPT ##################
-
-glm_fit = function (
-                    Matrix[Double] X, 
-                    Matrix[Double] Y, 
-                    Int intercept_status, 
-                    Double num_features_orig, 
-                    Matrix[Double] Selected, 
-                    Int link = 2,
-                    Double disp = 0.0,
-                    Double tol = 0.000001,
-                    Int moi = 200,
-                    Int mii = 0
-) 
-    return (
-            Double AIC, 
-            Matrix[Double] beta,
-            Matrix[Double] S,
-            String O
-        ) 
-    {
-
-        # distribution family code: 1 = Power, 2 = Bernoulli/Binomial; currently only Bernouli distribution family is supported!		
-        distribution_type = 2;                		# $dfam = 2;
-        variance_as_power_of_the_mean = 0.0;  		# $vpow = 0.0;
-        # link function code: 0 = canonical (depends on distribution), 1 = Power, 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit;
-        # currently only log (link = 1), logit (link = 2), probit (link = 3), and cloglog (link = 4) are supported!
-        link_type = link;
-        link_as_power_of_the_mean = 0.0;      		# $lpow = 0.0;
-
-        dispersion = disp;
-        eps = tol;
-        max_iteration_IRLS = moi;
-        max_iteration_CG = mii;
-
-        variance_as_power_of_the_mean = as.double (variance_as_power_of_the_mean);
-        link_as_power_of_the_mean = as.double (link_as_power_of_the_mean);
-
-        dispersion = as.double (dispersion);
-        eps = as.double (eps);              
-
-        # Default values for output statistics:
-        regularization = 0.0;
-        termination_code     = 0.0;
-        min_beta             = NaN;
-        i_min_beta           = NaN;
-        max_beta             = NaN;
-        i_max_beta           = NaN;
-        intercept_value      = NaN;
-        dispersion           = NaN;
-        estimated_dispersion = NaN;
-        deviance_nodisp      = NaN;
-        deviance             = NaN;
-
-        #####   INITIALIZE THE PARAMETERS   #####
-
-        num_records  = nrow (X);
-        num_features = ncol (X);
-        zeros_r = matrix (0, rows = num_records, cols = 1);
-        ones_r = 1 + zeros_r;
-
-        # Introduce the intercept, shift and rescale the columns of X if needed
-
-        if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
-            X = cbind (X, ones_r);
-            num_features = ncol (X);
-        }
-
-        scale_lambda = matrix (1, rows = num_features, cols = 1);
-        if (intercept_status == 1 | intercept_status == 2) {
-            scale_lambda [num_features, 1] = 0;
-        }
-
-        if (intercept_status == 2) {  # scale-&-shift X columns to mean 0, variance 1
-            # Important assumption: X [, num_features] = ones_r
-            avg_X_cols = t(colSums(X)) / num_records;
-            var_X_cols = (t(colSums (X ^ 2)) - num_records * (avg_X_cols ^ 2)) / (num_records - 1);
-            is_unsafe = (var_X_cols <= 0);
-            scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
-            scale_X [num_features, 1] = 1;
-            shift_X = - avg_X_cols * scale_X;
-            shift_X [num_features, 1] = 0;
-            rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
-        } else {
-            scale_X = matrix (1, rows = num_features, cols = 1);
-            shift_X = matrix (0, rows = num_features, cols = 1);
-            rowSums_X_sq = rowSums (X ^ 2);
-        }	
-
-        # Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
-        # with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
-        # The transform is then associatively applied to the other side of the expression,
-        # and is rewritten via "scale_X" and "shift_X" as follows:
-        #
-        # ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A    --- is rewritten as:
-        # ssX_A  = diag (scale_X) %*% A;
-        # ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A;
-        #
-        # tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A   --- is rewritten as:
-        # tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ];
-
-        # Initialize other input-dependent parameters
-
-        lambda = scale_lambda * regularization;
-        if (max_iteration_CG == 0) {
-            max_iteration_CG = num_features;
-        }
-
-        # Set up the canonical link, if requested [Then we have: Var(mu) * (d link / d mu) = const]
-
-        if (link_type == 0) {
-            if (distribution_type == 1) {
-                link_type = 1;
-                link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean;
-            } else { 
-                if (distribution_type == 2) {
-                    link_type = 2;
-                }   
-            }   
-        }
-
-        # For power distributions and/or links, we use two constants,
-        # "variance as power of the mean" and "link_as_power_of_the_mean",
-        # to specify the variance and the link as arbitrary powers of the
-        # mean.  However, the variance-powers of 1.0 (Poisson family) and
-        # 2.0 (Gamma family) have to be treated as special cases, because
-        # these values integrate into logarithms.  The link-power of 0.0
-        # is also special as it represents the logarithm link.
-
-        num_response_columns = ncol (Y);
-        is_supported = 0;              
-        if (num_response_columns == 2 & distribution_type == 2 & link_type >= 1 & link_type <= 4) { # BERNOULLI DISTRIBUTION
-            is_supported = 1;			  
-        }
-        if (num_response_columns == 1 & distribution_type == 2) {
-            print ("Error: Bernoulli response matrix has not been converted into two-column format.");
-        }
-
-        if (is_supported != 1) {
-            stop ("Response matrix with " + num_response_columns + " columns, distribution family (" + distribution_type + ", " + variance_as_power_of_the_mean
-                  + ") and link family (" + link_type + ", " + link_as_power_of_the_mean + ") are NOT supported together.");
-        }
-
-        #####   INITIALIZE THE BETAS   #####
-
-        [beta, saturated_log_l, isNaN] = 
-            glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG);
-
-        # print(" --- saturated logLik " + saturated_log_l);
-
-        if (isNaN != 0) {
-            stop ("Input matrices X and/or Y are out of range!"); 
-        }
-
-        #####  START OF THE MAIN PART  #####
-
-        sum_X_sq = sum (rowSums_X_sq);
-        trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq));
-        ###  max_trust_delta = trust_delta * 10000.0;
-        log_l = 0.0;
-        deviance_nodisp = 0.0;
-        new_deviance_nodisp = 0.0;
-        isNaN_log_l = 2;
-        newbeta = beta;
-        g = matrix (0.0, rows = num_features, cols = 1);
-        g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
-        accept_new_beta = 1;
-        reached_trust_boundary = 0;
-        neg_log_l_change_predicted = 0.0;
-        i_IRLS = 0;
-
-        # print ("BEGIN IRLS ITERATIONS...");
-
-        ssX_newbeta = diag (scale_X) %*% newbeta;
-        ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
-        all_linear_terms = X %*% ssX_newbeta;
-
-        [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
-        (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
-
-        if (isNaN_new_log_l == 0) {
-            new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
-            new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
-        }
-
-        while (termination_code == 0) {
-            accept_new_beta = 1;
-
-            if (i_IRLS > 0) {
-                if (isNaN_log_l == 0) {
-                    accept_new_beta = 0;
-                }
-
-                # Decide whether to accept a new iteration point and update the trust region
-                # See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. by Nocedal and Wright
-
-                rho = (- new_log_l + log_l) / neg_log_l_change_predicted;
-                if (rho < 0.25 | isNaN_new_log_l == 1) {
-                    trust_delta = 0.25 * trust_delta;
-                }
-                if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) {
-                    trust_delta = 2 * trust_delta;
-
-                    ### if (trust_delta > max_trust_delta) {
-                    ###     trust_delta = max_trust_delta;
-                    ### }
-                }
-                if (rho > 0.1 & isNaN_new_log_l == 0) {
-                    accept_new_beta = 1;
-                }
-            }
-
-            if (accept_new_beta == 1) {
-                beta = newbeta;  log_l = new_log_l;  deviance_nodisp = new_deviance_nodisp;  isNaN_log_l = isNaN_new_log_l;
-
-                [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
-
-                # We introduced these variables to avoid roundoff errors:
-                #     g_Y = y_residual / (y_var * link_grad);
-                #     w   = 1.0 / (y_var * link_grad * link_grad);
-
-                gXY = - t(X) %*% g_Y;
-                g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ];
-                g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
-            }
-
-            [z, neg_log_l_change_predicted, num_CG_iters, reached_trust_boundary] = 
-                get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG);
-
-            newbeta = beta + z;
-
-            ssX_newbeta = diag (scale_X) %*% newbeta;
-            ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
-            all_linear_terms = X %*% ssX_newbeta;
-
-            [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
-            (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
-
-            if (isNaN_new_log_l == 0) {
-                new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
-                new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
-            }
-
-            log_l_change = new_log_l - log_l;               # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps
-
-            if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 & 
-                (2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) ) {
-                termination_code = 1;
-            }
-            rho = - log_l_change / neg_log_l_change_predicted;
-            z_norm = sqrt (sum (z * z));
-
-            i_IRLS = i_IRLS + 1;                        
-
-            if (i_IRLS == max_iteration_IRLS) {
-                termination_code = 2;
-            }
-        }
-
-        beta = newbeta;
-        log_l = new_log_l;
-        deviance_nodisp = new_deviance_nodisp;
-
-        #---------------------------- last part
-
-        if (termination_code != 1) {
-            print ("One of the runs of GLM did not converged in " + i_IRLS + " steps!");
-        }
-
-        ##### COMPUTE AIC ##### 
-
-        if (distribution_type == 2 & link_type >= 1 & link_type <= 4) {			
-            AIC = -2 * log_l;
-            if (sum (X) != 0) {
-                AIC = AIC + 2 * num_features;	
-            }
-        } else {
-            stop ("Currently only the Bernoulli distribution family the following link functions are supported: log, logit, probit, and cloglog!");
-        }
-
-        # Output which features give the best AIC and are being used for linear regression 
-        S = Selected;
-
-        ssX_beta = diag (scale_X) %*% beta;
-        ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta;
-        if (intercept_status == 2) {
-            beta_out = cbind (ssX_beta, beta);
-        } else {
-            beta_out = ssX_beta;
-        }
-
-        if (intercept_status == 0 & num_features == 1) {
-            p = sum (X == 1);
-            if (p == num_records) {
-                beta_out = beta_out[1,];
-            }					
-        } 
-
-
-        if (intercept_status == 1 | intercept_status == 2) {
-            intercept_value = as.scalar (beta_out [num_features, 1]);
-            beta_noicept = beta_out [1 : (num_features - 1), 1];
-        } else {
-            beta_noicept = beta_out [1 : num_features, 1];
-        }
-        min_beta = min (beta_noicept);
-        max_beta = max (beta_noicept);
-        tmp_i_min_beta = rowIndexMin (t(beta_noicept))
-        i_min_beta = as.scalar (tmp_i_min_beta [1, 1]);
-        tmp_i_max_beta = rowIndexMax (t(beta_noicept))
-        i_max_beta = as.scalar (tmp_i_max_beta [1, 1]);
-
-        #####  OVER-DISPERSION PART  #####
-
-        all_linear_terms = X %*% ssX_beta;
-        [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
-
-        pearson_residual_sq = g_Y ^ 2 / w;
-        pearson_residual_sq = replace (target = pearson_residual_sq, pattern = NaN, replacement = 0);
-        # pearson_residual_sq = (y_residual ^ 2) / y_var;
-
-        if (num_records > num_features) {
-            estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features);
-        }
-        if (dispersion <= 0) {
-            dispersion = estimated_dispersion;
-        }
-        deviance = deviance_nodisp / dispersion;
-
-        #####  END OF THE MAIN PART  #####
-
-        str = "BETA_MIN," + min_beta;
-        str = append (str, "BETA_MIN_INDEX," + i_min_beta);
-        str = append (str, "BETA_MAX," + max_beta);
-        str = append (str, "BETA_MAX_INDEX," + i_max_beta);
-        str = append (str, "INTERCEPT," + intercept_value);
-        str = append (str, "DISPERSION," + dispersion);
-        str = append (str, "DISPERSION_EST," + estimated_dispersion);
-        str = append (str, "DEVIANCE_UNSCALED," + deviance_nodisp);
-        str = append (str, "DEVIANCE_SCALED," + deviance);
-        O = str
-
-        do_return = 0;
-        # Prepare the output matrix
-        if (intercept_status == 0 & num_features == 1) { 
-            if (p == num_records) {
-                beta_out_tmp = matrix (0, rows = num_features_orig + 1, cols = 1); 
-                beta_out_tmp[num_features_orig + 1,] = beta_out;
-                beta = beta_out_tmp;
-                do_return = 1;
-            } else if (sum (X) == 0){
-                beta = matrix (0, rows = num_features_orig, cols = 1);
-                do_return = 1;
-            }
-        }
-        if (do_return != 0) {
-          no_selected = ncol (Selected);
-          max_selected = max (Selected);
-          last = max_selected + 1;	
-
-          if (intercept_status != 0) {
-
-            Selected_ext = cbind (Selected, as.matrix (last));			
-            P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); 
-
-            if (intercept_status == 2) {
-
-              P1_ssX_beta = P1 * ssX_beta;
-              P2_ssX_beta = colSums (P1_ssX_beta);
-              P1_beta = P1 * beta;
-              P2_beta = colSums (P1_beta);
-
-              if (max_selected < num_features_orig) {
-
-                P2_ssX_beta = cbind (P2_ssX_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
-                P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
-
-                P2_ssX_beta[1, num_features_orig+1] = P2_ssX_beta[1, max_selected + 1]; 
-                P2_ssX_beta[1, max_selected + 1] = 0;
-
-                P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1]; 
-                P2_beta[1, max_selected + 1] = 0;
-
-              }
-              beta = cbind (t(P2_ssX_beta), t(P2_beta));
-
-            } else {
-
-              P1_beta = P1 * beta;
-              P2_beta = colSums (P1_beta);
-
-              if (max_selected < num_features_orig) {
-                P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
-                P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1] ; 
-                P2_beta[1, max_selected + 1] = 0;
-              }
-              beta = t(P2_beta);
-            }
-          } else {
-            P1 = table (seq (1, no_selected), t(Selected)); 
-            P1_beta = P1 * beta;
-            P2_beta = colSums (P1_beta);	
-
-            if (max_selected < num_features_orig) {
-              P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
-            }		
-
-            beta = t(P2_beta);	
-          }
-        }
-    }
-
-glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG)
-  return (Matrix[double] beta, double saturated_log_l, int isNaN)
-  {
-    saturated_log_l = 0.0;
-    isNaN = 0;
-    y_corr = Y [, 1];
-    if (dist_type == 2) {
-      n_corr = rowSums (Y);
-      is_n_zero = (n_corr == 0);
-      y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero;    
-    }
-    linear_terms = y_corr;
-    if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
-      if          (link_power ==  0) {
-        if (sum (y_corr < 0) == 0) {
-          is_zero_y_corr = (y_corr == 0);
-          linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-        } else { isNaN = 1; }
-        } else { if (link_power ==  1.0) {
-          linear_terms = y_corr;
-          } else { if (link_power == -1.0) {
-            linear_terms = 1.0 / y_corr;
-            } else { if (link_power ==  0.5) {
-              if (sum (y_corr < 0) == 0) {
-                linear_terms = sqrt (y_corr);
-              } else { isNaN = 1; }
-              } else { if (link_power >   0) {
-                if (sum (y_corr < 0) == 0) {
-                  is_zero_y_corr = (y_corr == 0);
-                  linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
-                } else { isNaN = 1; }
-              } else {
-                if (sum (y_corr <= 0) == 0) {
-                  linear_terms = y_corr ^ link_power;
-                } else { isNaN = 1; }
-      }}}}}
-    }
-    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
-    { # BINOMIAL/BERNOULLI DISTRIBUTION
-      if          (link_type == 1 & link_power == 0)  { # Binomial.log
-        if (sum (y_corr < 0) == 0) {
-          is_zero_y_corr = (y_corr == 0);
-          linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-        } else { isNaN = 1; }
-        } else { if (link_type == 1 & link_power >  0)  { # Binomial.power_nonlog pos
-          if (sum (y_corr < 0) == 0) {
-            is_zero_y_corr = (y_corr == 0);
-            linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
-          } else { isNaN = 1; }
-          } else { if (link_type == 1)                      { # Binomial.power_nonlog neg
-            if (sum (y_corr <= 0) == 0) {
-              linear_terms = y_corr ^ link_power;
-            } else { isNaN = 1; }
-          } else { 
-            is_zero_y_corr = (y_corr <= 0);
-            is_one_y_corr  = (y_corr >= 1.0);
-            y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr);
-            if (link_type == 2)                           { # Binomial.logit
-              linear_terms = log (y_corr / (1.0 - y_corr)) 
-              + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-              } else { if (link_type == 3)                  { # Binomial.probit
-                y_below_half = y_corr + (1.0 - 2.0 * y_corr) * (y_corr > 0.5);
-                t = sqrt (- 2.0 * log (y_below_half));
-                approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308)));
-                linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * (y_corr > 0.5))
-                + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-                } else { if (link_type == 4)                  { # Binomial.cloglog
-                  linear_terms = log (- log (1.0 - y_corr))
-                  - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr)
-                  + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-                  } else { if (link_type == 5)                  { # Binomial.cauchit
-                    linear_terms = tan ((y_corr - 0.5) * pi)
-                    + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-      }}  }}}}}
-    }
-
-    if (isNaN == 0) {
-      [saturated_log_l, isNaN] = 
-        glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power);
-    }
-
-    if ((dist_type == 1 & link_type == 1 & link_power == 0) |
-        (dist_type == 2 & link_type >= 2))
-    {    
-      desired_eta = 0.0;
-      } else { if (link_type == 1 & link_power == 0) {
-        desired_eta = log (0.5);
-        } else { if (link_type == 1) {
-          desired_eta = 0.5 ^ link_power;
-        } else {
-          desired_eta = 0.5;
-    }}}
-
-    beta = matrix (0.0, rows = ncol(X), cols = 1);
-
-    if (desired_eta != 0) {
-      if (icept_status == 1 | icept_status == 2) {
-        beta [nrow(beta), 1] = desired_eta;
-      } else {
-        # We want: avg (X %*% ssX_transform %*% beta) = desired_eta
-        # Note that "ssX_transform" is trivial here, hence ignored
-
-        beta = straightenX (X, 0.000001, max_iter_CG);  
-        beta = beta * desired_eta;
-  }   }   }
-
-
-glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y,
-                     int dist_type, double var_power, int link_type, double link_power)
-  return (Matrix[double] g_Y, Matrix[double] w)
-# ORIGINALLY we returned more meaningful vectors, namely:
-# Matrix[double] y_residual    : y - y_mean, i.e. y observed - y predicted
-# Matrix[double] link_gradient : derivative of the link function
-# Matrix[double] var_function  : variance without dispersion, i.e. the V(mu) function
-# BUT, this caused roundoff errors, so we had to compute "directly useful" vectors
-# and skip over the "meaningful intermediaries".  Now we output these two variables:
-#     g_Y = y_residual / (var_function * link_gradient);
-#     w   = 1.0 / (var_function * link_gradient ^ 2);
-  {
-    num_records = nrow (linear_terms);
-    zeros_r = matrix (0.0, rows = num_records, cols = 1);
-    ones_r = 1 + zeros_r;
-    g_Y  = zeros_r;
-    w  = zeros_r;
-
-    # Some constants
-
-    one_over_sqrt_two_pi = 0.39894228040143267793994605993438;
-    ones_2 = matrix (1.0, rows = 1, cols = 2);
-    p_one_m_one = ones_2;
-    p_one_m_one [1, 2] = -1.0;
-    m_one_p_one = ones_2;
-    m_one_p_one [1, 1] = -1.0;
-    zero_one = ones_2;
-    zero_one [1, 1] = 0.0;
-    one_zero = ones_2;
-    one_zero [1, 2] = 0.0;
-    flip_pos = matrix (0, rows = 2, cols = 2);
-    flip_neg = flip_pos;
-    flip_pos [1, 2] = 1;
-    flip_pos [2, 1] = 1;
-    flip_neg [1, 2] = -1;
-    flip_neg [2, 1] = 1;
-
-    if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
-      y_mean = zeros_r;
-      if          (link_power ==  0) {
-        y_mean = exp (linear_terms);
-        y_mean_pow = y_mean ^ (1 - var_power);
-        w   = y_mean_pow * y_mean;
-        g_Y = y_mean_pow * (Y - y_mean);
-        } else { if (link_power ==  1.0) {
-          y_mean = linear_terms;
-          w   = y_mean ^ (- var_power);
-          g_Y = w * (Y - y_mean);
-        } else {
-          y_mean = linear_terms ^ (1.0 / link_power);
-          c1  = (1 - var_power) / link_power - 1;
-          c2  = (2 - var_power) / link_power - 2;
-          g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power;
-          w   = (linear_terms ^ c2) / (link_power ^ 2);
-    }   }}
-    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
-    { # BINOMIAL/BERNOULLI DISTRIBUTION
-      if (link_type == 1) { # BINOMIAL.POWER LINKS
-        if (link_power == 0)  { # Binomial.log
-          vec1 = 1 / (exp (- linear_terms) - 1);
-          g_Y = Y [, 1] - Y [, 2] * vec1;
-          w   = rowSums (Y) * vec1;
-        } else {                  # Binomial.nonlog
-          vec1 = zeros_r;
-          if (link_power == 0.5)  {
-            vec1 = 1 / (1 - linear_terms ^ 2);
-            } else { if (sum (linear_terms < 0) == 0) {
-              vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power));
-          } else {isNaN = 1;}}
-          # We want a "zero-protected" version of
-          #     vec2 = Y [, 1] / linear_terms;
-          is_y_0 = (Y [, 1] == 0);
-          vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0;
-          g_Y =  (vec2 - Y [, 2] * vec1 * linear_terms) / link_power;
-          w   =  rowSums (Y) * vec1 / link_power ^ 2;
-        }
-      } else {
-        is_LT_pos_infinite = (linear_terms == Inf);
-        is_LT_neg_infinite = (linear_terms == -Inf);
-        is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
-        finite_linear_terms = replace (target =        linear_terms, pattern =  Inf, replacement = 0);
-        finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0);
-        if (link_type == 2)                           { # Binomial.logit
-          Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
-          Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
-          Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
-          g_Y = rowSums (Y * (Y_prob %*% flip_neg));           ### = y_residual;
-          w   = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob);  ### = y_variance;
-          } else { if (link_type == 3)                  { # Binomial.probit
-            is_lt_pos = (linear_terms >= 0);
-            t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
-            pt_gp = t_gp * ( 0.254829592 
-                            + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
-                                      + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
-                                                + t_gp * (-1.453152027 
-                                                          + t_gp * 1.061405429))));
-            the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0);
-            vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp);
-            vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5);
-            w   = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1;
-            g_Y = one_over_sqrt_two_pi * vec2 / vec1;
-            } else { if (link_type == 4)                  { # Binomial.cloglog
-              the_exp = exp (linear_terms)
-              the_exp_exp = exp (- the_exp);
-              is_too_small = ((10000000 + the_exp) == 10000000);
-              the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2);
-              g_Y =  (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio;
-              w   =  the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio;
-              } else { if (link_type == 5)                  { # Binomial.cauchit
-                Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi;
-                Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
-                y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1];
-                var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2];
-                link_gradient_normalized = (1 + linear_terms ^ 2) * pi;
-                g_Y =  rowSums (Y) * y_residual / (var_function * link_gradient_normalized);
-                w   = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2);
-        }}}}   
-      }
-    }
-  }
-
-
-glm_log_likelihood_part = function (Matrix[double] linear_terms, Matrix[double] Y,
-                                    int dist_type, double var_power, int link_type, double link_power)
-  return (double log_l, int isNaN)
-  {
-    isNaN = 0;
-    log_l = 0.0;
-    num_records = nrow (Y);
-    zeros_r = matrix (0.0, rows = num_records, cols = 1);
-
-    if (dist_type == 1 & link_type == 1)
-    { # POWER DISTRIBUTION
-      b_cumulant = zeros_r;
-      natural_parameters = zeros_r;
-      is_natural_parameter_log_zero = zeros_r;
-      if          (var_power == 1.0 & link_power == 0)  { # Poisson.log
-        b_cumulant = exp (linear_terms);
-        is_natural_parameter_log_zero = (linear_terms == -Inf);
-        natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0);
-        } else { if (var_power == 1.0 & link_power == 1.0)  { # Poisson.id
-          if (sum (linear_terms < 0) == 0)  {
-            b_cumulant = linear_terms;
-            is_natural_parameter_log_zero = (linear_terms == 0);
-            natural_parameters = log (linear_terms + is_natural_parameter_log_zero);
-          } else {isNaN = 1;}
-          } else { if (var_power == 1.0 & link_power == 0.5)  { # Poisson.sqrt
-            if (sum (linear_terms < 0) == 0)  {
-              b_cumulant = linear_terms ^ 2;
-              is_natural_parameter_log_zero = (linear_terms == 0);
-              natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero);
-            } else {isNaN = 1;}
-            } else { if (var_power == 1.0 & link_power  > 0)  { # Poisson.power_nonlog, pos
-              if (sum (linear_terms < 0) == 0)  {
-                is_natural_parameter_log_zero = (linear_terms == 0);
-                b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
-                natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
-              } else {isNaN = 1;}
-              } else { if (var_power == 1.0)                      { # Poisson.power_nonlog, neg
-                if (sum (linear_terms <= 0) == 0) {
-                  b_cumulant = linear_terms ^ (1.0 / link_power);
-                  natural_parameters = log (linear_terms) / link_power;
-                } else {isNaN = 1;}
-                } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse
-                  if (sum (linear_terms <= 0) == 0) {
-                    b_cumulant = - log (linear_terms);
-                    natural_parameters = - linear_terms;
-                  } else {isNaN = 1;}
-                  } else { if (var_power == 2.0 & link_power ==  1.0) { # Gamma.id
-                    if (sum (linear_terms <= 0) == 0) {
-                      b_cumulant = log (linear_terms);
-                      natural_parameters = - 1.0 / linear_terms;
-                    } else {isNaN = 1;}
-                    } else { if (var_power == 2.0 & link_power ==  0) { # Gamma.log
-                      b_cumulant = linear_terms;
-                      natural_parameters = - exp (- linear_terms);
-                      } else { if (var_power == 2.0)                      { # Gamma.power_nonlog
-                        if (sum (linear_terms <= 0) == 0) {
-                          b_cumulant = log (linear_terms) / link_power;
-                          natural_parameters = - linear_terms ^ (- 1.0 / link_power);
-                        } else {isNaN = 1;}
-                        } else { if                    (link_power ==  0) { # PowerDist.log
-                          natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
-                          b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
-                        } else {                                              # PowerDist.power_nonlog
-                          if          (-2 * link_power == 1.0 - var_power) {
-                            natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power);
-                            } else { if (-1 * link_power == 1.0 - var_power) {
-                              natural_parameters = 1.0 / linear_terms / (1.0 - var_power);
-                              } else { if (     link_power == 1.0 - var_power) {
-                                natural_parameters = linear_terms / (1.0 - var_power);
-                                } else { if ( 2 * link_power == 1.0 - var_power) {
-                                  natural_parameters = linear_terms ^ 2 / (1.0 - var_power);
-                                } else {
-                                  if (sum (linear_terms <= 0) == 0) {
-                                    power = (1.0 - var_power) / link_power;
-                                    natural_parameters = (linear_terms ^ power) / (1.0 - var_power);
-                                  } else {isNaN = 1;}
-                          }}}}
-                          if          (-2 * link_power == 2.0 - var_power) {
-                            b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power);
-                            } else { if (-1 * link_power == 2.0 - var_power) {
-                              b_cumulant = 1.0 / linear_terms / (2.0 - var_power);
-                              } else { if (     link_power == 2.0 - var_power) {
-                                b_cumulant = linear_terms / (2.0 - var_power);
-                                } else { if ( 2 * link_power == 2.0 - var_power) {
-                                  b_cumulant = linear_terms ^ 2 / (2.0 - var_power);
-                                } else {
-                                  if (sum (linear_terms <= 0) == 0) {
-                                    power = (2.0 - var_power) / link_power;
-                                    b_cumulant = (linear_terms ^ power) / (2.0 - var_power);
-                                  } else {isNaN = 1;}
-                          }}}}
-      }}}}} }}}}}
-      if (sum (is_natural_parameter_log_zero * abs (Y)) > 0) {
-        log_l = -Inf;
-        isNaN = 1;
-      }
-      if (isNaN == 0)
-      {
-        log_l = sum (Y * natural_parameters - b_cumulant);
-        if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
-          log_l = -Inf;
-          isNaN = 1;
-    }   }   }
-
-    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
-    { # BINOMIAL/BERNOULLI DISTRIBUTION
-
-      [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power);
-
-      if (isNaN == 0) {            
-        does_prob_contradict = (Y_prob <= 0);
-        if (sum (does_prob_contradict * abs (Y)) == 0) {
-          log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict));
-          if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
-            isNaN = 1;
-          }
-        } else {
-          log_l = -Inf;
-          isNaN = 1;
-    }   }   }
-
-    if (isNaN == 1) {
-      log_l = - Inf; 
-    }
-  }
-
-
-
-binomial_probability_two_column =
-  function (Matrix[double] linear_terms, int link_type, double link_power)
-    return   (Matrix[double] Y_prob, int isNaN)
-    {
-      isNaN = 0;
-      num_records = nrow (linear_terms);
-
-      # Define some auxiliary matrices
-
-      ones_2 = matrix (1.0, rows = 1, cols = 2);
-      p_one_m_one = ones_2;
-      p_one_m_one [1, 2] = -1.0;
-      m_one_p_one = ones_2;
-      m_one_p_one [1, 1] = -1.0;
-      zero_one = ones_2;
-      zero_one [1, 1] = 0.0;
-      one_zero = ones_2;
-      one_zero [1, 2] = 0.0;
-
-      zeros_r = matrix (0.0, rows = num_records, cols = 1);
-      ones_r = 1.0 + zeros_r;
-
-      # Begin the function body
-
-      Y_prob = zeros_r %*% ones_2;
-      if (link_type == 1) { # Binomial.power
-        if          (link_power == 0) { # Binomial.log
-          Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one;    
-          } else { if (link_power == 0.5) { # Binomial.sqrt
-            Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one;    
-          } else {                          # Binomial.power_nonlog
-            if (sum (linear_terms < 0) == 0) {
-              Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one;    
-            } else {isNaN = 1;}
-        }}
-      } else {              # Binomial.non_power
-        is_LT_pos_infinite = (linear_terms == Inf);
-        is_LT_neg_infinite = (linear_terms == -Inf);
-        is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
-        finite_linear_terms = replace (target =        linear_terms, pattern =  Inf, replacement = 0);
-        finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0);
-        if (link_type == 2)             { # Binomial.logit
-          Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
-          Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
-          } else { if (link_type == 3)    { # Binomial.probit
-            lt_pos_neg = (finite_linear_terms >= 0) %*% p_one_m_one + ones_r %*% zero_one;
-            t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
-            pt_gp = t_gp * ( 0.254829592 
-                            + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
-                                      + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
-                                                + t_gp * (-1.453152027 
-                                                          + t_gp * 1.061405429))));
-            the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0);
-            Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg);
-            } else { if (link_type == 4)    { # Binomial.cloglog
-              the_exp = exp (finite_linear_terms);
-              the_exp_exp = exp (- the_exp);
-              is_too_small = ((10000000 + the_exp) == 10000000);
-              Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2);
-              Y_prob [, 2] = the_exp_exp;
-              } else { if (link_type == 5)    { # Binomial.cauchit
-                Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi;
-              } else {
-                isNaN = 1;
-        }}}}
-        Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
-    }   }            
-
-
-# THE CG-STEIHAUG PROCEDURE SCRIPT
-
-# Apply Conjugate Gradient - Steihaug algorithm in order to approximately minimize
-# 0.5 z^T (X^T diag(w) X + diag (lambda)) z + (g + lambda * beta)^T z
-# under constraint:  ||z|| <= trust_delta.
-# See Alg. 7.2 on p. 171 of "Numerical Optimization" 2nd ed. by Nocedal and Wright
-# IN THE ABOVE, "X" IS UNDERSTOOD TO BE "X %*% (SHIFT/SCALE TRANSFORM)"; this transform
-# is given separately because sparse "X" may become dense after applying the transform.
-#
-get_CG_Steihaug_point =
-  function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w,
-            Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG)
-    return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary)
-    {
-      trust_delta_sq = trust_delta ^ 2;
-      size_CG = nrow (g);
-      z = matrix (0.0, rows = size_CG, cols = 1);
-      neg_log_l_change = 0.0;
-      reached_trust_boundary = 0;
-      g_reg = g + lambda * beta;
-      r_CG = g_reg;
-      p_CG = -r_CG;
-      rr_CG = sum(r_CG * r_CG);
-      eps_CG = rr_CG * min (0.25, sqrt (rr_CG));
-      converged_CG = 0;
-      if (rr_CG < eps_CG) {
-        converged_CG = 1;
-      }
-
-      max_iteration_CG = max_iter_CG;
-      if (max_iteration_CG <= 0) {
-        max_iteration_CG = size_CG;
-      }
-      i_CG = 0;
-      while (converged_CG == 0)
-      {
-        i_CG = i_CG + 1;
-        ssX_p_CG = diag (scale_X) %*% p_CG;
-        ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG;
-        temp_CG = t(X) %*% (w * (X %*% ssX_p_CG));
-        q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ];
-        pq_CG = sum (p_CG * q_CG);
-        if (pq_CG <= 0) {
-          pp_CG = sum (p_CG * p_CG);  
-          if (pp_CG > 0) {
-            [z, neg_log_l_change] = 
-              get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
-            reached_trust_boundary = 1;
-          } else {
-            neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
-          }
-          converged_CG = 1;
-        }
-        if (converged_CG == 0) {
-          alpha_CG = rr_CG / pq_CG;
-          new_z = z + alpha_CG * p_CG;
-          if (sum(new_z * new_z) >= trust_delta_sq) {
-            pp_CG = sum (p_CG * p_CG);  
-            [z, neg_log_l_change] = 
-              get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
-            reached_trust_boundary = 1;
-            converged_CG = 1;
-          }
-          if (converged_CG == 0) {
-            z = new_z;
-            old_rr_CG = rr_CG;
-            r_CG = r_CG + alpha_CG * q_CG;
-            rr_CG = sum(r_CG * r_CG);
-            if (i_CG == max_iteration_CG | rr_CG < eps_CG) {
-              neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
-              reached_trust_boundary = 0;
-              converged_CG = 1;
-            }
-            if (converged_CG == 0) {
-              p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG;
-    }   }   }   }   }
-
-
-# An auxiliary function used twice inside the CG-STEIHAUG loop:
-get_trust_boundary_point = 
-  function (Matrix[double] g, Matrix[double] z, Matrix[double] p, 
-            Matrix[double] q, Matrix[double] r, double pp, double pq, 
-            double trust_delta_sq)
-    return (Matrix[double] new_z, double f_change)
-    {
-      zz = sum (z * z);  pz = sum (p * z);
-      sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq));
-      tau_1 = (- pz + sq_root_d) / pp;
-      tau_2 = (- pz - sq_root_d) / pp;
-      zq = sum (z * q);  gp = sum (g * p);
-      f_extra = 0.5 * sum (z * (r + g));
-      f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1;
-      f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2;
-      ind1 = as.integer(f_change_1 < f_change_2);
-      ind2 = as.integer(f_change_1 >= f_change_2);
-      new_z = z + ((ind1 * tau_1 + ind2 * tau_2) * p);
-      f_change = ind1 * f_change_1 + ind2 * f_change_2;
-    }
-
-
-# Computes vector w such that  ||X %*% w - 1|| -> MIN  given  avg(X %*% w) = 1
-# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
-# it to compute  w = c * z_LS  such that  sum(X %*% w) = nrow(X).
-straightenX =
-  function (Matrix[double] X, double eps, int max_iter_CG)
-    return   (Matrix[double] w)
-    {
-      w_X = t(colSums(X));
-      lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
-      eps_LS = eps * nrow(X);
-
-      # BEGIN LEAST SQUARES
-
-      r_LS = - w_X;
-      z_LS = matrix (0.0, rows = ncol(X), cols = 1);
-      p_LS = - r_LS;
-      norm_r2_LS = sum (r_LS ^ 2);
-      i_LS = 0;
-      while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS)
-      {
-        q_LS = t(X) %*% X %*% p_LS;
-        q_LS = q_LS + lambda_LS * p_LS;
-        alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
-        z_LS = z_LS + alpha_LS * p_LS;
-        old_norm_r2_LS = norm_r2_LS;
-        r_LS = r_LS + alpha_LS * q_LS;
-        norm_r2_LS = sum (r_LS ^ 2);
-        p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
-        i_LS = i_LS + 1;
-      }
-
-      # END LEAST SQUARES
-
-      w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
-    }

From 482349c03a6ba48197748dd43b74cc8cf40cc49f Mon Sep 17 00:00:00 2001
From: bruno <bruno.ignaz.rothschild@gmail.com>
Date: Fri, 26 Jun 2026 07:12:46 +0200
Subject: [PATCH 07/12] replace glm_fit with m_glm

---
 scripts/builtin/stepGLM.dml | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/scripts/builtin/stepGLM.dml b/scripts/builtin/stepGLM.dml
index 3ec1cffcb7f..f7bd72e6b51 100644
--- a/scripts/builtin/stepGLM.dml
+++ b/scripts/builtin/stepGLM.dml
@@ -47,7 +47,6 @@
 #
 # B     Matrix    ---      Estimated regression parameters (betas)
 # S     Matrix    ---      The selected features ordered as computed by the algorithm
-# O     String    ---      The statistics
 # ---------------------------------------------------------------------------------------------
 
 # THE StepGLM SCRIPT CURRENTLY SUPPORTS BERNOULLI DISTRIBUTION FAMILY AND THE FOLLOWING LINK FUNCTIONS ONLY!
@@ -76,6 +75,7 @@
 # DEVIANCE_SCALED       Deviance from the saturated model, scaled by the DISPERSION value
 # -------------------------------------------------------------------------------------------
 
+source("./scripts/builtin/glm.dlm") as glm;
 
 stepGLM = function (
                     Matrix[Double] X, 
@@ -92,16 +92,12 @@ stepGLM = function (
           Double AIC,
           Matrix[Double] B, 
           Matrix[Double] S, 
-          String O
         )
     {
         intercept_status = icpt;
         bernoulli_No_label = yneg;
         distribution_type = 2;
 
-        # currently only the forward selection strategy in supported: start from one feature and iteratively add 
-        # features until AIC improves
-        dir = "forward";
 
         if (distribution_type == 2 & ncol(Y) == 1) {
             is_Y_negative = (Y == bernoulli_No_label);
@@ -121,10 +117,6 @@ stepGLM = function (
 
         # BEGIN STEPWISE GENERALIZED LINEAR MODELS 
 
-        if (dir != "forward") {  
-            stop ("Currently only forward selection strategy is supported!");
-        }
-
         continue = TRUE;
         columns_fixed = matrix (0, rows = 1, cols = num_features);
         columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
@@ -134,18 +126,18 @@ stepGLM = function (
 
         if (intercept_status == 0) {
             # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
-            [AIC_best, ignore_B1, ignore_S1, ignore_O1] = glm_fit (X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+            [AIC_best, ignore_B1, ignore_S1, ignore_O1] = glm::m_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
         } else {
             # compute AIC of an empty model with only intercept (all Ys are constant)
             all_ones = matrix (1, rows = num_records, cols = 1);
-            [AIC_best, ignore_beta2, ignore_S2, ignore_O2] = glm_fit (X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+            [AIC_best, ignore_beta2, ignore_S2, ignore_O2] = glm::m_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
         }
         #print ("Best AIC without any features: " + AIC_best);
 
         # First pass to examine single features
         AICs = matrix (AIC_best, rows = 1, cols = num_features);
         parfor (i in 1:num_features) { 	
-            [AIC_1, ignore_beta3, ignore_S3, ignore_O3] = glm_fit (X=X_orig[,i], Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+            [AIC_1, ignore_beta3, ignore_S3, ignore_O3] = glm::m_glm(X=X_orig[,i], Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
             AICs[1,i] = AIC_1;
         }
 
@@ -163,11 +155,11 @@ stepGLM = function (
             #print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
             if (intercept_status == 0) {
                 # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
-                [AIC_best, ignore_beta4, ignore_S4, ignore_O4] = glm_fit (X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+                [AIC_best, ignore_beta4, ignore_S4, ignore_O4] = glm::m_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
             } else {
                 # compute AIC of an empty model with only intercept (all Ys are constant)
                 ###all_ones = matrix (1, rows = num_records, cols = 1);
-                [AIC_best, ignore_beta5, ignore_S5, ignore_O5] = glm_fit (X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+                [AIC_best, ignore_beta5, ignore_S5, ignore_O5] = glm::m_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
             }
         };
 
@@ -184,7 +176,7 @@ stepGLM = function (
                     # Construct the feature matrix
                     X_loop = cbind (X_global, X_orig[,i]);
 
-                    [AIC_2, ignore_beta6, ignore_S6, ignore_O6] = glm_fit (X=X_loop, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+                    [AIC_2, ignore_beta6, ignore_S6, ignore_O6] = glm::m_glm(X=X_loop, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
                     AICs[1,i] = AIC_2;
                 }		
             }
@@ -216,7 +208,7 @@ stepGLM = function (
 
         # run GLM with selected set of features
         print ("Running GLM with selected features...");
-        [AIC, B, S, O] = glm_fit (X=X_global, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+        [AIC, B, S, O] = glm::m_glm(X=X_global, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
     }
 
 

From 8e24e9c4a1aabebbd5c0f6994aef7c57f6b49579 Mon Sep 17 00:00:00 2001
From: bruno <bruno.ignaz.rothschild@gmail.com>
Date: Sun, 28 Jun 2026 03:01:15 +0200
Subject: [PATCH 08/12] add missing glm_fit replacment

---
 scripts/algorithms/TestBuiltinStepGLM.dml |  19 +++-
 scripts/builtin/stepGLM.dml               | 118 ++++++++++++++++------
 2 files changed, 106 insertions(+), 31 deletions(-)

diff --git a/scripts/algorithms/TestBuiltinStepGLM.dml b/scripts/algorithms/TestBuiltinStepGLM.dml
index ab64960b35a..ffe1a1f64db 100644
--- a/scripts/algorithms/TestBuiltinStepGLM.dml
+++ b/scripts/algorithms/TestBuiltinStepGLM.dml
@@ -13,9 +13,26 @@ Z = X %*% beta_true;
 P_y = 1.0 / (1.0 + exp(-Z));
 Y = (rand(rows=N, cols=1, min=0.0, max=1.0, seed=456) < P_y) * 1.0;
 
-[AIC, B, S, O] = stepGLM::stepGLM(X=X, Y=Y, link=2, yneg=0.0, icpt=0, tol=1e-6, disp=0.0, moi=200, mii=0, thr=0.01);
+[AIC, B, S] = stepGLM::stepGLM(X=X, Y=Y, link=2, yneg=0.0, icpt=0, tol=1e-6, disp=0.0, moi=200, mii=0, thr=0.01);
 
 print("Optimal AIC: " + AIC);
 print("Selected Feature Indices:\n" + toString(S));
 print("Estimated Coefficients:\n" + toString(B));
 
+epsilon = 0.5;
+beta_est = matrix(0, rows=P, cols=1);
+for (i in 1:nrow(B)) {
+    idx = as.scalar(S[1, i]);
+    if (nrow(S) > 1) { idx = as.scalar(S[i, 1]); }
+    beta_est[idx, 1] = B[i, 1];
+}
+
+if (nrow(B) != 3 | sum(beta_est != 0 & beta_true == 0) > 0 | sum(beta_est == 0 & beta_true != 0) > 0) {
+    stop("Test failed: Inexact feature support recovery.");
+}
+
+if (max(abs(beta_est - beta_true)) > epsilon) {
+    stop("Test failed: Parameter estimates exceed tolerance bound epsilon = " + epsilon + ".");
+}
+
+print("Automated convergence and estimation validation passed.");
diff --git a/scripts/builtin/stepGLM.dml b/scripts/builtin/stepGLM.dml
index f7bd72e6b51..d4f894a52ac 100644
--- a/scripts/builtin/stepGLM.dml
+++ b/scripts/builtin/stepGLM.dml
@@ -45,37 +45,18 @@
 # OUTPUT: Matrix beta, whose size depends on icpt:
 #     icpt=0: ncol(X) x 1;  icpt=1: (ncol(X) + 1) x 1;  icpt=2: (ncol(X) + 1) x 2
 #
+# AIC   Double    ---      AIC value
 # B     Matrix    ---      Estimated regression parameters (betas)
 # S     Matrix    ---      The selected features ordered as computed by the algorithm
 # ---------------------------------------------------------------------------------------------
 
 # THE StepGLM SCRIPT CURRENTLY SUPPORTS BERNOULLI DISTRIBUTION FAMILY AND THE FOLLOWING LINK FUNCTIONS ONLY!
-# 	- LOG
+# - LOG
 #	- LOGIT
 #	- PROBIT
 #	- CLOGLOG
 
-# In addition, in the last run of GLM some statistics are provided in CSV format, one comma-separated name-value
-# pair per each line, as follows:
-#
-# NAME                  MEANING
-# -------------------------------------------------------------------------------------------
-# TERMINATION_CODE      A positive integer indicating success/failure as follows:
-#                       1 = Converged successfully; 2 = Maximum number of iterations reached; 
-#                       3 = Input (X, Y) out of range; 4 = Distribution/link is not supported
-# BETA_MIN              Smallest beta value (regression coefficient), excluding the intercept
-# BETA_MIN_INDEX        Column index for the smallest beta value
-# BETA_MAX              Largest beta value (regression coefficient), excluding the intercept
-# BETA_MAX_INDEX        Column index for the largest beta value
-# INTERCEPT             Intercept value, or NaN if there is no intercept (if icpt=0)
-# DISPERSION            Dispersion used to scale deviance, provided as "disp" input parameter
-#                       or estimated (same as DISPERSION_EST) if the "disp" parameter is <= 0
-# DISPERSION_EST        Dispersion estimated from the dataset
-# DEVIANCE_UNSCALED     Deviance from the saturated model, assuming dispersion == 1.0
-# DEVIANCE_SCALED       Deviance from the saturated model, scaled by the DISPERSION value
-# -------------------------------------------------------------------------------------------
-
-source("./scripts/builtin/glm.dlm") as glm;
+source("./scripts/builtin/glm.dml") as glm;
 
 stepGLM = function (
                     Matrix[Double] X, 
@@ -91,7 +72,7 @@ stepGLM = function (
 ) return (
           Double AIC,
           Matrix[Double] B, 
-          Matrix[Double] S, 
+          Matrix[Double] S
         )
     {
         intercept_status = icpt;
@@ -126,18 +107,18 @@ stepGLM = function (
 
         if (intercept_status == 0) {
             # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
-            [AIC_best, ignore_B1, ignore_S1, ignore_O1] = glm::m_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+            [AIC_best, ignore_B1, ignore_S1] = internal_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
         } else {
             # compute AIC of an empty model with only intercept (all Ys are constant)
             all_ones = matrix (1, rows = num_records, cols = 1);
-            [AIC_best, ignore_beta2, ignore_S2, ignore_O2] = glm::m_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+            [AIC_best, ignore_beta2, ignore_S2] = internal_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
         }
         #print ("Best AIC without any features: " + AIC_best);
 
         # First pass to examine single features
         AICs = matrix (AIC_best, rows = 1, cols = num_features);
         parfor (i in 1:num_features) { 	
-            [AIC_1, ignore_beta3, ignore_S3, ignore_O3] = glm::m_glm(X=X_orig[,i], Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+            [AIC_1, ignore_beta3, ignore_S3] = internal_glm(X=X_orig[,i], Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
             AICs[1,i] = AIC_1;
         }
 
@@ -155,11 +136,11 @@ stepGLM = function (
             #print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
             if (intercept_status == 0) {
                 # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
-                [AIC_best, ignore_beta4, ignore_S4, ignore_O4] = glm::m_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+                [AIC_best, ignore_beta4, ignore_S4] = internal_glm(X=X_global, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
             } else {
                 # compute AIC of an empty model with only intercept (all Ys are constant)
                 ###all_ones = matrix (1, rows = num_records, cols = 1);
-                [AIC_best, ignore_beta5, ignore_S5, ignore_O5] = glm::m_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+                [AIC_best, ignore_beta5, ignore_S5] = internal_glm(X=all_ones, Y=Y, intercept_status=0, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
             }
         };
 
@@ -176,7 +157,7 @@ stepGLM = function (
                     # Construct the feature matrix
                     X_loop = cbind (X_global, X_orig[,i]);
 
-                    [AIC_2, ignore_beta6, ignore_S6, ignore_O6] = glm::m_glm(X=X_loop, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+                    [AIC_2, ignore_beta6, ignore_S6] = internal_glm(X=X_loop, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
                     AICs[1,i] = AIC_2;
                 }		
             }
@@ -208,7 +189,84 @@ stepGLM = function (
 
         # run GLM with selected set of features
         print ("Running GLM with selected features...");
-        [AIC, B, S, O] = glm::m_glm(X=X_global, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+        [AIC, B, S] = internal_glm(X=X_global, Y=Y, intercept_status=intercept_status, num_features_orig=num_features, Selected=columns_fixed_ordered, link=link, disp=disp, tol=tol, moi=moi, mii=mii);
+    }
+
+compute_AIC = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] B, Int link, Int icpt)
+    return (Double aic)
+{
+    # Isolate unscaled parameters; accounts for m_glm returning 2 columns when icpt=2
+    beta = B[, 1];
+
+    if (icpt > 0) {
+        ones = matrix(1, rows=nrow(X), cols=1);
+        X_design = cbind(X, ones);
+    } else {
+        X_design = X;
+    }
+
+    eta = X_design %*% beta;
+
+    # Map inverse link functions 
+    if (link == 1) {
+        # log, see https://en.wikipedia.org/wiki/Generalized_linear_model#Link_function
+        mu = exp(eta); 
+    } else if (link == 2) { 
+        # logit, see https://en.wikipedia.org/wiki/Generalized_linear_model#Link_function
+        mu = 1 / (1 + exp(-eta)); 
+    } else if (link == 3) { 
+        # probit approximation, page 1487 in "Qualitative Response Models: A Survey (1981)" by Takeshi Amemiya
+        mu = 1 / (1 + exp(-1.6 * eta));
+    } else if (link == 4) { 
+        # cloglog, see https://search.r-project.org/CRAN/refmans/VGAM/html/clogloglink.html
+        mu = 1 - exp(-exp(eta)); 
+    } else {
+        stop("Unsupported link function code: " + link);
+        mu = eta; 
     }
 
+    # Constrain to prevent numerical discontinuity in log(0)
+    mu = max(mu, 1e-15);
+    mu = min(mu, 1 - 1e-15);
+
+    Y_yes = Y[, 1];
+    Y_neg = Y[, 2];
+
+    LL = sum(Y_yes * log(mu) + Y_neg * log(1 - mu));
+    k = nrow(beta);
+    
+    aic = 2 * k - 2 * LL;
+}
+
+internal_glm = function (
+    Matrix[Double] X, 
+    Matrix[Double] Y, 
+    Int intercept_status, 
+    Double num_features_orig, 
+    Matrix[Double] Selected, 
+    Int link, 
+    Double disp, 
+    Double tol, 
+    Int moi, 
+    Int mii
+) return (
+    Double AIC, 
+    Matrix[Double] B, 
+    Matrix[Double] S
+) {
+    # bernoulli family parameters, see table in ./scripts/builtin/m_glm.dml for details. 
+    new_link = link;
+    new_lpow = 1.0;
+    
+    if (link == 1) {
+        new_link = 1; 
+        new_lpow = 0.0; 
+    }
+    B = glm::m_glm(X=X, Y=Y, dfam=2, vpow=0.0, link=new_link, lpow=new_lpow, yneg=0.0, 
+              icpt=intercept_status, disp=disp, reg=0.0, tol=tol, 
+              moi=moi, mii=mii, verbose=FALSE);
 
+    AIC = compute_AIC(X, Y, B, link, intercept_status);
+    
+    S = Selected;
+}

From ca5429b3f3c99c1011118c728a031a7f79563470 Mon Sep 17 00:00:00 2001
From: bruno <bruno.ignaz.rothschild@gmail.com>
Date: Sun, 28 Jun 2026 05:52:08 +0200
Subject: [PATCH 09/12] edit test script

---
 scripts/algorithms/TestBuiltinStepGLM.dml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/scripts/algorithms/TestBuiltinStepGLM.dml b/scripts/algorithms/TestBuiltinStepGLM.dml
index ffe1a1f64db..f22bb07d107 100644
--- a/scripts/algorithms/TestBuiltinStepGLM.dml
+++ b/scripts/algorithms/TestBuiltinStepGLM.dml
@@ -15,24 +15,27 @@ Y = (rand(rows=N, cols=1, min=0.0, max=1.0, seed=456) < P_y) * 1.0;
 
 [AIC, B, S] = stepGLM::stepGLM(X=X, Y=Y, link=2, yneg=0.0, icpt=0, tol=1e-6, disp=0.0, moi=200, mii=0, thr=0.01);
 
+print("\n\n\n\n\n\n\nTest Results:");
 print("Optimal AIC: " + AIC);
 print("Selected Feature Indices:\n" + toString(S));
 print("Estimated Coefficients:\n" + toString(B));
 
-epsilon = 0.5;
 beta_est = matrix(0, rows=P, cols=1);
 for (i in 1:nrow(B)) {
     idx = as.scalar(S[1, i]);
-    if (nrow(S) > 1) { idx = as.scalar(S[i, 1]); }
     beta_est[idx, 1] = B[i, 1];
 }
 
+# Case 01
 if (nrow(B) != 3 | sum(beta_est != 0 & beta_true == 0) > 0 | sum(beta_est == 0 & beta_true != 0) > 0) {
     stop("Test failed: Inexact feature support recovery.");
 }
+print("passed test 1")
 
+# Case 02
+epsilon = 0.5;
 if (max(abs(beta_est - beta_true)) > epsilon) {
     stop("Test failed: Parameter estimates exceed tolerance bound epsilon = " + epsilon + ".");
 }
+print("passed test 2")
 
-print("Automated convergence and estimation validation passed.");

From 3321316255de955baf085d353a69ade2cee71026 Mon Sep 17 00:00:00 2001
From: bruno <bruno.ignaz.rothschild@gmail.com>
Date: Mon, 29 Jun 2026 15:45:21 +0200
Subject: [PATCH 10/12] add license to TestBuiltinStepGLM.dml

---
 scripts/algorithms/TestBuiltinStepGLM.dml | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/scripts/algorithms/TestBuiltinStepGLM.dml b/scripts/algorithms/TestBuiltinStepGLM.dml
index f22bb07d107..9924728b992 100644
--- a/scripts/algorithms/TestBuiltinStepGLM.dml
+++ b/scripts/algorithms/TestBuiltinStepGLM.dml
@@ -1,3 +1,25 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
 source("scripts/builtin/stepGLM.dml") as stepGLM;
 
 N = 1000;

From e13e83af0272959df105283b9f6cb3bd7cba4d6f Mon Sep 17 00:00:00 2001
From: bruno <bruno.ignaz.rothschild@gmail.com>
Date: Wed, 1 Jul 2026 15:31:40 +0200
Subject: [PATCH 11/12] rename stepGLM to m_stepGLM; register stepGLM in
 Builtins.java

---
 scripts/algorithms/TestBuiltinStepGLM.dml           | 2 +-
 scripts/builtin/stepGLM.dml                         | 2 +-
 src/main/java/org/apache/sysds/common/Builtins.java | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/algorithms/TestBuiltinStepGLM.dml b/scripts/algorithms/TestBuiltinStepGLM.dml
index 9924728b992..52d393bd29f 100644
--- a/scripts/algorithms/TestBuiltinStepGLM.dml
+++ b/scripts/algorithms/TestBuiltinStepGLM.dml
@@ -35,7 +35,7 @@ Z = X %*% beta_true;
 P_y = 1.0 / (1.0 + exp(-Z));
 Y = (rand(rows=N, cols=1, min=0.0, max=1.0, seed=456) < P_y) * 1.0;
 
-[AIC, B, S] = stepGLM::stepGLM(X=X, Y=Y, link=2, yneg=0.0, icpt=0, tol=1e-6, disp=0.0, moi=200, mii=0, thr=0.01);
+[AIC, B, S] = stepGLM::m_stepGLM(X=X, Y=Y, link=2, yneg=0.0, icpt=0, tol=1e-6, disp=0.0, moi=200, mii=0, thr=0.01);
 
 print("\n\n\n\n\n\n\nTest Results:");
 print("Optimal AIC: " + AIC);
diff --git a/scripts/builtin/stepGLM.dml b/scripts/builtin/stepGLM.dml
index d4f894a52ac..00230ca455c 100644
--- a/scripts/builtin/stepGLM.dml
+++ b/scripts/builtin/stepGLM.dml
@@ -58,7 +58,7 @@
 
 source("./scripts/builtin/glm.dml") as glm;
 
-stepGLM = function (
+m_stepGLM = function (
                     Matrix[Double] X, 
                     Matrix[Double] Y, 
                     Int link = 2,
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index e21c539d6d8..62145124d82 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -333,6 +333,7 @@ public enum Builtins {
 	STATSNA("statsNA", true),
 	STRATSTATS("stratstats", true),
 	STEPLM("steplm",true, ReturnType.MULTI_RETURN),
+	STEPGLM("stepGLM",true, ReturnType.MULTI_RETURN),
 	STFT("stft", false, ReturnType.MULTI_RETURN),
 	SQRT("sqrt", false),
 	SQRT_MATRIX("sqrtMatrix", true),

From c3410638cf2748de20b429a34a58674f39fddd4e Mon Sep 17 00:00:00 2001
From: bruno <bruno.ignaz.rothschild@gmail.com>
Date: Wed, 1 Jul 2026 16:40:11 +0200
Subject: [PATCH 12/12] add BuiltinSTEPGlmTest.java

---
 .../builtin/part2/BuiltinSTEPGlmTest.java     | 68 +++++++++++++++++++
 .../scripts/functions/builtin/stepGLM.dml     |  4 ++
 2 files changed, 72 insertions(+)
 create mode 100644 src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinSTEPGlmTest.java
 rename scripts/algorithms/TestBuiltinStepGLM.dml => src/test/scripts/functions/builtin/stepGLM.dml (97%)

diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinSTEPGlmTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinSTEPGlmTest.java
new file mode 100644
index 00000000000..7d95dc6ec28
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinSTEPGlmTest.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin.part2;
+
+import org.junit.Test;
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.common.Types.ExecType;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+
+public class BuiltinSTEPGlmTest extends AutomatedTestBase 
+{
+	private final static String TEST_NAME = "stepGLM";
+	private final static String TEST_DIR = "functions/builtin/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinSTEPGlmTest.class.getSimpleName() + "/";
+
+	@Override
+	public void setUp() {
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{}));
+	}
+
+	@Test
+	public void testLmMatrixDenseCPlm() {
+		runSTEPGlmTest(ExecType.CP);
+	}
+
+	@Test
+	public void testLmMatrixSparseSPlm() {
+		runSTEPGlmTest(ExecType.SPARK);
+	}
+
+	private void runSTEPGlmTest(ExecType instType) {
+		ExecMode platformOld = setExecMode(instType);
+
+		try {
+			loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+			String HOME = SCRIPT_DIR + TEST_DIR;
+
+      // Pointing to the generated validation DML script
+      fullDMLScriptName = HOME + TEST_NAME + ".dml";
+      programArgs = new String[]{};
+
+      // runTest executes the script; fails if the DML script invokes stop()
+      runTest(true, false, null, -1);
+		}
+		finally {
+			rtplatform = platformOld;
+		}
+	}
+}
diff --git a/scripts/algorithms/TestBuiltinStepGLM.dml b/src/test/scripts/functions/builtin/stepGLM.dml
similarity index 97%
rename from scripts/algorithms/TestBuiltinStepGLM.dml
rename to src/test/scripts/functions/builtin/stepGLM.dml
index 52d393bd29f..95a4dfb9908 100644
--- a/scripts/algorithms/TestBuiltinStepGLM.dml
+++ b/src/test/scripts/functions/builtin/stepGLM.dml
@@ -61,3 +61,7 @@ if (max(abs(beta_est - beta_true)) > epsilon) {
 }
 print("passed test 2")
 
+
+
+#stop("!!!Sucess!!!") # uncomment for letting the test fail
+