From ef03d1089bb8e8acdce3c852f78330054924a736 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 19:38:59 -0700 Subject: [PATCH 01/11] src/radmeth/radmeth.cpp: moving some functions out of this source file and into the radmeth_model.chpp sources --- src/radmeth/radmeth.cpp | 162 +++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 87 deletions(-) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index 0fa8fcbe..e4a89aa3 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -1,9 +1,7 @@ -/* Copyright (C) 2013-2023 University of Southern California and - * Egor Dolzhenko - * Andrew D Smith - * Guilherme Sena +/* Copyright (C) 2013-2025 Andrew D Smith * - * Authors: Andrew D. Smith and Egor Dolzhenko and Guilherme Sena + * Author: Andrew D. Smith + * Contributors: Andrew D. Smith and Egor Dolzhenko and Guilherme Sena * * This program is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -16,6 +14,15 @@ * General Public License for more details. 
*/ +#include "radmeth_model.hpp" +#include "radmeth_optimize.hpp" + +// smithlab headers +#include "GenomicRegion.hpp" +#include "OptionParser.hpp" +#include "smithlab_os.hpp" +#include "smithlab_utils.hpp" + #include // GSL header for chisqrd distribution #include @@ -29,20 +36,6 @@ #include #include -// smithlab headers -#include "GenomicRegion.hpp" -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" - -#include "radmeth_model.hpp" -#include "radmeth_optimize.hpp" - -using std::begin; -using std::cout; -using std::distance; -using std::end; - struct file_progress { double one_hundred_over_filesize{}; std::size_t prev_offset{}; @@ -50,24 +43,17 @@ struct file_progress { one_hundred_over_filesize{100.0 / std::filesystem::file_size(filename)} {} void operator()(std::ifstream &in) { - const std::size_t curr_offset = in.tellg() * one_hundred_over_filesize; + const std::size_t curr_offset = + in.eof() ? 100 : in.tellg() * one_hundred_over_filesize; if (curr_offset <= prev_offset) return; - std::cerr << "[progress: " << std::setw(3) << curr_offset - << (curr_offset == 100 ? "%]\n" : "%]\r"); + std::cerr << "\r[progress: " << std::setw(3) << curr_offset + << (curr_offset == 100 ? "%]\n" : "%]"); prev_offset = (curr_offset == 100) ? std::numeric_limits::max() : curr_offset; } }; -static void -remove_factor(Design &design, const std::size_t factor_idx) { - design.factor_names.erase(std::begin(design.factor_names) + factor_idx); - for (std::size_t i = 0; i < design.n_samples(); ++i) - design.matrix[i].erase(std::begin(design.matrix[i]) + factor_idx); -} - -/***************** RADMETH ALGORITHM *****************/ static bool consistent_sample_names(const Regression ®, const std::string &header) { std::istringstream iss(header); @@ -84,7 +70,7 @@ consistent_sample_names(const Regression ®, const std::string &header) { // function outputs the p-value of the log-likelihood ratio. 
*Note* that it is // assumed that the reduced model has one fewer factor than the full model. static double -llr_test(double null_loglik, double full_loglik) { +llr_test(const double null_loglik, const double full_loglik) { // The log-likelihood ratio statistic. const double log_lik_stat = -2 * (null_loglik - full_loglik); @@ -100,16 +86,15 @@ } static bool -has_low_coverage(const Regression &reg, const std::size_t test_fac) { +has_low_coverage(const Regression &reg, const std::size_t test_factor) { bool cvrd_in_test_fact_smpls = false; + const auto &tcol = reg.design.tmatrix[test_factor]; for (std::size_t i = 0; i < reg.n_samples() && !cvrd_in_test_fact_smpls; ++i) - cvrd_in_test_fact_smpls = - (reg.design.matrix[i][test_fac] == 1 && reg.props.mc[i].n_reads != 0); + cvrd_in_test_fact_smpls = (tcol[i] == 1 && reg.props.mc[i].n_reads != 0); bool cvrd_in_other_smpls = false; for (std::size_t i = 0; i < reg.n_samples() && !cvrd_in_other_smpls; ++i) - cvrd_in_other_smpls = - (reg.design.matrix[i][test_fac] != 1 && reg.props.mc[i].n_reads != 0); + cvrd_in_other_smpls = (tcol[i] != 1 && reg.props.mc[i].n_reads != 0); return !cvrd_in_test_fact_smpls || !cvrd_in_other_smpls; } @@ -131,9 +116,9 @@ has_extreme_counts(const Regression &reg) { [[nodiscard]] static bool has_two_values(const Regression &reg, const std::size_t test_factor) { - const auto &v = reg.design.tmatrix[test_factor]; - for (auto i = std::cbegin(v); i != std::cend(v); ++i) - if (*i != v[0]) + const auto &tcol = reg.design.tmatrix[test_factor]; + for (const auto x : tcol) + if (x != tcol[0]) return true; return false; } @@ -145,25 +130,19 @@ get_test_factor_idx(const Regression &model, const std::string &test_factor) { std::find(std::cbegin(factors), std::cend(factors), test_factor); if (itr == std::cend(factors)) - throw std::runtime_error("Factor not part of design: " + test_factor); + throw std::runtime_error("factor not part of design: " + test_factor); return
std::distance(std::cbegin(factors), itr); } -static void -read_design(const bool verbose, const std::string &design_filename, - Design &design) { - if (verbose) - std::cerr << "design table filename: " << design_filename << std::endl; +[[nodiscard]] static Design +read_design(const std::string &design_filename) { std::ifstream design_file(design_filename); if (!design_file) throw std::runtime_error("could not open file: " + design_filename); - - // initialize full design matrix from file + Design design; design_file >> design; - - if (verbose) - std::cerr << design << std::endl; + return design; } enum class row_status : std::uint8_t { @@ -174,11 +153,11 @@ enum class row_status : std::uint8_t { }; [[nodiscard]] static std::vector -drop_idx(const std::vector &v, const std::size_t to_drop) { +drop_idx(const std::vector &v, const std::size_t idx_to_drop) { std::vector u; u.reserve(std::size(v) - 1); for (auto i = 0u; i < std::size(v); ++i) - if (i != to_drop) + if (i != idx_to_drop) u.push_back(v[i]); return u; } @@ -218,14 +197,18 @@ radmeth(const bool show_progress, const bool more_na_info, std::string sample_names_header; std::getline(table_file, sample_names_header); - if (!consistent_sample_names(alt_model, sample_names_header)) - throw std::runtime_error( - "header:\n" + sample_names_header + "\n" + - "does not match factor names or their order in the\n" - "design matrix. Check that the design matrix and\n" - "the proportion table are correctly formatted."); + if (!consistent_sample_names(alt_model, sample_names_header)) { + // clang-format off + const auto message = + R"( +does not match factor names or their order in the design matrix. Check +that the design matrix and the proportion table are correctly formatted. 
+)"; + // clang-format on + throw std::runtime_error("header:\n" + sample_names_header + message); + } - const std::size_t n_samples = alt_model.design.n_samples(); + const std::size_t n_samples = alt_model.n_samples(); std::ofstream out(outfile); if (!out) @@ -266,9 +249,9 @@ radmeth(const bool show_progress, const bool more_na_info, throw std::runtime_error("found row with wrong number of columns"); const auto [p_val, status] = [&]() -> std::tuple { - // Skip the test if (1) no coverage in all cases or in all controls, - // or (2) the site is completely methylated or completely - // unmethylated across all samples. + // Skip the test if (1) no coverage in all cases or in all + // controls, or (2) the site is completely methylated or + // completely unmethylated across all samples. if (has_low_coverage(t_alt_model, test_factor_idx)) return std::tuple{1.0, row_status::na_low_cov}; @@ -308,11 +291,12 @@ radmeth(const bool show_progress, const bool more_na_info, n_bytes[b] = [&] { // clang-format off - const int n_prefix_bytes = std::sprintf(bufs[b].data(), prefix_fmt, - t_alt_model.props.chrom.data(), - t_alt_model.props.position, - t_alt_model.props.strand, - t_alt_model.props.context.data()); + const int n_prefix_bytes = + std::sprintf(bufs[b].data(), prefix_fmt, + t_alt_model.props.chrom.data(), + t_alt_model.props.position, + t_alt_model.props.strand, + t_alt_model.props.context.data()); // clang-format on if (n_prefix_bytes < 0) return n_prefix_bytes; @@ -336,11 +320,11 @@ radmeth(const bool show_progress, const bool more_na_info, cursor += n_pval_bytes; // clang-format off - const int n_suffix_bytes = std::sprintf(cursor, suffix_fmt, - n_reads_factor, - n_meth_factor, - n_reads_others, - n_meth_others); + const int n_suffix_bytes = std::sprintf(cursor, suffix_fmt, + n_reads_factor, + n_meth_factor, + n_reads_others, + n_meth_others); // clang-format on if (n_suffix_bytes < 0) return n_suffix_bytes; @@ -366,12 +350,11 @@ radmeth(const bool show_progress, const 
bool more_na_info, if (n_lines < n_threads) break; } + + if (show_progress) + progress(table_file); } -/*********************************************************************** - * Run beta-binoimial regression using the specified table with - * proportions and design matrix - */ int main_radmeth(int argc, char *argv[]) { try { @@ -431,31 +414,36 @@ main_radmeth(int argc, char *argv[]) { const std::string table_filename(leftover_args.back()); /****************** END COMMAND LINE OPTIONS *****************/ + if (verbose) + std::cerr << "design table filename: " << design_filename << '\n'; + // initialize full design matrix from file - Regression orig_alt_model; - read_design(verbose, design_filename, orig_alt_model.design); + Regression alt_model; + alt_model.design = read_design(design_filename); + if (verbose) + std::cerr << "Alternate model:\n" << alt_model.design << '\n'; // Check that provided test factor name exists and find its index. - const auto test_factor_idx = - get_test_factor_idx(orig_alt_model, test_factor); + const auto test_factor_idx = get_test_factor_idx(alt_model, test_factor); // verify that the design includes more than one level for the // test factor - if (!has_two_values(orig_alt_model, test_factor_idx)) { - const auto first_level = orig_alt_model.design.matrix[0][test_factor_idx]; + if (!has_two_values(alt_model, test_factor_idx)) { + const auto first_level = alt_model.design.matrix[0][test_factor_idx]; throw std::runtime_error("test factor only one level: " + test_factor + ", " + std::to_string(first_level)); } - Regression orig_null_model; - orig_null_model.design = orig_alt_model.design; - remove_factor(orig_null_model.design, test_factor_idx); + Regression null_model = alt_model; + null_model.design = alt_model.design.drop_factor(test_factor_idx); + if (verbose) + std::cerr << "Null model:\n" << null_model.design << '\n'; radmeth(show_progress, more_na_info, n_threads, table_filename, outfile, - orig_alt_model, orig_null_model, 
test_factor_idx); + alt_model, null_model, test_factor_idx); } catch (const std::exception &e) { - std::cerr << "ERROR: " << e.what() << std::endl; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 0f1c05b90c60af5f998007eb837ef8550d6644e4 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 19:39:32 -0700 Subject: [PATCH 02/11] src/radmeth/radmeth_model.hpp: adding cumulative counts for the new algorithm to calculate the beta-binomial likelihood --- src/radmeth/radmeth_model.hpp | 41 ++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/radmeth/radmeth_model.hpp b/src/radmeth/radmeth_model.hpp index d7a01d63..bf60620d 100644 --- a/src/radmeth/radmeth_model.hpp +++ b/src/radmeth/radmeth_model.hpp @@ -25,19 +25,37 @@ #include #include +struct cumulative_counts { + std::vector m_counts; + std::vector r_counts; + std::vector d_counts; +}; + struct Design { std::vector factor_names; std::vector sample_names; std::vector> matrix; std::vector> tmatrix; - std::size_t + std::vector> groups; + std::vector group_id; + + [[nodiscard]] std::size_t n_factors() const { - return factor_names.size(); + return std::size(factor_names); + } + + [[nodiscard]] std::size_t + n_groups() const { + return std::size(groups); } - std::size_t + + [[nodiscard]] std::size_t n_samples() const { - return sample_names.size(); + return std::size(sample_names); } + + [[nodiscard]] Design + drop_factor(const std::size_t factor_idx); }; std::istream & @@ -56,7 +74,7 @@ operator>>(std::istream &in, mcounts &rm) { return in >> rm.n_reads >> rm.n_meth; } -struct SiteProportions { +struct SiteProp { std::string chrom; std::size_t position{}; char strand{}; @@ -68,13 +86,15 @@ struct SiteProportions { }; struct Regression { - static double tolerance; // 1e-4; + static double tolerance; // 1e-3; static double stepsize; // 0.001; - static std::uint32_t max_iter; // 700; + static std::uint32_t max_iter; // 250; 
Design design; - SiteProportions props; + SiteProp props; double max_loglik{}; + + std::vector cumul; std::vector p_v; // scratch space [[nodiscard]] std::size_t @@ -82,6 +102,11 @@ struct Regression { return design.n_factors(); } + [[nodiscard]] std::size_t + n_groups() const { + return design.n_groups(); + } + [[nodiscard]] std::size_t props_size() const { return std::size(props.mc); From 6ac4a904c2a6c86fd7192b0ea87a79d9975c17f0 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 19:40:28 -0700 Subject: [PATCH 03/11] src/radmeth/radmeth_model.cpp: added functions to make groups and to get the mapping from sample to group; also put some functions here that were previously in radmeth.cpp because they belong with the model structs --- src/radmeth/radmeth_model.cpp | 49 +++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/src/radmeth/radmeth_model.cpp b/src/radmeth/radmeth_model.cpp index faa4a8bb..a4b6ef6f 100644 --- a/src/radmeth/radmeth_model.cpp +++ b/src/radmeth/radmeth_model.cpp @@ -29,7 +29,7 @@ double Regression::stepsize = 0.001; std::uint32_t Regression::max_iter = 250; void -SiteProportions::parse(const std::string &line) { +SiteProp::parse(const std::string &line) { const auto first_ws = line.find_first_of(" \t"); // Parse the row name (must be like: "chr:position:strand:context") @@ -105,6 +105,27 @@ SiteProportions::parse(const std::string &line) { throw std::runtime_error("failed to parse counts from:\n" + line); } +static void +make_groups(Design &design) { + auto s = design.matrix; + std::sort(std::begin(s), std::end(s)); + s.erase(std::unique(std::begin(s), std::end(s)), std::end(s)); + design.groups = std::move(s); +} + +static void +assign_groups(Design &design) { + const auto &matrix = design.matrix; + const auto &s = design.groups; + const auto n_samples = design.n_samples(); + auto &group_id = design.group_id; + group_id.resize(n_samples); + for (auto i = 0u; i < n_samples; ++i) { + 
const auto x = std::find(std::cbegin(s), std::cend(s), matrix[i]); + group_id[i] = std::distance(std::cbegin(s), x); + } +} + template static void transpose(const std::vector> &mat, @@ -152,10 +173,33 @@ operator>>(std::istream &is, Design &design) { } transpose(design.matrix, design.tmatrix); + make_groups(design); + assign_groups(design); return is; } +[[nodiscard]] Design +Design::drop_factor(const std::size_t factor_idx) { + // clang-format off + Design design{ + factor_names, + sample_names, + matrix, + tmatrix, + groups, + group_id, + }; + // clang-format on + design.factor_names.erase(std::begin(design.factor_names) + factor_idx); + for (auto i = 0u; i < n_samples(); ++i) + design.matrix[i].erase(std::begin(design.matrix[i]) + factor_idx); + transpose(design.matrix, design.tmatrix); + make_groups(design); + assign_groups(design); + return design; +} + std::ostream & operator<<(std::ostream &out, const Design &design) { for (std::size_t factor = 0; factor < design.factor_names.size(); ++factor) { @@ -168,8 +212,9 @@ operator<<(std::ostream &out, const Design &design) { for (std::size_t i = 0; i < design.n_samples(); ++i) { out << design.sample_names[i]; for (std::size_t j = 0; j < design.n_factors(); ++j) - out << "\t" << design.matrix[i][j]; + out << "\t" << static_cast(design.matrix[i][j]); out << "\n"; } + return out; } From 5cfdb63b90f644a421f3005478b5e1db8f7e968c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 19:54:23 -0700 Subject: [PATCH 04/11] src/radmeth/radmeth_optimize.cpp: implementing a new algorithm for computing the beta-binomial likelihood and gradients. This algorithm accumulates mass without the use of unstable gamma functions but minimizes the number of summations by re-grouping terms. 
Doing this required a focus on groups rather than factors and samples, where a group is a set of factor level combinations --- src/radmeth/radmeth_optimize.cpp | 221 ++++++++++++++++++------------- 1 file changed, 131 insertions(+), 90 deletions(-) diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index 8affe87e..e58f117d 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -1,9 +1,8 @@ -/* Copyright (C) 2013-2023 University of Southern California and - * Egor Dolzhenko +/* Copyright (C) 2013-2025 University of Southern California and * Andrew D Smith - * Guilherme Sena * - * Authors: Andrew D. Smith and Egor Dolzhenko and Guilherme Sena + * Author: Andrew D. Smith + * Contributors: Egor Dolzhenko and Guilherme Sena * * This program is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -23,140 +22,133 @@ #include #include -#include -#include #include #include #include +#include +#include + +[[nodiscard]] static inline double +logistic(const double x) { + return 1.0 / (1.0 / std::exp(x) + 1.0); +} + template [[nodiscard]] static double -pi(const std::vector &v, const gsl_vector *params) { - // ADS: this function doesn't have a very helpful name +get_p(const std::vector &v, const gsl_vector *params) { const auto a = v.data(); - const double dot = std::inner_product(a, a + std::size(v), params->data, 0.0); - // const double p = std::exp(dot) / (1.0 + std::exp(dot)); - return 1.0 / (1.0 / std::exp(dot) + 1.0); + return logistic(std::inner_product(a, a + std::size(v), params->data, 0.0)); } [[nodiscard]] static double neg_loglik(const gsl_vector *params, void *object) { Regression *reg = (Regression *)(object); - // ADS: the dispersion parameter phi is the last element of - // parameter vector + // ADS: dispersion param phi is the last element of parameter vector const double disp_param = gsl_vector_get(params, reg->design.n_factors()); - // 
const double phi = std::exp(disp_param) / (1.0 + std::exp(disp_param)); - const double phi = 1.0 / (1.0 / std::exp(disp_param) + 1.0); + const double phi = logistic(disp_param); const double one_minus_phi = 1.0 - phi; - const auto &mc = reg->props.mc; - const auto &mat = reg->design.matrix; + const auto n_groups = reg->n_groups(); + const auto &groups = reg->design.groups; + const auto &cumul = reg->cumul; double log_lik = 0; - const std::size_t n_samples = reg->design.n_samples(); - for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { - const auto n = mc[col_idx].n_reads; - const auto y = mc[col_idx].n_meth; - const double p = pi(mat[col_idx], params); + for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { + const double p = get_p(groups[g_idx], params); const double one_minus_p = 1.0 - p; const double term1 = one_minus_phi * p; - for (auto k = 0u; k < y; ++k) - log_lik += std::log(term1 + phi * k); + const auto &cumul_y = cumul[g_idx].m_counts; + for (std::size_t k = 0; k < std::size(cumul_y); ++k) + log_lik += cumul_y[k] * std::log(term1 + phi * k); const double term2 = one_minus_phi * one_minus_p; - for (auto k = 0u; k < n - y; ++k) - log_lik += std::log(term2 + phi * k); + const auto &cumul_d = cumul[g_idx].d_counts; + for (std::size_t k = 0; k < std::size(cumul_d); ++k) + log_lik += cumul_d[k] * std::log(term2 + phi * k); - for (auto k = 0u; k < n; ++k) - // log_lik -= std::log(1.0 + phi * (k - 1.0)); - log_lik -= std::log1p(phi * (k - 1.0)); + const auto &cumul_n = cumul[g_idx].r_counts; + for (std::size_t k = 0; k < std::size(cumul_n); ++k) + log_lik -= cumul_n[k] * std::log1p(phi * (k - 1.0)); } + return -log_lik; } static void neg_gradient(const gsl_vector *params, void *object, gsl_vector *output) { Regression *reg = (Regression *)(object); - const std::size_t n_samples = reg->design.n_samples(); - const std::size_t n_factors = reg->design.n_factors(); - /// ADS: using scratch space held in reg instead of allocating here - // std::vector 
p_v(n_samples, 0.0); + const auto n_groups = reg->n_groups(); + const auto &groups = reg->design.groups; + const auto &cumul = reg->cumul; + + // ADS: using scratch space held in reg instead of allocating here auto &p_v = reg->p_v; - for (auto col_idx = 0u; col_idx < n_samples; ++col_idx) - p_v[col_idx] = pi(reg->design.matrix[col_idx], params); + for (auto g_idx = 0u; g_idx < n_groups; ++g_idx) + p_v[g_idx] = get_p(groups[g_idx], params); + const std::size_t n_factors = reg->design.n_factors(); const double disp_param = gsl_vector_get(params, n_factors); - const auto &tmat = reg->design.tmatrix; - // const double phi = std::exp(disp_param) / (1.0 + std::exp(disp_param)); - const double phi = 1.0 / (1.0 / std::exp(disp_param) + 1.0); + const double phi = logistic(disp_param); const double one_minus_phi = 1.0 - phi; - const auto &mc = reg->props.mc; - - for (std::size_t factor_idx = 0; factor_idx < n_factors; ++factor_idx) { - double deriv = 0; - const auto &vec = tmat[factor_idx]; - for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { - const auto n = mc[col_idx].n_reads; - const auto y = mc[col_idx].n_meth; - const double p = p_v[col_idx]; - const double one_minus_p = 1.0 - p; + // init output to zero for all factors + gsl_vector_set_all(output, 0.0); - const double denom_term1 = one_minus_phi * p; - const double factor = denom_term1 * one_minus_p * vec[col_idx]; - if (factor == 0) - continue; + for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { + const double p = p_v[g_idx]; + const double one_minus_p = 1.0 - p; + const double denom_term1 = one_minus_phi * p; + const double denom_term2 = one_minus_phi * one_minus_p; - double term = 0; + double term = 0; - const auto accum_term = [&](auto k, const auto lim, - const double denom_base) { - for (; k < lim; ++k) - term += 1.0 / (denom_base + phi * k); - }; + const auto &cumul_y = cumul[g_idx].m_counts; + for (auto k = 0u; k < std::size(cumul_y); ++k) + term += cumul_y[k] / (denom_term1 + phi * k); - 
const auto accum_subtract_term = [&](auto k, const auto lim, - const double denom_base) { - for (; k < lim; ++k) - term -= 1.0 / (denom_base + phi * k); - }; + const auto &cumul_d = cumul[g_idx].d_counts; + for (auto k = 0u; k < std::size(cumul_d); ++k) + term -= cumul_d[k] / (denom_term2 + phi * k); - accum_term(0u, y, denom_term1); - accum_subtract_term(0u, n - y, one_minus_phi * one_minus_p); + const auto &g = groups[g_idx]; + for (std::size_t factor_idx = 0; factor_idx < n_factors; ++factor_idx) { + const auto level = g[factor_idx]; + if (level == 0) + continue; + const double factor = denom_term1 * one_minus_p * level; + double deriv = gsl_vector_get(output, factor_idx); deriv += term * factor; + gsl_vector_set(output, factor_idx, deriv); } - gsl_vector_set(output, factor_idx, deriv); } double deriv = 0; - for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { - const auto n = mc[col_idx].n_reads; - const auto y = mc[col_idx].n_meth; - const double p = p_v[col_idx]; + for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { + const double p = get_p(groups[g_idx], params); const double one_minus_p = 1.0 - p; - double term = 0; - const auto accum_term = [&](auto k, const auto lim, const double num_shift, - const double denom_base) { - for (; k < lim; ++k) - term += (k - num_shift) / (denom_base + phi * k); - }; - const auto accum_subtract_term = [&](auto k, const auto lim) { - for (; k < lim; ++k) - term -= (k - 1.0) / (1.0 + phi * (k - 1.0)); - }; - - accum_term(0u, y, p, one_minus_phi * p); - accum_term(0u, n - y, one_minus_p, one_minus_phi * one_minus_p); - accum_subtract_term(0u, n); - - deriv += term * phi * one_minus_phi; + const double term1 = one_minus_phi * p; + const auto &cumul_y = cumul[g_idx].m_counts; + for (auto k = 0u; k < std::size(cumul_y); ++k) + deriv += cumul_y[k] * (k - p) / (term1 + phi * k); + + const double term2 = one_minus_phi * one_minus_p; + const auto &cumul_d = cumul[g_idx].d_counts; + for (auto k = 0u; k < 
std::size(cumul_d); ++k) + deriv += cumul_d[k] * (k - one_minus_p) / (term2 + phi * k); + + const auto &cumul_n = cumul[g_idx].r_counts; + for (std::size_t k = 0; k < std::size(cumul_n); ++k) + deriv -= cumul_n[k] * (k - 1.0) / (1.0 + phi * (k - 1.0)); } + deriv *= (phi * one_minus_phi); + gsl_vector_set(output, n_factors, deriv); gsl_vector_scale(output, -1.0); } @@ -168,11 +160,61 @@ neg_loglik_and_grad(const gsl_vector *params, void *object, double *loglik_val, neg_gradient(params, object, d_loglik_val); } -bool +static void +get_cumulative(const std::vector &group_id, + const std::uint32_t n_groups, const std::vector &mc, + std::vector &cumul) { + const auto n_cols = std::size(mc); + cumul.clear(); + cumul.resize(n_groups); + + const auto compute_cumulative = [&](auto get_value, auto get_vector) { + // phase 1: determine max value for each group + for (auto g_idx = 0u; g_idx < n_groups; ++g_idx) { + std::uint32_t max_v{}; + for (auto c_idx = 0u; c_idx < n_cols; ++c_idx) { + if (group_id[c_idx] == g_idx) { + const auto val = get_value(mc[c_idx]); + if (val > max_v) + max_v = val; + } + } + get_vector(cumul[g_idx]).resize(max_v, 0); + } + + // phase 2: fill cumulative counts + for (auto c_idx = 0u; c_idx < n_cols; ++c_idx) { + const auto g_idx = group_id[c_idx]; + const auto val = get_value(mc[c_idx]); + auto &vec = get_vector(cumul[g_idx]); + for (auto i = 0u; i < val; ++i) + vec[i]++; + } + }; + // call the lambda 3 times for m_counts, r_counts, d_counts + compute_cumulative([](const mcounts &m) { return m.n_meth; }, + [](cumulative_counts &c) -> std::vector & { + return c.m_counts; + }); + + compute_cumulative([](const mcounts &m) { return m.n_reads; }, + [](cumulative_counts &c) -> std::vector & { + return c.r_counts; + }); + + compute_cumulative([](const mcounts &m) { return m.n_reads - m.n_meth; }, + [](cumulative_counts &c) -> std::vector & { + return c.d_counts; + }); +} + +[[nodiscard]] bool fit_regression_model(Regression &r, std::vector ¶ms_init) { 
const auto stepsize = Regression::stepsize; const auto max_iter = Regression::max_iter; + get_cumulative(r.design.group_id, r.design.n_groups(), r.props.mc, r.cumul); + // one more than the number of factors const std::size_t n_params = r.n_factors() + 1; if (params_init.empty()) { @@ -180,7 +222,7 @@ fit_regression_model(Regression &r, std::vector ¶ms_init) { params_init.back() = -2.5; } - r.p_v.resize(r.n_samples()); + r.p_v.resize(r.n_groups()); const double tolerance = std::sqrt(n_params) * r.n_samples() * Regression::tolerance; @@ -204,7 +246,7 @@ fit_regression_model(Regression &r, std::vector ¶ms_init) { // - gsl_multimin_fdfminimizer_vector_bfgs2 // - gsl_multimin_fdfminimizer_steepest_descent const gsl_multimin_fdfminimizer_type *minimizer = - gsl_multimin_fdfminimizer_conjugate_fr; + gsl_multimin_fdfminimizer_conjugate_pr; gsl_multimin_fdfminimizer *s = gsl_multimin_fdfminimizer_alloc(minimizer, n_params); @@ -225,7 +267,6 @@ fit_regression_model(Regression &r, std::vector ¶ms_init) { // check status from gradient status = gsl_multimin_test_gradient(s->gradient, tolerance); } while (status == GSL_CONTINUE && ++iter < max_iter); - // ADS: What is a reasonable number of iterations? 
r.max_loglik = -1.0 * neg_loglik(s->x, &r); From 3b4a2a2b14d26e0d86ac20710a7191208e078c82 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 21:11:13 -0700 Subject: [PATCH 05/11] src/radmeth/radmeth_model.hpp: shortening some names --- src/radmeth/radmeth_model.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/radmeth/radmeth_model.hpp b/src/radmeth/radmeth_model.hpp index bf60620d..5b2641d8 100644 --- a/src/radmeth/radmeth_model.hpp +++ b/src/radmeth/radmeth_model.hpp @@ -25,7 +25,7 @@ #include #include -struct cumulative_counts { +struct cumul_counts { std::vector m_counts; std::vector r_counts; std::vector d_counts; @@ -94,7 +94,7 @@ struct Regression { SiteProp props; double max_loglik{}; - std::vector cumul; + std::vector cumul; std::vector p_v; // scratch space [[nodiscard]] std::size_t From a9631fe3d54988f5caecb750d301be2701046b1c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 21:15:52 -0700 Subject: [PATCH 06/11] src/radmeth/radmeth_optimize.cpp: cleaning up some code and tweaking for more optimization --- src/radmeth/radmeth_optimize.cpp | 185 ++++++++++++++----------------- 1 file changed, 86 insertions(+), 99 deletions(-) diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index e58f117d..c10b8cda 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -16,7 +16,6 @@ */ #include "radmeth_optimize.hpp" - #include "radmeth_model.hpp" #include @@ -26,9 +25,6 @@ #include #include -#include -#include - [[nodiscard]] static inline double logistic(const double x) { return 1.0 / (1.0 / std::exp(x) + 1.0); @@ -42,29 +38,25 @@ get_p(const std::vector &v, const gsl_vector *params) { } [[nodiscard]] static double -neg_loglik(const gsl_vector *params, void *object) { - Regression *reg = (Regression *)(object); +log_likelihood(const gsl_vector *params, const Regression ®) { + const auto phi = logistic(gsl_vector_get(params, 
reg.design.n_factors())); + const auto one_minus_phi = 1.0 - phi; - // ADS: dispersion param phi is the last element of parameter vector - const double disp_param = gsl_vector_get(params, reg->design.n_factors()); - const double phi = logistic(disp_param); - const double one_minus_phi = 1.0 - phi; + const auto n_groups = reg.n_groups(); + const auto &groups = reg.design.groups; + const auto &cumul = reg.cumul; - const auto n_groups = reg->n_groups(); - const auto &groups = reg->design.groups; - const auto &cumul = reg->cumul; - - double log_lik = 0; + double log_lik = 0.0; for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { - const double p = get_p(groups[g_idx], params); - const double one_minus_p = 1.0 - p; + const auto p = get_p(groups[g_idx], params); + const auto one_minus_p = 1.0 - p; - const double term1 = one_minus_phi * p; + const auto term1 = one_minus_phi * p; const auto &cumul_y = cumul[g_idx].m_counts; for (std::size_t k = 0; k < std::size(cumul_y); ++k) log_lik += cumul_y[k] * std::log(term1 + phi * k); - const double term2 = one_minus_phi * one_minus_p; + const auto term2 = one_minus_phi * one_minus_p; const auto &cumul_d = cumul[g_idx].d_counts; for (std::size_t k = 0; k < std::size(cumul_d); ++k) log_lik += cumul_d[k] * std::log(term2 + phi * k); @@ -73,83 +65,88 @@ neg_loglik(const gsl_vector *params, void *object) { for (std::size_t k = 0; k < std::size(cumul_n); ++k) log_lik -= cumul_n[k] * std::log1p(phi * (k - 1.0)); } - - return -log_lik; + return log_lik; } static void -neg_gradient(const gsl_vector *params, void *object, gsl_vector *output) { - Regression *reg = (Regression *)(object); +gradient(const gsl_vector *params, Regression ®, gsl_vector *output) { + const auto n_factors = reg.design.n_factors(); + const auto phi = logistic(gsl_vector_get(params, n_factors)); + const auto one_minus_phi = 1.0 - phi; - const auto n_groups = reg->n_groups(); - const auto &groups = reg->design.groups; - const auto &cumul = reg->cumul; + const auto 
n_groups = reg.n_groups(); + const auto &groups = reg.design.groups; + const auto &cumul = reg.cumul; - // ADS: using scratch space held in reg instead of allocating here - auto &p_v = reg->p_v; + auto &p_v = reg.p_v; // ADS: reusing scratch space for (auto g_idx = 0u; g_idx < n_groups; ++g_idx) p_v[g_idx] = get_p(groups[g_idx], params); - const std::size_t n_factors = reg->design.n_factors(); - const double disp_param = gsl_vector_get(params, n_factors); - - const double phi = logistic(disp_param); - const double one_minus_phi = 1.0 - phi; - // init output to zero for all factors gsl_vector_set_all(output, 0.0); + auto &data = output->data; for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { - const double p = p_v[g_idx]; - const double one_minus_p = 1.0 - p; - const double denom_term1 = one_minus_phi * p; - const double denom_term2 = one_minus_phi * one_minus_p; + const auto p = p_v[g_idx]; + const auto one_minus_p = 1.0 - p; - double term = 0; + double deriv = 0.0; + const auto denom_term1 = one_minus_phi * p; const auto &cumul_y = cumul[g_idx].m_counts; for (auto k = 0u; k < std::size(cumul_y); ++k) - term += cumul_y[k] / (denom_term1 + phi * k); + deriv += cumul_y[k] / (denom_term1 + phi * k); + const auto denom_term2 = one_minus_phi * one_minus_p; const auto &cumul_d = cumul[g_idx].d_counts; for (auto k = 0u; k < std::size(cumul_d); ++k) - term -= cumul_d[k] / (denom_term2 + phi * k); + deriv -= cumul_d[k] / (denom_term2 + phi * k); const auto &g = groups[g_idx]; - - for (std::size_t factor_idx = 0; factor_idx < n_factors; ++factor_idx) { - const auto level = g[factor_idx]; + for (auto fact_idx = 0u; fact_idx < n_factors; ++fact_idx) { + const auto level = g[fact_idx]; if (level == 0) continue; - const double factor = denom_term1 * one_minus_p * level; - double deriv = gsl_vector_get(output, factor_idx); - deriv += term * factor; - gsl_vector_set(output, factor_idx, deriv); + data[fact_idx] += deriv * (denom_term1 * one_minus_p * level); } } - double 
deriv = 0; - for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { - const double p = get_p(groups[g_idx], params); - const double one_minus_p = 1.0 - p; + double deriv = 0.0; + auto cumul_itr = std::cbegin(cumul); + for (auto g_idx = 0u; g_idx < n_groups; ++g_idx, ++cumul_itr) { + const auto p = get_p(groups[g_idx], params); + const auto one_minus_p = 1.0 - p; - const double term1 = one_minus_phi * p; - const auto &cumul_y = cumul[g_idx].m_counts; - for (auto k = 0u; k < std::size(cumul_y); ++k) + const auto term1 = one_minus_phi * p; + const auto &cumul_y = cumul_itr->m_counts; + const auto y_lim = std::size(cumul_y); + for (auto k = 0u; k < y_lim; ++k) deriv += cumul_y[k] * (k - p) / (term1 + phi * k); - const double term2 = one_minus_phi * one_minus_p; - const auto &cumul_d = cumul[g_idx].d_counts; - for (auto k = 0u; k < std::size(cumul_d); ++k) + const auto term2 = one_minus_phi * one_minus_p; + const auto &cumul_d = cumul_itr->d_counts; + const auto d_lim = std::size(cumul_d); + for (auto k = 0u; k < d_lim; ++k) deriv += cumul_d[k] * (k - one_minus_p) / (term2 + phi * k); - const auto &cumul_n = cumul[g_idx].r_counts; - for (std::size_t k = 0; k < std::size(cumul_n); ++k) + const auto &cumul_n = cumul_itr->r_counts; + const auto n_lim = std::size(cumul_n); + for (auto k = 0u; k < n_lim; ++k) deriv -= cumul_n[k] * (k - 1.0) / (1.0 + phi * (k - 1.0)); } - deriv *= (phi * one_minus_phi); + gsl_vector_set(output, n_factors, deriv * (phi * one_minus_phi)); +} - gsl_vector_set(output, n_factors, deriv); +[[nodiscard]] static double +neg_loglik(const gsl_vector *params, void *object) { + auto reg = static_cast(object); + return -log_likelihood(params, *reg); +} + +static void +neg_gradient(const gsl_vector *params, void *object, gsl_vector *output) { + auto reg = static_cast(object); + gradient(params, *reg, output); gsl_vector_scale(output, -1.0); } @@ -163,12 +160,12 @@ neg_loglik_and_grad(const gsl_vector *params, void *object, double *loglik_val, static void 
get_cumulative(const std::vector &group_id, const std::uint32_t n_groups, const std::vector &mc, - std::vector &cumul) { + std::vector &cumul) { const auto n_cols = std::size(mc); cumul.clear(); cumul.resize(n_groups); - const auto compute_cumulative = [&](auto get_value, auto get_vector) { + const auto comp_cumul = [&](auto get_value, auto get_vector) { // phase 1: determine max value for each group for (auto g_idx = 0u; g_idx < n_groups; ++g_idx) { std::uint32_t max_v{}; @@ -192,24 +189,22 @@ get_cumulative(const std::vector &group_id, } }; // call the lambda 3 times for m_counts, r_counts, d_counts - compute_cumulative([](const mcounts &m) { return m.n_meth; }, - [](cumulative_counts &c) -> std::vector & { - return c.m_counts; - }); - - compute_cumulative([](const mcounts &m) { return m.n_reads; }, - [](cumulative_counts &c) -> std::vector & { - return c.r_counts; - }); - - compute_cumulative([](const mcounts &m) { return m.n_reads - m.n_meth; }, - [](cumulative_counts &c) -> std::vector & { - return c.d_counts; - }); + comp_cumul( + [](const mcounts &m) { return m.n_meth; }, + [](cumul_counts &c) -> std::vector & { return c.m_counts; }); + + comp_cumul( + [](const mcounts &m) { return m.n_reads; }, + [](cumul_counts &c) -> std::vector & { return c.r_counts; }); + + comp_cumul( + [](const mcounts &m) { return m.n_reads - m.n_meth; }, + [](cumul_counts &c) -> std::vector & { return c.d_counts; }); } [[nodiscard]] bool fit_regression_model(Regression &r, std::vector ¶ms_init) { + static constexpr auto init_dispersion_param = -2.5; const auto stepsize = Regression::stepsize; const auto max_iter = Regression::max_iter; @@ -219,19 +214,14 @@ fit_regression_model(Regression &r, std::vector ¶ms_init) { const std::size_t n_params = r.n_factors() + 1; if (params_init.empty()) { params_init.resize(n_params, 0.0); - params_init.back() = -2.5; + params_init.back() = init_dispersion_param; } - - r.p_v.resize(r.n_groups()); - - const double tolerance = - std::sqrt(n_params) * 
r.n_samples() * Regression::tolerance; - - if (params_init.size() != n_params) + if (std::size(params_init) != n_params) throw std::runtime_error("Wrong number of initial parameters."); - + r.p_v.resize(r.n_groups()); + const auto tol = std::sqrt(n_params) * r.n_samples() * Regression::tolerance; // clang-format off - gsl_multimin_function_fdf loglik_bundle = { + auto loglik_bundle = gsl_multimin_function_fdf{ &neg_loglik, // objective function &neg_gradient, // gradient &neg_loglik_and_grad, // combined obj and grad @@ -245,30 +235,27 @@ fit_regression_model(Regression &r, std::vector ¶ms_init) { // - gsl_multimin_fdfminimizer_conjugate_fr // - gsl_multimin_fdfminimizer_vector_bfgs2 // - gsl_multimin_fdfminimizer_steepest_descent - const gsl_multimin_fdfminimizer_type *minimizer = - gsl_multimin_fdfminimizer_conjugate_pr; + const auto minimizer = gsl_multimin_fdfminimizer_conjugate_pr; + auto s = gsl_multimin_fdfminimizer_alloc(minimizer, n_params); - gsl_multimin_fdfminimizer *s = - gsl_multimin_fdfminimizer_alloc(minimizer, n_params); - - gsl_vector *params = gsl_vector_alloc(n_params); - for (std::size_t i = 0; i < n_params; ++i) + auto params = gsl_vector_alloc(n_params); + for (auto i = 0u; i < n_params; ++i) gsl_vector_set(params, i, params_init[i]); - gsl_multimin_fdfminimizer_set(s, &loglik_bundle, params, stepsize, tolerance); + gsl_multimin_fdfminimizer_set(s, &loglik_bundle, params, stepsize, tol); int status = 0; std::size_t iter = 0; - do { status = gsl_multimin_fdfminimizer_iterate(s); // one iter and get status if (status) break; // check status from gradient - status = gsl_multimin_test_gradient(s->gradient, tolerance); + status = gsl_multimin_test_gradient(s->gradient, tol); } while (status == GSL_CONTINUE && ++iter < max_iter); - r.max_loglik = -1.0 * neg_loglik(s->x, &r); + const auto param_estimates = gsl_multimin_fdfminimizer_x(s); + r.max_loglik = log_likelihood(param_estimates, r); gsl_multimin_fdfminimizer_free(s); gsl_vector_free(params); 
From 45220731bd83ae1d3b3a196e67b8dd3735da2c96 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 10 Aug 2025 16:00:17 -0700 Subject: [PATCH 07/11] src/radmeth/radmeth.cpp: no longer iterating over chunks because consecutive sites of extreme coverage can cause an imbalance in terms of work done by threads. Also using a decimal digit of precision in the progress reporting --- src/radmeth/radmeth.cpp | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index e4a89aa3..97ea9ad7 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -37,20 +37,25 @@ #include struct file_progress { - double one_hundred_over_filesize{}; + double one_thousand_over_filesize{}; std::size_t prev_offset{}; explicit file_progress(const std::string &filename) : - one_hundred_over_filesize{100.0 / std::filesystem::file_size(filename)} {} + one_thousand_over_filesize{1000.0 / std::filesystem::file_size(filename)} {} void operator()(std::ifstream &in) { const std::size_t curr_offset = - in.eof() ? 100 : in.tellg() * one_hundred_over_filesize; + in.eof() ? 1000 : in.tellg() * one_thousand_over_filesize; if (curr_offset <= prev_offset) return; - std::cerr << "\r[progress: " << std::setw(3) << curr_offset - << (curr_offset == 100 ? "%]\n" : "%]"); - prev_offset = (curr_offset == 100) ? std::numeric_limits::max() - : curr_offset; + std::ios old_state(nullptr); + old_state.copyfmt(std::cerr); + std::cerr << "\r[progress: " << std::setw(5) << std::fixed + << std::setprecision(1) << (curr_offset / 10.0) + << (curr_offset == 1000 ? "%]\n" : "%]"); + std::cerr.copyfmt(old_state); + prev_offset = (curr_offset == 1000) + ? 
std::numeric_limits::max() + : curr_offset; } }; @@ -162,6 +167,9 @@ drop_idx(const std::vector &v, const std::size_t idx_to_drop) { return u; } +/// ADS: this function is not currently used, as the threads do not operate in +/// "chunks" +/* static inline void get_chunk_bounds(const std::uint32_t n_elements, const std::uint32_t n_chunks, std::vector> &chunks) { @@ -175,6 +183,7 @@ get_chunk_bounds(const std::uint32_t n_elements, const std::uint32_t n_chunks, block_start = block_end; } } +*/ static void radmeth(const bool show_progress, const bool more_na_info, @@ -224,7 +233,7 @@ that the design matrix and the proportion table are correctly formatted. std::vector alt_models(n_threads, alt_model); std::vector null_models(n_threads, null_model); - std::vector> chunks(n_threads); + // std::vector> chunks(n_threads); // Iterate over rows in the file and do llr test on proportions from // each. Do this in sets of rows to avoid having to spawn too many threads. @@ -235,15 +244,21 @@ that the design matrix and the proportion table are correctly formatted. 
if (n_lines == 0) break; - get_chunk_bounds(n_lines, n_threads, chunks); + /// ADS: chunks not used + // get_chunk_bounds(n_lines, n_threads, chunks); std::vector threads; for (auto thread_id = 0u; thread_id < n_threads; ++thread_id) { threads.emplace_back([&, thread_id] { - const auto &[chunk_beg, chunk_end] = chunks[thread_id]; + /// ADS: chunks not used + // const auto &[chunk_beg, chunk_end] = chunks[thread_id]; auto &t_alt_model = alt_models[thread_id]; auto &t_null_model = null_models[thread_id]; - for (auto b = chunk_beg; b != chunk_end; ++b) { + /// ADS: chunks not used + // for (auto b = chunk_beg; b != chunk_end; ++b) { + for (auto b = 0u; b < n_lines; ++b) { + if (b % n_threads != thread_id) + continue; t_alt_model.props.parse(lines[b]); if (t_alt_model.props_size() != n_samples) throw std::runtime_error("found row with wrong number of columns"); From 23166b5153404f81669ce957ee754b4b21592ec1 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 10 Aug 2025 17:10:43 -0700 Subject: [PATCH 08/11] src/radmeth/radmeth_model.hpp: adding a vector to Regression struct to cache some log1p calculations --- src/radmeth/radmeth_model.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/radmeth/radmeth_model.hpp b/src/radmeth/radmeth_model.hpp index 5b2641d8..8775ff5a 100644 --- a/src/radmeth/radmeth_model.hpp +++ b/src/radmeth/radmeth_model.hpp @@ -94,8 +94,10 @@ struct Regression { SiteProp props; double max_loglik{}; + // scratch space std::vector cumul; - std::vector p_v; // scratch space + std::vector p_v; + std::vector log1p_fact_v; [[nodiscard]] std::size_t n_factors() const { From 771d0c87b4f2658bd8dfc21565855b816ae45da6 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 10 Aug 2025 17:11:29 -0700 Subject: [PATCH 09/11] src/radmeth/radmeth_optimize.cpp: adding a caching strategy for computing some log1p that can be reused across groups --- src/radmeth/radmeth_optimize.cpp | 33 ++++++++++++++++++++++++++------ 1 file changed, 
27 insertions(+), 6 deletions(-) diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index c10b8cda..4839c229 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -1,8 +1,6 @@ -/* Copyright (C) 2013-2025 University of Southern California and - * Andrew D Smith +/* Copyright (C) 2013-2025 Andrew D Smith * * Author: Andrew D. Smith - * Contributors: Egor Dolzhenko and Guilherme Sena * * This program is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -21,6 +19,7 @@ #include #include +#include #include #include #include @@ -37,8 +36,24 @@ get_p(const std::vector &v, const gsl_vector *params) { return logistic(std::inner_product(a, a + std::size(v), params->data, 0.0)); } +static inline auto +cache_log1p_factors(Regression ®, const double phi) { + const auto &cumul = reg.cumul; + const auto max_itr = std::max_element( + std::cbegin(cumul), std::cend(cumul), [](const auto &a, const auto &b) { + return std::size(a.r_counts) < std::size(b.r_counts); + }); + const std::size_t max_k = std::size(max_itr->r_counts); + auto &cache = reg.log1p_fact_v; + // ADS: avoid the realloc that can happen even for resize(smaller_size) + if (max_k > std::size(cache)) + cache.resize(max_k, 0.0); + for (std::size_t k = 0; k < max_k; ++k) + cache[k] = std::log1p(phi * (k - 1.0)); +} + [[nodiscard]] static double -log_likelihood(const gsl_vector *params, const Regression ®) { +log_likelihood(const gsl_vector *params, Regression ®) { const auto phi = logistic(gsl_vector_get(params, reg.design.n_factors())); const auto one_minus_phi = 1.0 - phi; @@ -46,6 +61,11 @@ log_likelihood(const gsl_vector *params, const Regression ®) { const auto &groups = reg.design.groups; const auto &cumul = reg.cumul; + // ADS: precompute the log1p(phi * (k - 1.0)) values, which are reused for + // each group. 
+ cache_log1p_factors(reg, phi); + const auto &log1p_fact_v = reg.log1p_fact_v; + double log_lik = 0.0; for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { const auto p = get_p(groups[g_idx], params); @@ -63,7 +83,7 @@ log_likelihood(const gsl_vector *params, const Regression ®) { const auto &cumul_n = cumul[g_idx].r_counts; for (std::size_t k = 0; k < std::size(cumul_n); ++k) - log_lik -= cumul_n[k] * std::log1p(phi * (k - 1.0)); + log_lik -= cumul_n[k] * log1p_fact_v[k]; } return log_lik; } @@ -103,11 +123,12 @@ gradient(const gsl_vector *params, Regression ®, gsl_vector *output) { deriv -= cumul_d[k] / (denom_term2 + phi * k); const auto &g = groups[g_idx]; + const auto denom_term1_one_minus_p = denom_term1 * one_minus_p; for (auto fact_idx = 0u; fact_idx < n_factors; ++fact_idx) { const auto level = g[fact_idx]; if (level == 0) continue; - data[fact_idx] += deriv * (denom_term1 * one_minus_p * level); + data[fact_idx] += deriv * (denom_term1_one_minus_p * level); } } From d9f88710033c105cc4679c0348f8ef69bc7308fd Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 10 Aug 2025 17:29:01 -0700 Subject: [PATCH 10/11] src/radmeth/radmeth.cpp: merge --- src/radmeth/radmeth.cpp | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index 97ea9ad7..e4a89aa3 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -37,25 +37,20 @@ #include struct file_progress { - double one_thousand_over_filesize{}; + double one_hundred_over_filesize{}; std::size_t prev_offset{}; explicit file_progress(const std::string &filename) : - one_thousand_over_filesize{1000.0 / std::filesystem::file_size(filename)} {} + one_hundred_over_filesize{100.0 / std::filesystem::file_size(filename)} {} void operator()(std::ifstream &in) { const std::size_t curr_offset = - in.eof() ? 1000 : in.tellg() * one_thousand_over_filesize; + in.eof() ? 
100 : in.tellg() * one_hundred_over_filesize; if (curr_offset <= prev_offset) return; - std::ios old_state(nullptr); - old_state.copyfmt(std::cerr); - std::cerr << "\r[progress: " << std::setw(5) << std::fixed - << std::setprecision(1) << (curr_offset / 10.0) - << (curr_offset == 1000 ? "%]\n" : "%]"); - std::cerr.copyfmt(old_state); - prev_offset = (curr_offset == 1000) - ? std::numeric_limits::max() - : curr_offset; + std::cerr << "\r[progress: " << std::setw(3) << curr_offset + << (curr_offset == 100 ? "%]\n" : "%]"); + prev_offset = (curr_offset == 100) ? std::numeric_limits::max() + : curr_offset; } }; @@ -167,9 +162,6 @@ drop_idx(const std::vector &v, const std::size_t idx_to_drop) { return u; } -/// ADS: this function is not currently used, as the threads do not operate in -/// "chunks" -/* static inline void get_chunk_bounds(const std::uint32_t n_elements, const std::uint32_t n_chunks, std::vector> &chunks) { @@ -183,7 +175,6 @@ get_chunk_bounds(const std::uint32_t n_elements, const std::uint32_t n_chunks, block_start = block_end; } } -*/ static void radmeth(const bool show_progress, const bool more_na_info, @@ -233,7 +224,7 @@ that the design matrix and the proportion table are correctly formatted. std::vector alt_models(n_threads, alt_model); std::vector null_models(n_threads, null_model); - // std::vector> chunks(n_threads); + std::vector> chunks(n_threads); // Iterate over rows in the file and do llr test on proportions from // each. Do this in sets of rows to avoid having to spawn too many threads. @@ -244,21 +235,15 @@ that the design matrix and the proportion table are correctly formatted. 
if (n_lines == 0) break; - /// ADS: chunks not used - // get_chunk_bounds(n_lines, n_threads, chunks); + get_chunk_bounds(n_lines, n_threads, chunks); std::vector threads; for (auto thread_id = 0u; thread_id < n_threads; ++thread_id) { threads.emplace_back([&, thread_id] { - /// ADS: chunks not used - // const auto &[chunk_beg, chunk_end] = chunks[thread_id]; + const auto &[chunk_beg, chunk_end] = chunks[thread_id]; auto &t_alt_model = alt_models[thread_id]; auto &t_null_model = null_models[thread_id]; - /// ADS: chunks not used - // for (auto b = chunk_beg; b != chunk_end; ++b) { - for (auto b = 0u; b < n_lines; ++b) { - if (b % n_threads != thread_id) - continue; + for (auto b = chunk_beg; b != chunk_end; ++b) { t_alt_model.props.parse(lines[b]); if (t_alt_model.props_size() != n_samples) throw std::runtime_error("found row with wrong number of columns"); From cb88503b1eec6c4a718dccb27de94eb17e05cc73 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 19:54:23 -0700 Subject: [PATCH 11/11] src/radmeth/radmeth_optimize.cpp: implementing a new algorithm for computing the beta-binomial likelihood and gradients. This algorithm accumulates mass without the use of unstable gamma functions but minimizes the number of summations by re-grouping terms. Doing this required a focus on groups rather than factors and samples, where a group is a set of factor level combinations --- src/radmeth/radmeth_optimize.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index 4839c229..5d103c6f 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -168,7 +168,6 @@ static void neg_gradient(const gsl_vector *params, void *object, gsl_vector *output) { auto reg = static_cast(object); gradient(params, *reg, output); - gsl_vector_scale(output, -1.0); } static void