diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp
index 97ea9ad7..8fe1835d 100644
--- a/src/radmeth/radmeth.cpp
+++ b/src/radmeth/radmeth.cpp
@@ -26,6 +26,7 @@
 #include <gsl/gsl_cdf.h>  // GSL header for chisqrd distribution
 
 #include <algorithm>
+#include <chrono>
 #include <cmath>
 #include <filesystem>
 #include <fstream>
@@ -36,6 +37,24 @@
 #include <thread>
 #include <vector>
 
+[[nodiscard]] static std::string
+format_duration(const std::chrono::duration<double> elapsed) {
+  static constexpr auto s_per_h = 3600;
+  static constexpr auto s_per_m = 60;
+  const double tot_s = elapsed.count();
+
+  // break down into hours, minutes, seconds
+  const std::uint32_t hours = tot_s / 3600;
+  const std::uint32_t minutes = (static_cast<int>(tot_s) % s_per_h) / s_per_m;
+  const double seconds = tot_s - (hours * s_per_h) - (minutes * s_per_m);
+
+  std::ostringstream oss;
+  oss << std::setfill('0') << std::setw(2) << hours << ":" << std::setfill('0')
+      << std::setw(2) << minutes << ":" << std::fixed << std::setprecision(2)
+      << std::setw(5) << seconds;
+  return oss.str();
+}
+
 struct file_progress {
   double one_thousand_over_filesize{};
   std::size_t prev_offset{};
@@ -190,7 +209,6 @@ radmeth(const bool show_progress, const bool more_na_info,
         const std::uint32_t n_threads, const std::string &table_filename,
         const std::string &outfile, const Regression &alt_model,
         const Regression &null_model, const std::uint32_t test_factor_idx) {
-  static constexpr auto prefix_fmt = "%s\t%ld\t%c\t%s\t";
   static constexpr auto suffix_fmt = "\t%ld\t%ld\t%ld\t%ld\n";
   static constexpr auto buf_size = 1024;
   static constexpr auto max_lines = 16384;
@@ -307,11 +325,8 @@ that the design matrix and the proportion table are correctly formatted.
           n_bytes[b] = [&] {
             // clang-format off
             const int n_prefix_bytes =
-              std::sprintf(bufs[b].data(), prefix_fmt,
-                           t_alt_model.props.chrom.data(),
-                           t_alt_model.props.position,
-                           t_alt_model.props.strand,
-                           t_alt_model.props.context.data());
+              std::sprintf(bufs[b].data(), "%s\t",
+                           t_alt_model.props.rowname.data());
             // clang-format on
             if (n_prefix_bytes < 0)
               return n_prefix_bytes;
@@ -454,8 +469,14 @@ main_radmeth(int argc, char *argv[]) {
     if (verbose)
       std::cerr << "Null model:\n" << null_model.design << '\n';
 
+    const auto start_time = std::chrono::steady_clock::now();
     radmeth(show_progress, more_na_info, n_threads, table_filename, outfile,
             alt_model, null_model, test_factor_idx);
+    const auto stop_time = std::chrono::steady_clock::now();
+
+    if (verbose)
+      std::cerr << "[total time: " << format_duration(stop_time - start_time)
+                << "]\n";
   }
   catch (const std::exception &e) {
     std::cerr << e.what() << '\n';
diff --git a/src/radmeth/radmeth_model.cpp b/src/radmeth/radmeth_model.cpp
index a4b6ef6f..98fb2b84 100644
--- a/src/radmeth/radmeth_model.cpp
+++ b/src/radmeth/radmeth_model.cpp
@@ -24,8 +24,8 @@
 #include <string>
 #include <vector>
 
-double Regression::tolerance = 1e-3;
-double Regression::stepsize = 0.001;
+double Regression::tolerance = 1e-4;
+double Regression::stepsize = 0.01;
 std::uint32_t Regression::max_iter = 250;
 
 void
@@ -33,45 +33,12 @@ SiteProp::parse(const std::string &line) {
   const auto first_ws = line.find_first_of(" \t");
 
   // Parse the row name (must be like: "chr:position:strand:context")
-  bool failed = false;
+  bool failed = (first_ws == std::string::npos);
 
-  const auto label_end = line.data() + first_ws;
   auto field_s = line.data();
-  auto field_e = std::find(field_s + 1, label_end, ':');
-  if (field_e == label_end)
-    failed = true;
-
-  {
-    const std::uint32_t d = std::distance(field_s, field_e);
-    chrom = std::string{field_s, d};
-  }
-
-  field_s = field_e + 1;
-  field_e = std::find(field_s + 1, label_end, ':');
-  failed = failed || (field_e == label_end);
-
-  {
-    const auto [ptr, ec] = std::from_chars(field_s, field_e, position);
-    failed = failed || (ptr == field_s);
-  }
-
-  field_s = field_e + 1;
-  field_e = std::find(field_s + 1, label_end, ':');
-  failed = failed || (field_e != field_s + 1 || field_e == label_end);
-
-  strand = *field_s;
-  failed = failed || (strand != '-' && strand != '+');
-
-  field_s = field_e + 1;
-  field_e = std::find_if(field_s + 1, label_end,
-                         [](const auto x) { return x == ' ' || x == '\t'; });
-  failed = failed || (field_e != label_end);
-
-  {
-    const std::uint32_t d = std::distance(field_s, field_e);
-    context = std::string{field_s, d};
-  }
-
+  auto field_e = line.data() + first_ws;
+  rowname = std::string{field_s, field_e};
+  std::replace(std::begin(rowname), std::end(rowname), ':', '\t');
   if (failed)
     throw std::runtime_error("failed to parse label from:\n" + line);
 
diff --git a/src/radmeth/radmeth_model.hpp b/src/radmeth/radmeth_model.hpp
index 8775ff5a..56b0dc30 100644
--- a/src/radmeth/radmeth_model.hpp
+++ b/src/radmeth/radmeth_model.hpp
@@ -75,10 +75,7 @@ operator>>(std::istream &in, mcounts &rm) {
 }
 
 struct SiteProp {
-  std::string chrom;
-  std::size_t position{};
-  char strand{};
-  std::string context;
+  std::string rowname;
   std::vector<mcounts> mc;
 
   void
@@ -97,7 +94,8 @@ struct Regression {
   // scratch space
   std::vector<cumul_counts> cumul;
   std::vector<double> p_v;
-  std::vector<double> log1p_fact_v;
+  std::vector<double> cache;
+  std::uint32_t max_r_count{};
 
   [[nodiscard]] std::size_t
   n_factors() const {
diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp
index 4839c229..3823e540 100644
--- a/src/radmeth/radmeth_optimize.cpp
+++ b/src/radmeth/radmeth_optimize.cpp
@@ -37,21 +37,34 @@ get_p(const std::vector<T> &v, const gsl_vector *params) {
 }
 
 static inline auto
-cache_log1p_factors(Regression &reg, const double phi) {
+set_max_r_count(Regression &reg) {
   const auto &cumul = reg.cumul;
   const auto max_itr = std::max_element(
     std::cbegin(cumul), std::cend(cumul), [](const auto &a, const auto &b) {
       return std::size(a.r_counts) < std::size(b.r_counts);
     });
-  const std::size_t max_k = std::size(max_itr->r_counts);
-  auto &cache = reg.log1p_fact_v;
+  reg.max_r_count = std::size(max_itr->r_counts);
   // ADS: avoid the realloc that can happen even for resize(smaller_size)
-  if (max_k > std::size(cache))
-    cache.resize(max_k, 0.0);
+  if (reg.max_r_count > std::size(reg.cache))
+    reg.cache.resize(reg.max_r_count);
+}
+
+static inline auto
+cache_log1p_factors(Regression &reg, const double phi) {
+  const std::size_t max_k = reg.max_r_count;
+  auto &cache = reg.cache;
   for (std::size_t k = 0; k < max_k; ++k)
     cache[k] = std::log1p(phi * (k - 1.0));
 }
 
+static inline auto
+cache_dispersion_effect(Regression &reg, const double phi) {
+  const std::size_t max_k = reg.max_r_count;
+  auto &cache = reg.cache;
+  for (std::size_t k = 0; k < max_k; ++k)
+    cache[k] = (k - 1.0) / (1.0 + phi * (k - 1.0));
+}
+
 [[nodiscard]] static double
 log_likelihood(const gsl_vector *params, Regression &reg) {
   const auto phi = logistic(gsl_vector_get(params, reg.design.n_factors()));
@@ -64,7 +77,7 @@ log_likelihood(const gsl_vector *params, Regression &reg) {
   // ADS: precompute the log1p(phi * (k - 1.0)) values, which are reused for
   // each group.
   cache_log1p_factors(reg, phi);
-  const auto &log1p_fact_v = reg.log1p_fact_v;
+  const auto &log1p_fact_v = reg.cache;
 
   double log_lik = 0.0;
   for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) {
@@ -102,10 +115,14 @@ gradient(const gsl_vector *params, Regression &reg, gsl_vector *output) {
   for (auto g_idx = 0u; g_idx < n_groups; ++g_idx)
     p_v[g_idx] = get_p(groups[g_idx], params);
 
+  cache_dispersion_effect(reg, phi);
+  const auto &dispersion_effect = reg.cache;  // (k-1)/(1 + phi(k-1))
+
   // init output to zero for all factors
   gsl_vector_set_all(output, 0.0);
   auto &data = output->data;
 
+  double disp_deriv = 0.0;
   for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) {
     const auto p = p_v[g_idx];
     const auto one_minus_p = 1.0 - p;
@@ -114,13 +131,26 @@ gradient(const gsl_vector *params, Regression &reg, gsl_vector *output) {
 
     const auto denom_term1 = one_minus_phi * p;
     const auto &cumul_y = cumul[g_idx].m_counts;
-    for (auto k = 0u; k < std::size(cumul_y); ++k)
-      deriv += cumul_y[k] / (denom_term1 + phi * k);
+    const auto y_lim = std::size(cumul_y);
+    for (auto k = 0u; k < y_lim; ++k) {
+      const auto common_factor = cumul_y[k] / (denom_term1 + phi * k);
+      deriv += common_factor;
+      disp_deriv += (k - p) * common_factor;
+    }
 
     const auto denom_term2 = one_minus_phi * one_minus_p;
     const auto &cumul_d = cumul[g_idx].d_counts;
-    for (auto k = 0u; k < std::size(cumul_d); ++k)
-      deriv -= cumul_d[k] / (denom_term2 + phi * k);
+    const auto d_lim = std::size(cumul_d);
+    for (auto k = 0u; k < d_lim; ++k) {
+      const auto common_factor = cumul_d[k] / (denom_term2 + phi * k);
+      deriv -= common_factor;
+      disp_deriv += (k - one_minus_p) * common_factor;
+    }
+
+    const auto &cumul_n = cumul[g_idx].r_counts;
+    const auto n_lim = std::size(cumul_n);
+    for (auto k = 0u; k < n_lim; ++k)
+      disp_deriv -= cumul_n[k] * dispersion_effect[k];
 
     const auto &g = groups[g_idx];
     const auto denom_term1_one_minus_p = denom_term1 * one_minus_p;
@@ -131,31 +161,7 @@ gradient(const gsl_vector *params, Regression &reg, gsl_vector *output) {
       data[fact_idx] += deriv * (denom_term1_one_minus_p * level);
     }
   }
-
-  double deriv = 0.0;
-  auto cumul_itr = std::cbegin(cumul);
-  for (auto g_idx = 0u; g_idx < n_groups; ++g_idx, ++cumul_itr) {
-    const auto p = get_p(groups[g_idx], params);
-    const auto one_minus_p = 1.0 - p;
-
-    const auto term1 = one_minus_phi * p;
-    const auto &cumul_y = cumul_itr->m_counts;
-    const auto y_lim = std::size(cumul_y);
-    for (auto k = 0u; k < y_lim; ++k)
-      deriv += cumul_y[k] * (k - p) / (term1 + phi * k);
-
-    const auto term2 = one_minus_phi * one_minus_p;
-    const auto &cumul_d = cumul_itr->d_counts;
-    const auto d_lim = std::size(cumul_d);
-    for (auto k = 0u; k < d_lim; ++k)
-      deriv += cumul_d[k] * (k - one_minus_p) / (term2 + phi * k);
-
-    const auto &cumul_n = cumul_itr->r_counts;
-    const auto n_lim = std::size(cumul_n);
-    for (auto k = 0u; k < n_lim; ++k)
-      deriv -= cumul_n[k] * (k - 1.0) / (1.0 + phi * (k - 1.0));
-  }
-  gsl_vector_set(output, n_factors, deriv * (phi * one_minus_phi));
+  gsl_vector_set(output, n_factors, disp_deriv * (phi * one_minus_phi));
 }
 
 [[nodiscard]] static double
@@ -230,6 +236,7 @@ fit_regression_model(Regression &r, std::vector<double> &params_init) {
   const auto max_iter = Regression::max_iter;
 
   get_cumulative(r.design.group_id, r.design.n_groups(), r.props.mc, r.cumul);
+  set_max_r_count(r);
 
   // one more than the number of factors
   const std::size_t n_params = r.n_factors() + 1;