From ef03d1089bb8e8acdce3c852f78330054924a736 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 19:38:59 -0700 Subject: [PATCH 01/11] src/radmeth/radmeth.cpp: moving some functions out of this source file and into the radmeth_model.chpp sources --- src/radmeth/radmeth.cpp | 162 +++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 87 deletions(-) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index 0fa8fcbe..e4a89aa3 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -1,9 +1,7 @@ -/* Copyright (C) 2013-2023 University of Southern California and - * Egor Dolzhenko - * Andrew D Smith - * Guilherme Sena +/* Copyright (C) 2013-2025 Andrew D Smith * - * Authors: Andrew D. Smith and Egor Dolzhenko and Guilherme Sena + * Author: Andrew D. Smith + * Contributors: Andrew D. Smith and Egor Dolzhenko and Guilherme Sena * * This program is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -16,6 +14,15 @@ * General Public License for more details. 
*/ +#include "radmeth_model.hpp" +#include "radmeth_optimize.hpp" + +// smithlab headers +#include "GenomicRegion.hpp" +#include "OptionParser.hpp" +#include "smithlab_os.hpp" +#include "smithlab_utils.hpp" + #include // GSL header for chisqrd distribution #include @@ -29,20 +36,6 @@ #include #include -// smithlab headers -#include "GenomicRegion.hpp" -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" - -#include "radmeth_model.hpp" -#include "radmeth_optimize.hpp" - -using std::begin; -using std::cout; -using std::distance; -using std::end; - struct file_progress { double one_hundred_over_filesize{}; std::size_t prev_offset{}; @@ -50,24 +43,17 @@ struct file_progress { one_hundred_over_filesize{100.0 / std::filesystem::file_size(filename)} {} void operator()(std::ifstream &in) { - const std::size_t curr_offset = in.tellg() * one_hundred_over_filesize; + const std::size_t curr_offset = + in.eof() ? 100 : in.tellg() * one_hundred_over_filesize; if (curr_offset <= prev_offset) return; - std::cerr << "[progress: " << std::setw(3) << curr_offset - << (curr_offset == 100 ? "%]\n" : "%]\r"); + std::cerr << "\r[progress: " << std::setw(3) << curr_offset + << (curr_offset == 100 ? "%]\n" : "%]"); prev_offset = (curr_offset == 100) ? std::numeric_limits::max() : curr_offset; } }; -static void -remove_factor(Design &design, const std::size_t factor_idx) { - design.factor_names.erase(std::begin(design.factor_names) + factor_idx); - for (std::size_t i = 0; i < design.n_samples(); ++i) - design.matrix[i].erase(std::begin(design.matrix[i]) + factor_idx); -} - -/***************** RADMETH ALGORITHM *****************/ static bool consistent_sample_names(const Regression ®, const std::string &header) { std::istringstream iss(header); @@ -84,7 +70,7 @@ consistent_sample_names(const Regression ®, const std::string &header) { // function outputs the p-value of the log-likelihood ratio. 
*Note* that it is // assumed that the reduced model has one fewer factor than the full model. static double -llr_test(double null_loglik, double full_loglik) { +llr_test(const double null_loglik, const double full_loglik) { // The log-likelihood ratio statistic. const double log_lik_stat = -2 * (null_loglik - full_loglik); @@ -100,16 +86,15 @@ } static bool -has_low_coverage(const Regression &reg, const std::size_t test_fac) { +has_low_coverage(const Regression &reg, const std::size_t test_factor) { bool cvrd_in_test_fact_smpls = false; + const auto &tcol = reg.design.tmatrix[test_factor]; for (std::size_t i = 0; i < reg.n_samples() && !cvrd_in_test_fact_smpls; ++i) - cvrd_in_test_fact_smpls = - (reg.design.matrix[i][test_fac] == 1 && reg.props.mc[i].n_reads != 0); + cvrd_in_test_fact_smpls = (tcol[i] == 1 && reg.props.mc[i].n_reads != 0); bool cvrd_in_other_smpls = false; for (std::size_t i = 0; i < reg.n_samples() && !cvrd_in_other_smpls; ++i) - cvrd_in_other_smpls = - (reg.design.matrix[i][test_fac] != 1 && reg.props.mc[i].n_reads != 0); + cvrd_in_other_smpls = (tcol[i] != 1 && reg.props.mc[i].n_reads != 0); return !cvrd_in_test_fact_smpls || !cvrd_in_other_smpls; } @@ -131,9 +116,9 @@ has_extreme_counts(const Regression &reg) { [[nodiscard]] static bool has_two_values(const Regression &reg, const std::size_t test_factor) { - const auto &v = reg.design.tmatrix[test_factor]; - for (auto i = std::cbegin(v); i != std::cend(v); ++i) - if (*i != v[0]) + const auto &tcol = reg.design.tmatrix[test_factor]; + for (const auto x : tcol) + if (x != tcol[0]) return true; return false; } @@ -145,25 +130,19 @@ get_test_factor_idx(const Regression &model, const std::string &test_factor) { std::find(std::cbegin(factors), std::cend(factors), test_factor); if (itr == std::cend(factors)) - throw std::runtime_error("Factor not part of design: " + test_factor); + throw std::runtime_error("factor not part of design: " + test_factor); return
std::distance(std::cbegin(factors), itr); } -static void -read_design(const bool verbose, const std::string &design_filename, - Design &design) { - if (verbose) - std::cerr << "design table filename: " << design_filename << std::endl; +[[nodiscard]] static Design +read_design(const std::string &design_filename) { std::ifstream design_file(design_filename); if (!design_file) throw std::runtime_error("could not open file: " + design_filename); - - // initialize full design matrix from file + Design design; design_file >> design; - - if (verbose) - std::cerr << design << std::endl; + return design; } enum class row_status : std::uint8_t { @@ -174,11 +153,11 @@ enum class row_status : std::uint8_t { }; [[nodiscard]] static std::vector -drop_idx(const std::vector &v, const std::size_t to_drop) { +drop_idx(const std::vector &v, const std::size_t idx_to_drop) { std::vector u; u.reserve(std::size(v) - 1); for (auto i = 0u; i < std::size(v); ++i) - if (i != to_drop) + if (i != idx_to_drop) u.push_back(v[i]); return u; } @@ -218,14 +197,18 @@ radmeth(const bool show_progress, const bool more_na_info, std::string sample_names_header; std::getline(table_file, sample_names_header); - if (!consistent_sample_names(alt_model, sample_names_header)) - throw std::runtime_error( - "header:\n" + sample_names_header + "\n" + - "does not match factor names or their order in the\n" - "design matrix. Check that the design matrix and\n" - "the proportion table are correctly formatted."); + if (!consistent_sample_names(alt_model, sample_names_header)) { + // clang-format off + const auto message = + R"( +does not match factor names or their order in the design matrix. Check +that the design matrix and the proportion table are correctly formatted. 
+)"; + // clang-format on + throw std::runtime_error("header:\n" + sample_names_header + message); + } - const std::size_t n_samples = alt_model.design.n_samples(); + const std::size_t n_samples = alt_model.n_samples(); std::ofstream out(outfile); if (!out) @@ -266,9 +249,9 @@ radmeth(const bool show_progress, const bool more_na_info, throw std::runtime_error("found row with wrong number of columns"); const auto [p_val, status] = [&]() -> std::tuple { - // Skip the test if (1) no coverage in all cases or in all controls, - // or (2) the site is completely methylated or completely - // unmethylated across all samples. + // Skip the test if (1) no coverage in all cases or in all + // controls, or (2) the site is completely methylated or + // completely unmethylated across all samples. if (has_low_coverage(t_alt_model, test_factor_idx)) return std::tuple{1.0, row_status::na_low_cov}; @@ -308,11 +291,12 @@ radmeth(const bool show_progress, const bool more_na_info, n_bytes[b] = [&] { // clang-format off - const int n_prefix_bytes = std::sprintf(bufs[b].data(), prefix_fmt, - t_alt_model.props.chrom.data(), - t_alt_model.props.position, - t_alt_model.props.strand, - t_alt_model.props.context.data()); + const int n_prefix_bytes = + std::sprintf(bufs[b].data(), prefix_fmt, + t_alt_model.props.chrom.data(), + t_alt_model.props.position, + t_alt_model.props.strand, + t_alt_model.props.context.data()); // clang-format on if (n_prefix_bytes < 0) return n_prefix_bytes; @@ -336,11 +320,11 @@ radmeth(const bool show_progress, const bool more_na_info, cursor += n_pval_bytes; // clang-format off - const int n_suffix_bytes = std::sprintf(cursor, suffix_fmt, - n_reads_factor, - n_meth_factor, - n_reads_others, - n_meth_others); + const int n_suffix_bytes = std::sprintf(cursor, suffix_fmt, + n_reads_factor, + n_meth_factor, + n_reads_others, + n_meth_others); // clang-format on if (n_suffix_bytes < 0) return n_suffix_bytes; @@ -366,12 +350,11 @@ radmeth(const bool show_progress, const 
bool more_na_info, if (n_lines < n_threads) break; } + + if (show_progress) + progress(table_file); } -/*********************************************************************** - * Run beta-binoimial regression using the specified table with - * proportions and design matrix - */ int main_radmeth(int argc, char *argv[]) { try { @@ -431,31 +414,36 @@ main_radmeth(int argc, char *argv[]) { const std::string table_filename(leftover_args.back()); /****************** END COMMAND LINE OPTIONS *****************/ + if (verbose) + std::cerr << "design table filename: " << design_filename << '\n'; + // initialize full design matrix from file - Regression orig_alt_model; - read_design(verbose, design_filename, orig_alt_model.design); + Regression alt_model; + alt_model.design = read_design(design_filename); + if (verbose) + std::cerr << "Alternate model:\n" << alt_model.design << '\n'; // Check that provided test factor name exists and find its index. - const auto test_factor_idx = - get_test_factor_idx(orig_alt_model, test_factor); + const auto test_factor_idx = get_test_factor_idx(alt_model, test_factor); // verify that the design includes more than one level for the // test factor - if (!has_two_values(orig_alt_model, test_factor_idx)) { - const auto first_level = orig_alt_model.design.matrix[0][test_factor_idx]; + if (!has_two_values(alt_model, test_factor_idx)) { + const auto first_level = alt_model.design.matrix[0][test_factor_idx]; throw std::runtime_error("test factor only one level: " + test_factor + ", " + std::to_string(first_level)); } - Regression orig_null_model; - orig_null_model.design = orig_alt_model.design; - remove_factor(orig_null_model.design, test_factor_idx); + Regression null_model = alt_model; + null_model.design = alt_model.design.drop_factor(test_factor_idx); + if (verbose) + std::cerr << "Null model:\n" << null_model.design << '\n'; radmeth(show_progress, more_na_info, n_threads, table_filename, outfile, - orig_alt_model, orig_null_model, 
test_factor_idx); + alt_model, null_model, test_factor_idx); } catch (const std::exception &e) { - std::cerr << "ERROR: " << e.what() << std::endl; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 0f1c05b90c60af5f998007eb837ef8550d6644e4 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 19:39:32 -0700 Subject: [PATCH 02/11] src/radmeth/radmeth_model.hpp: adding cumulative counts for the new algorithm to calculate the beta-binomial likelihood --- src/radmeth/radmeth_model.hpp | 41 ++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/radmeth/radmeth_model.hpp b/src/radmeth/radmeth_model.hpp index d7a01d63..bf60620d 100644 --- a/src/radmeth/radmeth_model.hpp +++ b/src/radmeth/radmeth_model.hpp @@ -25,19 +25,37 @@ #include #include +struct cumulative_counts { + std::vector m_counts; + std::vector r_counts; + std::vector d_counts; +}; + struct Design { std::vector factor_names; std::vector sample_names; std::vector> matrix; std::vector> tmatrix; - std::size_t + std::vector> groups; + std::vector group_id; + + [[nodiscard]] std::size_t n_factors() const { - return factor_names.size(); + return std::size(factor_names); + } + + [[nodiscard]] std::size_t + n_groups() const { + return std::size(groups); } - std::size_t + + [[nodiscard]] std::size_t n_samples() const { - return sample_names.size(); + return std::size(sample_names); } + + [[nodiscard]] Design + drop_factor(const std::size_t factor_idx); }; std::istream & @@ -56,7 +74,7 @@ operator>>(std::istream &in, mcounts &rm) { return in >> rm.n_reads >> rm.n_meth; } -struct SiteProportions { +struct SiteProp { std::string chrom; std::size_t position{}; char strand{}; @@ -68,13 +86,15 @@ struct SiteProportions { }; struct Regression { - static double tolerance; // 1e-4; + static double tolerance; // 1e-3; static double stepsize; // 0.001; - static std::uint32_t max_iter; // 700; + static std::uint32_t max_iter; // 250; 
Design design; - SiteProportions props; + SiteProp props; double max_loglik{}; + + std::vector cumul; std::vector p_v; // scratch space [[nodiscard]] std::size_t @@ -82,6 +102,11 @@ struct Regression { return design.n_factors(); } + [[nodiscard]] std::size_t + n_groups() const { + return design.n_groups(); + } + [[nodiscard]] std::size_t props_size() const { return std::size(props.mc); From 6ac4a904c2a6c86fd7192b0ea87a79d9975c17f0 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 19:40:28 -0700 Subject: [PATCH 03/11] src/radmeth/radmeth_model.cpp: added functions to make groups and to get the mapping from sample to group; also put some functions here that were previously in radmeth.cpp because they belong with the model structs --- src/radmeth/radmeth_model.cpp | 49 +++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/src/radmeth/radmeth_model.cpp b/src/radmeth/radmeth_model.cpp index faa4a8bb..a4b6ef6f 100644 --- a/src/radmeth/radmeth_model.cpp +++ b/src/radmeth/radmeth_model.cpp @@ -29,7 +29,7 @@ double Regression::stepsize = 0.001; std::uint32_t Regression::max_iter = 250; void -SiteProportions::parse(const std::string &line) { +SiteProp::parse(const std::string &line) { const auto first_ws = line.find_first_of(" \t"); // Parse the row name (must be like: "chr:position:strand:context") @@ -105,6 +105,27 @@ SiteProportions::parse(const std::string &line) { throw std::runtime_error("failed to parse counts from:\n" + line); } +static void +make_groups(Design &design) { + auto s = design.matrix; + std::sort(std::begin(s), std::end(s)); + s.erase(std::unique(std::begin(s), std::end(s)), std::end(s)); + design.groups = std::move(s); +} + +static void +assign_groups(Design &design) { + const auto &matrix = design.matrix; + const auto &s = design.groups; + const auto n_samples = design.n_samples(); + auto &group_id = design.group_id; + group_id.resize(n_samples); + for (auto i = 0u; i < n_samples; ++i) { + 
const auto x = std::find(std::cbegin(s), std::cend(s), matrix[i]); + group_id[i] = std::distance(std::cbegin(s), x); + } +} + template static void transpose(const std::vector> &mat, @@ -152,10 +173,33 @@ operator>>(std::istream &is, Design &design) { } transpose(design.matrix, design.tmatrix); + make_groups(design); + assign_groups(design); return is; } +[[nodiscard]] Design +Design::drop_factor(const std::size_t factor_idx) { + // clang-format off + Design design{ + factor_names, + sample_names, + matrix, + tmatrix, + groups, + group_id, + }; + // clang-format on + design.factor_names.erase(std::begin(design.factor_names) + factor_idx); + for (auto i = 0u; i < n_samples(); ++i) + design.matrix[i].erase(std::begin(design.matrix[i]) + factor_idx); + transpose(design.matrix, design.tmatrix); + make_groups(design); + assign_groups(design); + return design; +} + std::ostream & operator<<(std::ostream &out, const Design &design) { for (std::size_t factor = 0; factor < design.factor_names.size(); ++factor) { @@ -168,8 +212,9 @@ operator<<(std::ostream &out, const Design &design) { for (std::size_t i = 0; i < design.n_samples(); ++i) { out << design.sample_names[i]; for (std::size_t j = 0; j < design.n_factors(); ++j) - out << "\t" << design.matrix[i][j]; + out << "\t" << static_cast(design.matrix[i][j]); out << "\n"; } + return out; } From 5cfdb63b90f644a421f3005478b5e1db8f7e968c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 19:54:23 -0700 Subject: [PATCH 04/11] src/radmeth/radmeth_optimize.cpp: implementing a new algorithm for computing the beta-binomial likelihood and gradients. This algorithm accumulates mass without the use of unstable gamma functions but minimizes the number of summations by re-grouping terms. 
Doing this required a focus on groups rather than factors and samples, where a group is a set of factor level combinations --- src/radmeth/radmeth_optimize.cpp | 221 ++++++++++++++++++------------- 1 file changed, 131 insertions(+), 90 deletions(-) diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index 8affe87e..e58f117d 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -1,9 +1,8 @@ -/* Copyright (C) 2013-2023 University of Southern California and - * Egor Dolzhenko +/* Copyright (C) 2013-2025 University of Southern California and * Andrew D Smith - * Guilherme Sena * - * Authors: Andrew D. Smith and Egor Dolzhenko and Guilherme Sena + * Author: Andrew D. Smith + * Contributors: Egor Dolzhenko and Guilherme Sena * * This program is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -23,140 +22,133 @@ #include #include -#include -#include #include #include #include +#include +#include + +[[nodiscard]] static inline double +logistic(const double x) { + return 1.0 / (1.0 / std::exp(x) + 1.0); +} + template [[nodiscard]] static double -pi(const std::vector &v, const gsl_vector *params) { - // ADS: this function doesn't have a very helpful name +get_p(const std::vector &v, const gsl_vector *params) { const auto a = v.data(); - const double dot = std::inner_product(a, a + std::size(v), params->data, 0.0); - // const double p = std::exp(dot) / (1.0 + std::exp(dot)); - return 1.0 / (1.0 / std::exp(dot) + 1.0); + return logistic(std::inner_product(a, a + std::size(v), params->data, 0.0)); } [[nodiscard]] static double neg_loglik(const gsl_vector *params, void *object) { Regression *reg = (Regression *)(object); - // ADS: the dispersion parameter phi is the last element of - // parameter vector + // ADS: dispersion param phi is the last element of parameter vector const double disp_param = gsl_vector_get(params, reg->design.n_factors()); - // 
const double phi = std::exp(disp_param) / (1.0 + std::exp(disp_param)); - const double phi = 1.0 / (1.0 / std::exp(disp_param) + 1.0); + const double phi = logistic(disp_param); const double one_minus_phi = 1.0 - phi; - const auto &mc = reg->props.mc; - const auto &mat = reg->design.matrix; + const auto n_groups = reg->n_groups(); + const auto &groups = reg->design.groups; + const auto &cumul = reg->cumul; double log_lik = 0; - const std::size_t n_samples = reg->design.n_samples(); - for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { - const auto n = mc[col_idx].n_reads; - const auto y = mc[col_idx].n_meth; - const double p = pi(mat[col_idx], params); + for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { + const double p = get_p(groups[g_idx], params); const double one_minus_p = 1.0 - p; const double term1 = one_minus_phi * p; - for (auto k = 0u; k < y; ++k) - log_lik += std::log(term1 + phi * k); + const auto &cumul_y = cumul[g_idx].m_counts; + for (std::size_t k = 0; k < std::size(cumul_y); ++k) + log_lik += cumul_y[k] * std::log(term1 + phi * k); const double term2 = one_minus_phi * one_minus_p; - for (auto k = 0u; k < n - y; ++k) - log_lik += std::log(term2 + phi * k); + const auto &cumul_d = cumul[g_idx].d_counts; + for (std::size_t k = 0; k < std::size(cumul_d); ++k) + log_lik += cumul_d[k] * std::log(term2 + phi * k); - for (auto k = 0u; k < n; ++k) - // log_lik -= std::log(1.0 + phi * (k - 1.0)); - log_lik -= std::log1p(phi * (k - 1.0)); + const auto &cumul_n = cumul[g_idx].r_counts; + for (std::size_t k = 0; k < std::size(cumul_n); ++k) + log_lik -= cumul_n[k] * std::log1p(phi * (k - 1.0)); } + return -log_lik; } static void neg_gradient(const gsl_vector *params, void *object, gsl_vector *output) { Regression *reg = (Regression *)(object); - const std::size_t n_samples = reg->design.n_samples(); - const std::size_t n_factors = reg->design.n_factors(); - /// ADS: using scratch space held in reg instead of allocating here - // std::vector 
p_v(n_samples, 0.0); + const auto n_groups = reg->n_groups(); + const auto &groups = reg->design.groups; + const auto &cumul = reg->cumul; + + // ADS: using scratch space held in reg instead of allocating here auto &p_v = reg->p_v; - for (auto col_idx = 0u; col_idx < n_samples; ++col_idx) - p_v[col_idx] = pi(reg->design.matrix[col_idx], params); + for (auto g_idx = 0u; g_idx < n_groups; ++g_idx) + p_v[g_idx] = get_p(groups[g_idx], params); + const std::size_t n_factors = reg->design.n_factors(); const double disp_param = gsl_vector_get(params, n_factors); - const auto &tmat = reg->design.tmatrix; - // const double phi = std::exp(disp_param) / (1.0 + std::exp(disp_param)); - const double phi = 1.0 / (1.0 / std::exp(disp_param) + 1.0); + const double phi = logistic(disp_param); const double one_minus_phi = 1.0 - phi; - const auto &mc = reg->props.mc; - - for (std::size_t factor_idx = 0; factor_idx < n_factors; ++factor_idx) { - double deriv = 0; - const auto &vec = tmat[factor_idx]; - for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { - const auto n = mc[col_idx].n_reads; - const auto y = mc[col_idx].n_meth; - const double p = p_v[col_idx]; - const double one_minus_p = 1.0 - p; + // init output to zero for all factors + gsl_vector_set_all(output, 0.0); - const double denom_term1 = one_minus_phi * p; - const double factor = denom_term1 * one_minus_p * vec[col_idx]; - if (factor == 0) - continue; + for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { + const double p = p_v[g_idx]; + const double one_minus_p = 1.0 - p; + const double denom_term1 = one_minus_phi * p; + const double denom_term2 = one_minus_phi * one_minus_p; - double term = 0; + double term = 0; - const auto accum_term = [&](auto k, const auto lim, - const double denom_base) { - for (; k < lim; ++k) - term += 1.0 / (denom_base + phi * k); - }; + const auto &cumul_y = cumul[g_idx].m_counts; + for (auto k = 0u; k < std::size(cumul_y); ++k) + term += cumul_y[k] / (denom_term1 + phi * k); - 
const auto accum_subtract_term = [&](auto k, const auto lim, - const double denom_base) { - for (; k < lim; ++k) - term -= 1.0 / (denom_base + phi * k); - }; + const auto &cumul_d = cumul[g_idx].d_counts; + for (auto k = 0u; k < std::size(cumul_d); ++k) + term -= cumul_d[k] / (denom_term2 + phi * k); - accum_term(0u, y, denom_term1); - accum_subtract_term(0u, n - y, one_minus_phi * one_minus_p); + const auto &g = groups[g_idx]; + for (std::size_t factor_idx = 0; factor_idx < n_factors; ++factor_idx) { + const auto level = g[factor_idx]; + if (level == 0) + continue; + const double factor = denom_term1 * one_minus_p * level; + double deriv = gsl_vector_get(output, factor_idx); deriv += term * factor; + gsl_vector_set(output, factor_idx, deriv); } - gsl_vector_set(output, factor_idx, deriv); } double deriv = 0; - for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { - const auto n = mc[col_idx].n_reads; - const auto y = mc[col_idx].n_meth; - const double p = p_v[col_idx]; + for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { + const double p = get_p(groups[g_idx], params); const double one_minus_p = 1.0 - p; - double term = 0; - const auto accum_term = [&](auto k, const auto lim, const double num_shift, - const double denom_base) { - for (; k < lim; ++k) - term += (k - num_shift) / (denom_base + phi * k); - }; - const auto accum_subtract_term = [&](auto k, const auto lim) { - for (; k < lim; ++k) - term -= (k - 1.0) / (1.0 + phi * (k - 1.0)); - }; - - accum_term(0u, y, p, one_minus_phi * p); - accum_term(0u, n - y, one_minus_p, one_minus_phi * one_minus_p); - accum_subtract_term(0u, n); - - deriv += term * phi * one_minus_phi; + const double term1 = one_minus_phi * p; + const auto &cumul_y = cumul[g_idx].m_counts; + for (auto k = 0u; k < std::size(cumul_y); ++k) + deriv += cumul_y[k] * (k - p) / (term1 + phi * k); + + const double term2 = one_minus_phi * one_minus_p; + const auto &cumul_d = cumul[g_idx].d_counts; + for (auto k = 0u; k < 
std::size(cumul_d); ++k) + deriv += cumul_d[k] * (k - one_minus_p) / (term2 + phi * k); + + const auto &cumul_n = cumul[g_idx].r_counts; + for (std::size_t k = 0; k < std::size(cumul_n); ++k) + deriv -= cumul_n[k] * (k - 1.0) / (1.0 + phi * (k - 1.0)); } + deriv *= (phi * one_minus_phi); + gsl_vector_set(output, n_factors, deriv); gsl_vector_scale(output, -1.0); } @@ -168,11 +160,61 @@ neg_loglik_and_grad(const gsl_vector *params, void *object, double *loglik_val, neg_gradient(params, object, d_loglik_val); } -bool +static void +get_cumulative(const std::vector &group_id, + const std::uint32_t n_groups, const std::vector &mc, + std::vector &cumul) { + const auto n_cols = std::size(mc); + cumul.clear(); + cumul.resize(n_groups); + + const auto compute_cumulative = [&](auto get_value, auto get_vector) { + // phase 1: determine max value for each group + for (auto g_idx = 0u; g_idx < n_groups; ++g_idx) { + std::uint32_t max_v{}; + for (auto c_idx = 0u; c_idx < n_cols; ++c_idx) { + if (group_id[c_idx] == g_idx) { + const auto val = get_value(mc[c_idx]); + if (val > max_v) + max_v = val; + } + } + get_vector(cumul[g_idx]).resize(max_v, 0); + } + + // phase 2: fill cumulative counts + for (auto c_idx = 0u; c_idx < n_cols; ++c_idx) { + const auto g_idx = group_id[c_idx]; + const auto val = get_value(mc[c_idx]); + auto &vec = get_vector(cumul[g_idx]); + for (auto i = 0u; i < val; ++i) + vec[i]++; + } + }; + // call the lambda 3 times for m_counts, r_counts, d_counts + compute_cumulative([](const mcounts &m) { return m.n_meth; }, + [](cumulative_counts &c) -> std::vector & { + return c.m_counts; + }); + + compute_cumulative([](const mcounts &m) { return m.n_reads; }, + [](cumulative_counts &c) -> std::vector & { + return c.r_counts; + }); + + compute_cumulative([](const mcounts &m) { return m.n_reads - m.n_meth; }, + [](cumulative_counts &c) -> std::vector & { + return c.d_counts; + }); +} + +[[nodiscard]] bool fit_regression_model(Regression &r, std::vector ¶ms_init) { 
const auto stepsize = Regression::stepsize; const auto max_iter = Regression::max_iter; + get_cumulative(r.design.group_id, r.design.n_groups(), r.props.mc, r.cumul); + // one more than the number of factors const std::size_t n_params = r.n_factors() + 1; if (params_init.empty()) { @@ -180,7 +222,7 @@ fit_regression_model(Regression &r, std::vector ¶ms_init) { params_init.back() = -2.5; } - r.p_v.resize(r.n_samples()); + r.p_v.resize(r.n_groups()); const double tolerance = std::sqrt(n_params) * r.n_samples() * Regression::tolerance; @@ -204,7 +246,7 @@ fit_regression_model(Regression &r, std::vector ¶ms_init) { // - gsl_multimin_fdfminimizer_vector_bfgs2 // - gsl_multimin_fdfminimizer_steepest_descent const gsl_multimin_fdfminimizer_type *minimizer = - gsl_multimin_fdfminimizer_conjugate_fr; + gsl_multimin_fdfminimizer_conjugate_pr; gsl_multimin_fdfminimizer *s = gsl_multimin_fdfminimizer_alloc(minimizer, n_params); @@ -225,7 +267,6 @@ fit_regression_model(Regression &r, std::vector ¶ms_init) { // check status from gradient status = gsl_multimin_test_gradient(s->gradient, tolerance); } while (status == GSL_CONTINUE && ++iter < max_iter); - // ADS: What is a reasonable number of iterations? 
r.max_loglik = -1.0 * neg_loglik(s->x, &r); From 3b4a2a2b14d26e0d86ac20710a7191208e078c82 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 21:11:13 -0700 Subject: [PATCH 05/11] src/radmeth/radmeth_model.hpp: shortening some names --- src/radmeth/radmeth_model.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/radmeth/radmeth_model.hpp b/src/radmeth/radmeth_model.hpp index bf60620d..5b2641d8 100644 --- a/src/radmeth/radmeth_model.hpp +++ b/src/radmeth/radmeth_model.hpp @@ -25,7 +25,7 @@ #include #include -struct cumulative_counts { +struct cumul_counts { std::vector m_counts; std::vector r_counts; std::vector d_counts; @@ -94,7 +94,7 @@ struct Regression { SiteProp props; double max_loglik{}; - std::vector cumul; + std::vector cumul; std::vector p_v; // scratch space [[nodiscard]] std::size_t From a9631fe3d54988f5caecb750d301be2701046b1c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 21:15:52 -0700 Subject: [PATCH 06/11] src/radmeth/radmeth_optimize.cpp: cleaning up some code and tweaking for more optimization --- src/radmeth/radmeth_optimize.cpp | 185 ++++++++++++++----------------- 1 file changed, 86 insertions(+), 99 deletions(-) diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index e58f117d..c10b8cda 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -16,7 +16,6 @@ */ #include "radmeth_optimize.hpp" - #include "radmeth_model.hpp" #include @@ -26,9 +25,6 @@ #include #include -#include -#include - [[nodiscard]] static inline double logistic(const double x) { return 1.0 / (1.0 / std::exp(x) + 1.0); @@ -42,29 +38,25 @@ get_p(const std::vector &v, const gsl_vector *params) { } [[nodiscard]] static double -neg_loglik(const gsl_vector *params, void *object) { - Regression *reg = (Regression *)(object); +log_likelihood(const gsl_vector *params, const Regression ®) { + const auto phi = logistic(gsl_vector_get(params, 
reg.design.n_factors())); + const auto one_minus_phi = 1.0 - phi; - // ADS: dispersion param phi is the last element of parameter vector - const double disp_param = gsl_vector_get(params, reg->design.n_factors()); - const double phi = logistic(disp_param); - const double one_minus_phi = 1.0 - phi; + const auto n_groups = reg.n_groups(); + const auto &groups = reg.design.groups; + const auto &cumul = reg.cumul; - const auto n_groups = reg->n_groups(); - const auto &groups = reg->design.groups; - const auto &cumul = reg->cumul; - - double log_lik = 0; + double log_lik = 0.0; for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { - const double p = get_p(groups[g_idx], params); - const double one_minus_p = 1.0 - p; + const auto p = get_p(groups[g_idx], params); + const auto one_minus_p = 1.0 - p; - const double term1 = one_minus_phi * p; + const auto term1 = one_minus_phi * p; const auto &cumul_y = cumul[g_idx].m_counts; for (std::size_t k = 0; k < std::size(cumul_y); ++k) log_lik += cumul_y[k] * std::log(term1 + phi * k); - const double term2 = one_minus_phi * one_minus_p; + const auto term2 = one_minus_phi * one_minus_p; const auto &cumul_d = cumul[g_idx].d_counts; for (std::size_t k = 0; k < std::size(cumul_d); ++k) log_lik += cumul_d[k] * std::log(term2 + phi * k); @@ -73,83 +65,88 @@ neg_loglik(const gsl_vector *params, void *object) { for (std::size_t k = 0; k < std::size(cumul_n); ++k) log_lik -= cumul_n[k] * std::log1p(phi * (k - 1.0)); } - - return -log_lik; + return log_lik; } static void -neg_gradient(const gsl_vector *params, void *object, gsl_vector *output) { - Regression *reg = (Regression *)(object); +gradient(const gsl_vector *params, Regression ®, gsl_vector *output) { + const auto n_factors = reg.design.n_factors(); + const auto phi = logistic(gsl_vector_get(params, n_factors)); + const auto one_minus_phi = 1.0 - phi; - const auto n_groups = reg->n_groups(); - const auto &groups = reg->design.groups; - const auto &cumul = reg->cumul; + const auto 
n_groups = reg.n_groups(); + const auto &groups = reg.design.groups; + const auto &cumul = reg.cumul; - // ADS: using scratch space held in reg instead of allocating here - auto &p_v = reg->p_v; + auto &p_v = reg.p_v; // ADS: reusing scratch space for (auto g_idx = 0u; g_idx < n_groups; ++g_idx) p_v[g_idx] = get_p(groups[g_idx], params); - const std::size_t n_factors = reg->design.n_factors(); - const double disp_param = gsl_vector_get(params, n_factors); - - const double phi = logistic(disp_param); - const double one_minus_phi = 1.0 - phi; - // init output to zero for all factors gsl_vector_set_all(output, 0.0); + auto &data = output->data; for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { - const double p = p_v[g_idx]; - const double one_minus_p = 1.0 - p; - const double denom_term1 = one_minus_phi * p; - const double denom_term2 = one_minus_phi * one_minus_p; + const auto p = p_v[g_idx]; + const auto one_minus_p = 1.0 - p; - double term = 0; + double deriv = 0.0; + const auto denom_term1 = one_minus_phi * p; const auto &cumul_y = cumul[g_idx].m_counts; for (auto k = 0u; k < std::size(cumul_y); ++k) - term += cumul_y[k] / (denom_term1 + phi * k); + deriv += cumul_y[k] / (denom_term1 + phi * k); + const auto denom_term2 = one_minus_phi * one_minus_p; const auto &cumul_d = cumul[g_idx].d_counts; for (auto k = 0u; k < std::size(cumul_d); ++k) - term -= cumul_d[k] / (denom_term2 + phi * k); + deriv -= cumul_d[k] / (denom_term2 + phi * k); const auto &g = groups[g_idx]; - - for (std::size_t factor_idx = 0; factor_idx < n_factors; ++factor_idx) { - const auto level = g[factor_idx]; + for (auto fact_idx = 0u; fact_idx < n_factors; ++fact_idx) { + const auto level = g[fact_idx]; if (level == 0) continue; - const double factor = denom_term1 * one_minus_p * level; - double deriv = gsl_vector_get(output, factor_idx); - deriv += term * factor; - gsl_vector_set(output, factor_idx, deriv); + data[fact_idx] += deriv * (denom_term1 * one_minus_p * level); } } - double 
deriv = 0; - for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { - const double p = get_p(groups[g_idx], params); - const double one_minus_p = 1.0 - p; + double deriv = 0.0; + auto cumul_itr = std::cbegin(cumul); + for (auto g_idx = 0u; g_idx < n_groups; ++g_idx, ++cumul_itr) { + const auto p = get_p(groups[g_idx], params); + const auto one_minus_p = 1.0 - p; - const double term1 = one_minus_phi * p; - const auto &cumul_y = cumul[g_idx].m_counts; - for (auto k = 0u; k < std::size(cumul_y); ++k) + const auto term1 = one_minus_phi * p; + const auto &cumul_y = cumul_itr->m_counts; + const auto y_lim = std::size(cumul_y); + for (auto k = 0u; k < y_lim; ++k) deriv += cumul_y[k] * (k - p) / (term1 + phi * k); - const double term2 = one_minus_phi * one_minus_p; - const auto &cumul_d = cumul[g_idx].d_counts; - for (auto k = 0u; k < std::size(cumul_d); ++k) + const auto term2 = one_minus_phi * one_minus_p; + const auto &cumul_d = cumul_itr->d_counts; + const auto d_lim = std::size(cumul_d); + for (auto k = 0u; k < d_lim; ++k) deriv += cumul_d[k] * (k - one_minus_p) / (term2 + phi * k); - const auto &cumul_n = cumul[g_idx].r_counts; - for (std::size_t k = 0; k < std::size(cumul_n); ++k) + const auto &cumul_n = cumul_itr->r_counts; + const auto n_lim = std::size(cumul_n); + for (auto k = 0u; k < n_lim; ++k) deriv -= cumul_n[k] * (k - 1.0) / (1.0 + phi * (k - 1.0)); } - deriv *= (phi * one_minus_phi); + gsl_vector_set(output, n_factors, deriv * (phi * one_minus_phi)); +} - gsl_vector_set(output, n_factors, deriv); +[[nodiscard]] static double +neg_loglik(const gsl_vector *params, void *object) { + auto reg = static_cast(object); + return -log_likelihood(params, *reg); +} + +static void +neg_gradient(const gsl_vector *params, void *object, gsl_vector *output) { + auto reg = static_cast(object); + gradient(params, *reg, output); gsl_vector_scale(output, -1.0); } @@ -163,12 +160,12 @@ neg_loglik_and_grad(const gsl_vector *params, void *object, double *loglik_val, static void 
get_cumulative(const std::vector &group_id, const std::uint32_t n_groups, const std::vector &mc, - std::vector &cumul) { + std::vector &cumul) { const auto n_cols = std::size(mc); cumul.clear(); cumul.resize(n_groups); - const auto compute_cumulative = [&](auto get_value, auto get_vector) { + const auto comp_cumul = [&](auto get_value, auto get_vector) { // phase 1: determine max value for each group for (auto g_idx = 0u; g_idx < n_groups; ++g_idx) { std::uint32_t max_v{}; @@ -192,24 +189,22 @@ get_cumulative(const std::vector &group_id, } }; // call the lambda 3 times for m_counts, r_counts, d_counts - compute_cumulative([](const mcounts &m) { return m.n_meth; }, - [](cumulative_counts &c) -> std::vector & { - return c.m_counts; - }); - - compute_cumulative([](const mcounts &m) { return m.n_reads; }, - [](cumulative_counts &c) -> std::vector & { - return c.r_counts; - }); - - compute_cumulative([](const mcounts &m) { return m.n_reads - m.n_meth; }, - [](cumulative_counts &c) -> std::vector & { - return c.d_counts; - }); + comp_cumul( + [](const mcounts &m) { return m.n_meth; }, + [](cumul_counts &c) -> std::vector & { return c.m_counts; }); + + comp_cumul( + [](const mcounts &m) { return m.n_reads; }, + [](cumul_counts &c) -> std::vector & { return c.r_counts; }); + + comp_cumul( + [](const mcounts &m) { return m.n_reads - m.n_meth; }, + [](cumul_counts &c) -> std::vector & { return c.d_counts; }); } [[nodiscard]] bool fit_regression_model(Regression &r, std::vector ¶ms_init) { + static constexpr auto init_dispersion_param = -2.5; const auto stepsize = Regression::stepsize; const auto max_iter = Regression::max_iter; @@ -219,19 +214,14 @@ fit_regression_model(Regression &r, std::vector ¶ms_init) { const std::size_t n_params = r.n_factors() + 1; if (params_init.empty()) { params_init.resize(n_params, 0.0); - params_init.back() = -2.5; + params_init.back() = init_dispersion_param; } - - r.p_v.resize(r.n_groups()); - - const double tolerance = - std::sqrt(n_params) * 
r.n_samples() * Regression::tolerance; - - if (params_init.size() != n_params) + if (std::size(params_init) != n_params) throw std::runtime_error("Wrong number of initial parameters."); - + r.p_v.resize(r.n_groups()); + const auto tol = std::sqrt(n_params) * r.n_samples() * Regression::tolerance; // clang-format off - gsl_multimin_function_fdf loglik_bundle = { + auto loglik_bundle = gsl_multimin_function_fdf{ &neg_loglik, // objective function &neg_gradient, // gradient &neg_loglik_and_grad, // combined obj and grad @@ -245,30 +235,27 @@ fit_regression_model(Regression &r, std::vector ¶ms_init) { // - gsl_multimin_fdfminimizer_conjugate_fr // - gsl_multimin_fdfminimizer_vector_bfgs2 // - gsl_multimin_fdfminimizer_steepest_descent - const gsl_multimin_fdfminimizer_type *minimizer = - gsl_multimin_fdfminimizer_conjugate_pr; + const auto minimizer = gsl_multimin_fdfminimizer_conjugate_pr; + auto s = gsl_multimin_fdfminimizer_alloc(minimizer, n_params); - gsl_multimin_fdfminimizer *s = - gsl_multimin_fdfminimizer_alloc(minimizer, n_params); - - gsl_vector *params = gsl_vector_alloc(n_params); - for (std::size_t i = 0; i < n_params; ++i) + auto params = gsl_vector_alloc(n_params); + for (auto i = 0u; i < n_params; ++i) gsl_vector_set(params, i, params_init[i]); - gsl_multimin_fdfminimizer_set(s, &loglik_bundle, params, stepsize, tolerance); + gsl_multimin_fdfminimizer_set(s, &loglik_bundle, params, stepsize, tol); int status = 0; std::size_t iter = 0; - do { status = gsl_multimin_fdfminimizer_iterate(s); // one iter and get status if (status) break; // check status from gradient - status = gsl_multimin_test_gradient(s->gradient, tolerance); + status = gsl_multimin_test_gradient(s->gradient, tol); } while (status == GSL_CONTINUE && ++iter < max_iter); - r.max_loglik = -1.0 * neg_loglik(s->x, &r); + const auto param_estimates = gsl_multimin_fdfminimizer_x(s); + r.max_loglik = log_likelihood(param_estimates, r); gsl_multimin_fdfminimizer_free(s); gsl_vector_free(params); 
From 45220731bd83ae1d3b3a196e67b8dd3735da2c96 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 10 Aug 2025 16:00:17 -0700 Subject: [PATCH 07/11] src/radmeth/radmeth.cpp: no longer iterating over chunks because consecutive sites of extreme coverage can cause an imbalance in terms of work done by threads. Also using a decimal digit of precision in the progress reporting --- src/radmeth/radmeth.cpp | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index e4a89aa3..97ea9ad7 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -37,20 +37,25 @@ #include struct file_progress { - double one_hundred_over_filesize{}; + double one_thousand_over_filesize{}; std::size_t prev_offset{}; explicit file_progress(const std::string &filename) : - one_hundred_over_filesize{100.0 / std::filesystem::file_size(filename)} {} + one_thousand_over_filesize{1000.0 / std::filesystem::file_size(filename)} {} void operator()(std::ifstream &in) { const std::size_t curr_offset = - in.eof() ? 100 : in.tellg() * one_hundred_over_filesize; + in.eof() ? 1000 : in.tellg() * one_thousand_over_filesize; if (curr_offset <= prev_offset) return; - std::cerr << "\r[progress: " << std::setw(3) << curr_offset - << (curr_offset == 100 ? "%]\n" : "%]"); - prev_offset = (curr_offset == 100) ? std::numeric_limits::max() - : curr_offset; + std::ios old_state(nullptr); + old_state.copyfmt(std::cerr); + std::cerr << "\r[progress: " << std::setw(5) << std::fixed + << std::setprecision(1) << (curr_offset / 10.0) + << (curr_offset == 1000 ? "%]\n" : "%]"); + std::cerr.copyfmt(old_state); + prev_offset = (curr_offset == 1000) + ? 
std::numeric_limits::max() + : curr_offset; } }; @@ -162,6 +167,9 @@ drop_idx(const std::vector &v, const std::size_t idx_to_drop) { return u; } +/// ADS: this function is not currently used, as the threads do not operate in +/// "chunks" +/* static inline void get_chunk_bounds(const std::uint32_t n_elements, const std::uint32_t n_chunks, std::vector> &chunks) { @@ -175,6 +183,7 @@ get_chunk_bounds(const std::uint32_t n_elements, const std::uint32_t n_chunks, block_start = block_end; } } +*/ static void radmeth(const bool show_progress, const bool more_na_info, @@ -224,7 +233,7 @@ that the design matrix and the proportion table are correctly formatted. std::vector alt_models(n_threads, alt_model); std::vector null_models(n_threads, null_model); - std::vector> chunks(n_threads); + // std::vector> chunks(n_threads); // Iterate over rows in the file and do llr test on proportions from // each. Do this in sets of rows to avoid having to spawn too many threads. @@ -235,15 +244,21 @@ that the design matrix and the proportion table are correctly formatted. 
if (n_lines == 0) break; - get_chunk_bounds(n_lines, n_threads, chunks); + /// ADS: chunks not used + // get_chunk_bounds(n_lines, n_threads, chunks); std::vector threads; for (auto thread_id = 0u; thread_id < n_threads; ++thread_id) { threads.emplace_back([&, thread_id] { - const auto &[chunk_beg, chunk_end] = chunks[thread_id]; + /// ADS: chunks not used + // const auto &[chunk_beg, chunk_end] = chunks[thread_id]; auto &t_alt_model = alt_models[thread_id]; auto &t_null_model = null_models[thread_id]; - for (auto b = chunk_beg; b != chunk_end; ++b) { + /// ADS: chunks not used + // for (auto b = chunk_beg; b != chunk_end; ++b) { + for (auto b = 0u; b < n_lines; ++b) { + if (b % n_threads != thread_id) + continue; t_alt_model.props.parse(lines[b]); if (t_alt_model.props_size() != n_samples) throw std::runtime_error("found row with wrong number of columns"); From 23166b5153404f81669ce957ee754b4b21592ec1 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 10 Aug 2025 17:10:43 -0700 Subject: [PATCH 08/11] src/radmeth/radmeth_model.hpp: adding a vector to Regression struct to cache some log1p calculations --- src/radmeth/radmeth_model.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/radmeth/radmeth_model.hpp b/src/radmeth/radmeth_model.hpp index 5b2641d8..8775ff5a 100644 --- a/src/radmeth/radmeth_model.hpp +++ b/src/radmeth/radmeth_model.hpp @@ -94,8 +94,10 @@ struct Regression { SiteProp props; double max_loglik{}; + // scratch space std::vector cumul; - std::vector p_v; // scratch space + std::vector p_v; + std::vector log1p_fact_v; [[nodiscard]] std::size_t n_factors() const { From 771d0c87b4f2658bd8dfc21565855b816ae45da6 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 10 Aug 2025 17:11:29 -0700 Subject: [PATCH 09/11] src/radmeth/radmeth_optimize.cpp: adding a caching strategy for computing some log1p that can be reused across groups --- src/radmeth/radmeth_optimize.cpp | 33 ++++++++++++++++++++++++++------ 1 file changed, 
27 insertions(+), 6 deletions(-) diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index c10b8cda..4839c229 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -1,8 +1,6 @@ -/* Copyright (C) 2013-2025 University of Southern California and - * Andrew D Smith +/* Copyright (C) 2013-2025 Andrew D Smith * * Author: Andrew D. Smith - * Contributors: Egor Dolzhenko and Guilherme Sena * * This program is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -21,6 +19,7 @@ #include #include +#include #include #include #include @@ -37,8 +36,24 @@ get_p(const std::vector &v, const gsl_vector *params) { return logistic(std::inner_product(a, a + std::size(v), params->data, 0.0)); } +static inline auto +cache_log1p_factors(Regression ®, const double phi) { + const auto &cumul = reg.cumul; + const auto max_itr = std::max_element( + std::cbegin(cumul), std::cend(cumul), [](const auto &a, const auto &b) { + return std::size(a.r_counts) < std::size(b.r_counts); + }); + const std::size_t max_k = std::size(max_itr->r_counts); + auto &cache = reg.log1p_fact_v; + // ADS: avoid the realloc that can happen even for resize(smaller_size) + if (max_k > std::size(cache)) + cache.resize(max_k, 0.0); + for (std::size_t k = 0; k < max_k; ++k) + cache[k] = std::log1p(phi * (k - 1.0)); +} + [[nodiscard]] static double -log_likelihood(const gsl_vector *params, const Regression ®) { +log_likelihood(const gsl_vector *params, Regression ®) { const auto phi = logistic(gsl_vector_get(params, reg.design.n_factors())); const auto one_minus_phi = 1.0 - phi; @@ -46,6 +61,11 @@ log_likelihood(const gsl_vector *params, const Regression ®) { const auto &groups = reg.design.groups; const auto &cumul = reg.cumul; + // ADS: precompute the log1p(phi * (k - 1.0)) values, which are reused for + // each group. 
+ cache_log1p_factors(reg, phi); + const auto &log1p_fact_v = reg.log1p_fact_v; + double log_lik = 0.0; for (std::size_t g_idx = 0; g_idx < n_groups; ++g_idx) { const auto p = get_p(groups[g_idx], params); @@ -63,7 +83,7 @@ log_likelihood(const gsl_vector *params, const Regression ®) { const auto &cumul_n = cumul[g_idx].r_counts; for (std::size_t k = 0; k < std::size(cumul_n); ++k) - log_lik -= cumul_n[k] * std::log1p(phi * (k - 1.0)); + log_lik -= cumul_n[k] * log1p_fact_v[k]; } return log_lik; } @@ -103,11 +123,12 @@ gradient(const gsl_vector *params, Regression ®, gsl_vector *output) { deriv -= cumul_d[k] / (denom_term2 + phi * k); const auto &g = groups[g_idx]; + const auto denom_term1_one_minus_p = denom_term1 * one_minus_p; for (auto fact_idx = 0u; fact_idx < n_factors; ++fact_idx) { const auto level = g[fact_idx]; if (level == 0) continue; - data[fact_idx] += deriv * (denom_term1 * one_minus_p * level); + data[fact_idx] += deriv * (denom_term1_one_minus_p * level); } } From d9f88710033c105cc4679c0348f8ef69bc7308fd Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 10 Aug 2025 17:29:01 -0700 Subject: [PATCH 10/11] src/radmeth/radmeth.cpp: merge --- src/radmeth/radmeth.cpp | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index 97ea9ad7..e4a89aa3 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -37,25 +37,20 @@ #include struct file_progress { - double one_thousand_over_filesize{}; + double one_hundred_over_filesize{}; std::size_t prev_offset{}; explicit file_progress(const std::string &filename) : - one_thousand_over_filesize{1000.0 / std::filesystem::file_size(filename)} {} + one_hundred_over_filesize{100.0 / std::filesystem::file_size(filename)} {} void operator()(std::ifstream &in) { const std::size_t curr_offset = - in.eof() ? 1000 : in.tellg() * one_thousand_over_filesize; + in.eof() ? 
100 : in.tellg() * one_hundred_over_filesize; if (curr_offset <= prev_offset) return; - std::ios old_state(nullptr); - old_state.copyfmt(std::cerr); - std::cerr << "\r[progress: " << std::setw(5) << std::fixed - << std::setprecision(1) << (curr_offset / 10.0) - << (curr_offset == 1000 ? "%]\n" : "%]"); - std::cerr.copyfmt(old_state); - prev_offset = (curr_offset == 1000) - ? std::numeric_limits::max() - : curr_offset; + std::cerr << "\r[progress: " << std::setw(3) << curr_offset + << (curr_offset == 100 ? "%]\n" : "%]"); + prev_offset = (curr_offset == 100) ? std::numeric_limits::max() + : curr_offset; } }; @@ -167,9 +162,6 @@ drop_idx(const std::vector &v, const std::size_t idx_to_drop) { return u; } -/// ADS: this function is not currently used, as the threads do not operate in -/// "chunks" -/* static inline void get_chunk_bounds(const std::uint32_t n_elements, const std::uint32_t n_chunks, std::vector> &chunks) { @@ -183,7 +175,6 @@ get_chunk_bounds(const std::uint32_t n_elements, const std::uint32_t n_chunks, block_start = block_end; } } -*/ static void radmeth(const bool show_progress, const bool more_na_info, @@ -233,7 +224,7 @@ that the design matrix and the proportion table are correctly formatted. std::vector alt_models(n_threads, alt_model); std::vector null_models(n_threads, null_model); - // std::vector> chunks(n_threads); + std::vector> chunks(n_threads); // Iterate over rows in the file and do llr test on proportions from // each. Do this in sets of rows to avoid having to spawn too many threads. @@ -244,21 +235,15 @@ that the design matrix and the proportion table are correctly formatted. 
if (n_lines == 0) break; - /// ADS: chunks not used - // get_chunk_bounds(n_lines, n_threads, chunks); + get_chunk_bounds(n_lines, n_threads, chunks); std::vector threads; for (auto thread_id = 0u; thread_id < n_threads; ++thread_id) { threads.emplace_back([&, thread_id] { - /// ADS: chunks not used - // const auto &[chunk_beg, chunk_end] = chunks[thread_id]; + const auto &[chunk_beg, chunk_end] = chunks[thread_id]; auto &t_alt_model = alt_models[thread_id]; auto &t_null_model = null_models[thread_id]; - /// ADS: chunks not used - // for (auto b = chunk_beg; b != chunk_end; ++b) { - for (auto b = 0u; b < n_lines; ++b) { - if (b % n_threads != thread_id) - continue; + for (auto b = chunk_beg; b != chunk_end; ++b) { t_alt_model.props.parse(lines[b]); if (t_alt_model.props_size() != n_samples) throw std::runtime_error("found row with wrong number of columns"); From cb88503b1eec6c4a718dccb27de94eb17e05cc73 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 19:54:23 -0700 Subject: [PATCH 11/11] src/radmeth/radmeth_optimize.cpp: implementing a new algorithm for computing the beta-binomial likelihood and gradients. This algorithm accumulates mass without the use of unstable gamma functions but minimizes the number of summations by re-grouping terms. Doing this required a focus on groups rather than factors and samples, where a group is a set of factor level combinations --- src/radmeth/radmeth_optimize.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index 4839c229..5d103c6f 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -168,7 +168,6 @@ static void neg_gradient(const gsl_vector *params, void *object, gsl_vector *output) { auto reg = static_cast(object); gradient(params, *reg, output); - gsl_vector_scale(output, -1.0); } static void