From ffca731dc239ca90745e2b62ecc4c6209703b534 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 20 Jul 2025 19:57:49 -0700 Subject: [PATCH 01/24] Makefile.am: adding radmeth_model.cpp to sources --- Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.am b/Makefile.am index 7ebf1d31..068ffad6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -236,6 +236,7 @@ dnmtools_SOURCES += src/amrfinder/amrtester.cpp dnmtools_SOURCES += src/radmeth/dmr.cpp dnmtools_SOURCES += src/radmeth/methdiff.cpp dnmtools_SOURCES += src/radmeth/radmeth_optimize.cpp +dnmtools_SOURCES += src/radmeth/radmeth_model.cpp dnmtools_SOURCES += src/radmeth/radmeth.cpp dnmtools_SOURCES += src/radmeth/radmeth-adjust.cpp dnmtools_SOURCES += src/radmeth/radmeth-merge.cpp From c5d8645bc93edc321201057f0880b70b48522fbd Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 20 Jul 2025 19:58:09 -0700 Subject: [PATCH 02/24] src/analysis/nanopore.cpp: removing a debug variable --- src/analysis/nanopore.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index d8886c5b..6eb3ba53 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -760,7 +760,6 @@ consistent_existing_targets( return true; } -int counter{}; struct read_processor { static constexpr auto default_expected_basecall_model = "dna_r10.4.1_e8.2_400bps_hac@v4.3.0"; From 9fd69ad60f26fc1566f727b459090b54b8fc671e Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 20 Jul 2025 19:59:17 -0700 Subject: [PATCH 03/24] src/radmeth/radmeth_optimize.cpp: major rewrite for speed within the existing approach via general optmizers --- src/radmeth/radmeth_optimize.cpp | 265 +++++++++++++++++-------------- 1 file changed, 146 insertions(+), 119 deletions(-) diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index b1942328..e957cc6b 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -21,179 +21,206 @@ #include #include +#include +#include +#include #include #include -using std::vector; -using std::runtime_error; - -static double -pi(Regression *reg, size_t sample, const gsl_vector *params) { +[[nodiscard]] static double +pi(const std::vector &v, const gsl_vector *params) { // ADS: this function doesn't have a very helpful name - double dot_prod = 0; - for (size_t fact = 0; fact < reg->design.n_factors(); ++fact) - dot_prod += reg->design.matrix[sample][fact]*gsl_vector_get(params, fact); - - double p = exp(dot_prod)/(1 + exp(dot_prod)); - - return p; + const auto a = v.data(); + const double dot = std::inner_product(a, a + std::size(v), params->data, 0.0); + // const double p = -1.0 / std::expm1(-dot_prod); ? + return std::exp(dot) / (1.0 + std::exp(dot)); } - -static double +[[nodiscard]] static double neg_loglik(const gsl_vector *params, void *object) { Regression *reg = (Regression *)(object); - const size_t n_params = reg->design.factor_names.size() + 1; - - double log_lik = 0; // ADS: the dispersion parameter phi is the last element of // parameter vector - const double disp_param = gsl_vector_get(params, n_params - 1); - const double phi = exp(disp_param)/(1.0 + exp(disp_param)); - - for(size_t s = 0; s < reg->design.n_samples(); ++s) { - const double n_s = reg->props.total[s]; - const double y_s = reg->props.meth[s]; - const double p_s = pi(reg, s, params); - - for (int k = 0; k < y_s; ++k) - log_lik += log((1 - phi)*p_s + phi*k); + const double disp_param = gsl_vector_get(params, reg->design.n_factors()); + const double phi = std::exp(disp_param) / (1.0 + std::exp(disp_param)); + const double one_minus_phi = 1.0 - phi; - for (int k = 0; k < n_s - y_s; ++k) - log_lik += log((1 - phi)*(1 - p_s) + phi*k); + const auto &mc = reg->props.mc; - for (int k = 0; k < n_s; ++k) - log_lik -= log(1.0 + phi*(k - 1)); + double log_lik = 0; + const std::size_t n_samples = reg->design.n_samples(); + for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { + const double n = mc[col_idx].n_reads; + const double y = mc[col_idx].n_meth; + const double p = pi(reg->design.matrix[col_idx], params); + const double one_minus_p = 1.0 - p; + + const double term1 = one_minus_phi * p; + for (int k = 0; k < y; ++k) + log_lik += std::log(term1 + phi * k); + + const double term2 = one_minus_phi * one_minus_p; + for (int k = 0; k < n - y; ++k) + log_lik += std::log(term2 + phi * k); + + for (int k = 0; k < n; ++k) + log_lik -= std::log(1.0 + phi * (k - 1.0)); } - return -log_lik; } - static void -neg_gradient(const gsl_vector *params, void *object, - gsl_vector *output) { - +neg_gradient(const gsl_vector *params, void *object, gsl_vector *output) { Regression *reg = (Regression *)(object); - const size_t n_params = reg->design.n_factors() + 1; + const std::size_t n_samples = reg->design.n_samples(); + const std::size_t n_factors = reg->design.n_factors(); - const double disp_param = gsl_vector_get(params, n_params - 1); + std::vector p_v(n_samples, 0.0); + for (auto col_idx = 0u; col_idx < n_samples; ++col_idx) + p_v[col_idx] = pi(reg->design.matrix[col_idx], params); - const double phi = exp(disp_param)/(1 + exp(disp_param)); + const double disp_param = gsl_vector_get(params, n_factors); - for(size_t f = 0; f < n_params; ++f) { + const auto &tmatrix = reg->design.tmatrix; + const double phi = std::exp(disp_param) / (1.0 + std::exp(disp_param)); + // const double phi = -1.0 / std::expm1(-disp_param); + const double one_minus_phi = 1.0 - phi; - double deriv = 0; + const auto &mc = reg->props.mc; - for(size_t s = 0; s < reg->design.n_samples(); ++s) { - int n_s = reg->props.total[s]; - int y_s = reg->props.meth[s]; - double p_s = pi(reg, s, params); + for (std::size_t f = 0; f < n_factors; ++f) { + double deriv = 0; + const auto &vec = tmatrix[f]; + for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { + const double n = mc[col_idx].n_reads; + const double y = mc[col_idx].n_meth; + const double p = p_v[col_idx]; + const double one_minus_p = 1.0 - p; + + const double denom_term1 = one_minus_phi * p; + const double factor = denom_term1 * one_minus_p * vec[col_idx]; + if (factor == 0) + continue; double term = 0; - //a parameter linked to p - if(f < reg->design.factor_names.size()) { - double factor = (1 - phi)*p_s*(1 - p_s)*reg->design.matrix[s][f]; - if (factor == 0) continue; - - for(int k = 0; k < y_s; ++k) - term += 1/((1 - phi)*p_s + phi*k); + const auto accumulate_term = [&](int k, const int lim, + const double denom_base) { + for (; k < lim; ++k) + term += 1.0 / (denom_base + phi * k); + }; - for(int k = 0; k < n_s - y_s; ++k) - term -= 1/((1 - phi)*(1 - p_s) + phi*k); + const auto accumulate_subtract_term = [&](int k, const int lim, + const double denom_base) { + for (; k < lim; ++k) + term -= 1.0 / (denom_base + phi * k); + }; - deriv += term*factor; - } else { // the parameter linked to phi - for(int k = 0; k < y_s; ++k) - term += (k - p_s)/((1 - phi)*p_s + phi*k); + accumulate_term(0, y, denom_term1); + accumulate_subtract_term(0, n - y, one_minus_phi * one_minus_p); - for(int k = 0; k < n_s - y_s; ++k) - term += (k - (1 - p_s))/((1 - phi)*(1 - p_s) + phi*k); - - for(int k = 0; k < n_s; ++k) { - term -= (k - 1)/(1 + phi*(k - 1)); - } - - deriv += term * phi * (1 - phi); - } + deriv += term * factor; } - gsl_vector_set(output, f, deriv); } + double deriv = 0; + for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { + const double n = mc[col_idx].n_reads; + const double y = mc[col_idx].n_meth; + const double p = p_v[col_idx]; + const double one_minus_p = 1.0 - p; + + double term = 0; + const auto accumulate_term = [&](int k, const int lim, + const double num_shift, + const double denom_base) { + for (; k < lim; ++k) + term += (k - num_shift) / (denom_base + phi * k); + }; + const auto accumulate_subtract_term = [&](int k, const int lim) { + for (; k < lim; ++k) + term -= (k - 1.0) / (1.0 + phi * (k - 1.0)); + }; + + accumulate_term(0, y, p, one_minus_phi * p); + accumulate_term(0, n - y, one_minus_p, one_minus_phi * one_minus_p); + accumulate_subtract_term(0, n); + + deriv += term * phi * one_minus_phi; + } + gsl_vector_set(output, n_factors, deriv); gsl_vector_scale(output, -1.0); } static void -neg_loglik_and_grad(const gsl_vector *params, - void *object, - double *loglik_val, +neg_loglik_and_grad(const gsl_vector *params, void *object, double *loglik_val, gsl_vector *d_loglik_val) { - *loglik_val = neg_loglik(params, object); neg_gradient(params, object, d_loglik_val); } - bool -fit_regression_model(Regression &r, vector &initial_params) { +fit_regression_model(Regression &r, std::vector ¶ms_init) { + const auto tolerance = Regression::tolerance; + const auto stepsize = Regression::stepsize; + const auto max_iter = Regression::max_iter; // one more than the number of factors - const size_t n_params = r.n_factors() + 1; - if (initial_params.empty()) { - initial_params.resize(n_params, 0.0); - initial_params.back() = -2.5; + const std::size_t n_params = r.n_factors() + 1; + if (params_init.empty()) { + params_init.resize(n_params, 0.0); + params_init.back() = -2.5; } - if (initial_params.size() != n_params) - throw runtime_error("Wrong number of initial parameters."); - - int status = 0; + if (params_init.size() != n_params) + throw std::runtime_error("Wrong number of initial parameters."); - size_t iter = 0; + // clang-format off + gsl_multimin_function_fdf loglik_bundle = { + &neg_loglik, // objective function + &neg_gradient, // gradient + &neg_loglik_and_grad, // combined obj and grad + n_params, // number of model params + static_cast(&r) // parameters for objective and gradient functions + }; + // clang-format on - { - gsl_multimin_function_fdf loglik_bundle; - loglik_bundle.f = &neg_loglik; - loglik_bundle.df = &neg_gradient; - loglik_bundle.fdf = &neg_loglik_and_grad; - loglik_bundle.n = n_params; - loglik_bundle.params = (void *)&r; + // Alternatives: + // - gsl_multimin_fdfminimizer_conjugate_pr + // - gsl_multimin_fdfminimizer_conjugate_fr + // - gsl_multimin_fdfminimizer_vector_bfgs2 + // - gsl_multimin_fdfminimizer_steepest_descent + const gsl_multimin_fdfminimizer_type *minimizer = + gsl_multimin_fdfminimizer_conjugate_fr; - gsl_vector *params = gsl_vector_alloc(n_params); + gsl_multimin_fdfminimizer *s = + gsl_multimin_fdfminimizer_alloc(minimizer, n_params); - for (size_t param = 0; param < initial_params.size(); ++param) - gsl_vector_set(params, param, initial_params[param]); + gsl_vector *params = gsl_vector_alloc(n_params); + for (std::size_t i = 0; i < n_params; ++i) + gsl_vector_set(params, i, params_init[i]); - //can also try gsl_multimin_fdfminimizer_conjugate_pr; - const gsl_multimin_fdfminimizer_type *T = gsl_multimin_fdfminimizer_conjugate_fr; + gsl_multimin_fdfminimizer_set(s, &loglik_bundle, params, stepsize, tolerance); - gsl_multimin_fdfminimizer *s = gsl_multimin_fdfminimizer_alloc (T, n_params); - - // ADS: 0.001 is the step size and 1e-4 is the tolerance - // get the minimizer - gsl_multimin_fdfminimizer_set(s, &loglik_bundle, params, 0.001, 1e-4); - - do { - iter++; - // do one iteration and get the status - status = gsl_multimin_fdfminimizer_iterate(s); - if (status) - break; - // check the status based on gradient - status = gsl_multimin_test_gradient (s->gradient, 1e-4); - } - while (status == GSL_CONTINUE && iter < 700); - // It it reasonable to reduce the number of iterations to 500? - // ADS: 700 vs. 500? what's the difference? - - r.max_loglik = (-1)*neg_loglik(s->x, &r); - - gsl_multimin_fdfminimizer_free(s); - gsl_vector_free(params); - } + int status = 0; + std::size_t iter = 0; + + do { + status = gsl_multimin_fdfminimizer_iterate(s); // one iter and get status + if (status) + break; + // check status from gradient + status = gsl_multimin_test_gradient(s->gradient, tolerance); + } while (status == GSL_CONTINUE && ++iter < max_iter); + // It it reasonable to reduce the number of iterations to 500? + // ADS: 700 vs. 500? what's the difference? + + r.max_loglik = -1.0 * neg_loglik(s->x, &r); + + gsl_multimin_fdfminimizer_free(s); + gsl_vector_free(params); return status == GSL_SUCCESS; } From 85d9818dec6467e71a759e739fa2e2f446fa4d4a Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 20 Jul 2025 19:59:47 -0700 Subject: [PATCH 04/24] src/radmeth/radmeth_model.cpp: adding the radmeth_model.cpp source file --- src/radmeth/radmeth_model.cpp | 103 ++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 src/radmeth/radmeth_model.cpp diff --git a/src/radmeth/radmeth_model.cpp b/src/radmeth/radmeth_model.cpp new file mode 100644 index 00000000..53778283 --- /dev/null +++ b/src/radmeth/radmeth_model.cpp @@ -0,0 +1,103 @@ +/* Copyright (C) 2025 Andrew D Smith + * + * Authors: Andrew D. Smith + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include "radmeth_model.hpp" + +#include +#include +#include +#include +#include + +double Regression::tolerance = 1e-4; +double Regression::stepsize = 0.001; +std::uint32_t Regression::max_iter = 700; + +void +SiteProportions::parse(const std::string &line) { + const auto first_ws = line.find_first_of(" \t"); + + // Parse the row name (must be like: "chr:position:strand:context") + bool failed = false; + + const auto label_end = line.data() + first_ws; + auto field_s = line.data(); + auto field_e = std::find(field_s + 1, label_end, ':'); + if (field_e == label_end) + failed = true; + + { + const std::uint32_t d = std::distance(field_s, field_e); + chrom = std::string{field_s, d}; + } + + field_s = field_e + 1; + field_e = std::find(field_s + 1, label_end, ':'); + failed = failed || (field_e == label_end); + + { + const auto [ptr, ec] = std::from_chars(field_s, field_e, position); + failed = failed || (ptr == field_s); + } + + field_s = field_e + 1; + field_e = std::find(field_s + 1, label_end, ':'); + failed = failed || (field_e != field_s + 1 || field_e == label_end); + + strand = *field_s; + failed = failed || (strand != '-' && strand != '+'); + + field_s = field_e + 1; + field_e = std::find_if(field_s + 1, label_end, + [](const auto x) { return x == ' ' || x == '\t'; }); + failed = failed || (field_e != label_end); + + { + const std::uint32_t d = std::distance(field_s, field_e); + context = std::string{field_s, d}; + } + + if (failed) + throw std::runtime_error("failed to parse label from:\n" + line); + + // Parse the counts of total reads and methylated reads + const auto is_sep = [](const auto x) { return std::isspace(x); }; + const auto not_sep = [](const auto x) { return std::isdigit(x); }; + + const auto line_end = line.data() + std::size(line); + mcounts mc1{}; + mc.clear(); + while (field_e != line_end) { + // get the total count + field_s = std::find_if(field_e + 1, line_end, not_sep); + field_e = std::find_if(field_s + 1, line_end, is_sep); + { + const auto [ptr, ec] = std::from_chars(field_s, field_e, mc1.n_reads); + failed = failed || (ptr != field_e); + } + + field_s = std::find_if(field_e + 1, line_end, not_sep); + field_e = std::find_if(field_s + 1, line_end, is_sep); + { + const auto [ptr, ec] = std::from_chars(field_s, field_e, mc1.n_meth); + failed = failed || (ptr != field_e); + } + + mc.push_back(mc1); + } + + if (failed) + throw std::runtime_error("failed to parse counts from:\n" + line); +} From ba4dc52359aa53f8f6a1c257ae2637d542486759 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 20 Jul 2025 20:00:19 -0700 Subject: [PATCH 05/24] src/radmeth/radmeth_model.hpp: cleanup and adding some functions to help with organization and efficiency --- src/radmeth/radmeth_model.hpp | 58 +++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 10 deletions(-) diff --git a/src/radmeth/radmeth_model.hpp b/src/radmeth/radmeth_model.hpp index 0402a269..5ff20fb0 100644 --- a/src/radmeth/radmeth_model.hpp +++ b/src/radmeth/radmeth_model.hpp @@ -19,32 +19,70 @@ #ifndef RADMETH_MODEL_HPP #define RADMETH_MODEL_HPP +#include +#include #include #include struct Design { std::vector factor_names; std::vector sample_names; - std::vector > matrix; - size_t n_factors() const {return factor_names.size();} - size_t n_samples() const {return sample_names.size();} + std::vector> matrix; + std::vector> tmatrix; + std::size_t + n_factors() const { + return factor_names.size(); + } + std::size_t + n_samples() const { + return sample_names.size(); + } }; +struct mcounts { + std::uint32_t n_reads{}; + std::uint32_t n_meth{}; +}; + +[[nodiscard]] inline std::istream & +operator>>(std::istream &in, mcounts &rm) { + return in >> rm.n_reads >> rm.n_meth; +} + struct SiteProportions { std::string chrom; - size_t position; - std::string strand; + std::size_t position{}; + char strand{}; std::string context; - std::vector total; - std::vector meth; + std::vector mc; + + void + parse(const std::string &line); }; struct Regression { + static double tolerance; // 1e-4; + static double stepsize; // 0.001; + static std::uint32_t max_iter; // 700; + Design design; SiteProportions props; - double max_loglik; - size_t n_factors() const {return design.n_factors();} - size_t n_samples() const {return design.n_samples();} + double max_loglik{}; + + [[nodiscard]] std::size_t + n_factors() const { + return design.n_factors(); + } + + [[nodiscard]] std::size_t + props_size() const { + return std::size(props.mc); + } + + [[nodiscard]] std::size_t + n_samples() const { + return design.n_samples(); + } }; #endif From 99df6b727807c5d0ed32d46e68f76fb59ee1ecd3 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 20 Jul 2025 20:01:00 -0700 Subject: [PATCH 06/24] src/radmeth/radmeth.cpp: using multi-threaded --- src/radmeth/radmeth.cpp | 611 ++++++++++++++++++++++------------------ 1 file changed, 330 insertions(+), 281 deletions(-) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index 815e599a..cb0a43e9 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -16,15 +16,17 @@ * General Public License for more details. */ -#include // GSL header for chisqrd distribution +#include // GSL header for chisqrd distribution #include #include +#include #include #include #include #include #include +#include #include // smithlab headers @@ -37,175 +39,107 @@ #include "radmeth_optimize.hpp" using std::begin; -using std::cerr; using std::cout; using std::distance; using std::end; -using std::endl; -using std::istringstream; -using std::runtime_error; -using std::sort; -using std::string; -using std::vector; + +struct file_progress { + double one_hundred_over_filesize{}; + std::size_t prev_offset{}; + explicit file_progress(const std::string &filename) : + one_hundred_over_filesize{100.0 / std::filesystem::file_size(filename)} {} + void + operator()(std::ifstream &in) { + const std::size_t curr_offset = in.tellg() * one_hundred_over_filesize; + if (curr_offset <= prev_offset) + return; + std::cerr << "[progress: " << std::setw(3) << curr_offset + << (curr_offset == 100 ? "%]\n" : "%]\r"); + prev_offset = (curr_offset == 100) ? std::numeric_limits::max() + : curr_offset; + } +}; static std::istream & operator>>(std::istream &is, Design &design) { - string header_encoding; - getline(is, header_encoding); + std::string header_encoding; + std::getline(is, header_encoding); - istringstream header_is(header_encoding); - string header_name; - while (header_is >> header_name) design.factor_names.push_back(header_name); + std::istringstream header_is(header_encoding); + std::string header_name; + while (header_is >> header_name) + design.factor_names.push_back(header_name); - string row; - while (getline(is, row)) { - if (row.empty()) continue; + std::string row; + while (std::getline(is, row)) { + if (row.empty()) + continue; - istringstream row_is(row); - string token; + std::istringstream row_is(row); + std::string token; row_is >> token; design.sample_names.push_back(token); - vector matrix_row; + std::vector matrix_row; while (row_is >> token) { - if (token.length() == 1 && (token == "0" || token == "1")) - matrix_row.push_back(token == "1"); - else - throw runtime_error("only binary factor levels are allowed:\n" + row); + if (std::size(token) != 1 || (token != "0" && token != "1")) + throw std::runtime_error("Must use binary factor levels:\n" + row); + matrix_row.push_back(token == "1"); } - if (matrix_row.size() != design.n_factors()) - throw runtime_error( - "each row must have as many columns as factors:\n" + - row); + if (std::size(matrix_row) != design.n_factors()) + throw std::runtime_error( + "each row must have as many columns as factors:\n" + row); - design.matrix.push_back(vector()); + design.matrix.push_back(std::vector()); swap(design.matrix.back(), matrix_row); } + + const auto n_row = std::size(design.matrix); + const auto n_col = std::size(design.matrix.front()); + design.tmatrix.resize(n_col, std::vector(n_row, 0.0)); + for (auto row_idx = 0u; row_idx < n_row; ++row_idx) + for (auto col_idx = 0u; col_idx < n_col; ++col_idx) + design.tmatrix[col_idx][row_idx] = design.matrix[row_idx][col_idx]; + return is; } static std::ostream & operator<<(std::ostream &out, const Design &design) { - for (size_t factor = 0; factor < design.factor_names.size(); ++factor) { + for (std::size_t factor = 0; factor < design.factor_names.size(); ++factor) { + if (factor != 0) + out << '\t'; out << design.factor_names[factor]; - if (factor + 1 != design.factor_names.size()) out << "\t"; } - out << endl; + out << '\n'; - for (size_t i = 0; i < design.n_samples(); ++i) { + for (std::size_t i = 0; i < design.n_samples(); ++i) { out << design.sample_names[i]; - for (size_t factor = 0; factor < design.factor_names.size(); ++factor) - out << "\t" << design.matrix[i][factor]; + for (std::size_t j = 0; j < design.n_factors(); ++j) + out << "\t" << design.matrix[i][j]; out << "\n"; } return out; } static void -remove_factor(Design &design, const size_t factor) { - design.factor_names.erase(begin(design.factor_names) + factor); - for (size_t i = 0; i < design.n_samples(); ++i) - design.matrix[i].erase(begin(design.matrix[i]) + factor); -} - -// Parses a natural number from its string representation. Throws exception if -// the string does not encode one. -static size_t -parse_natural_number(string encoding) { - istringstream iss(encoding); - size_t number; - iss >> number; - if (!iss) - throw runtime_error("The token \"" + encoding + - "\" does not encode a natural number"); - return number; -} - -static std::istream & -operator>>(std::istream &table_encoding, SiteProportions &props) { - props.chrom.clear(); - props.position = 0; - props.strand.clear(); - props.context.clear(); - props.meth.clear(); - props.total.clear(); - - string row_encoding; - getline(table_encoding, row_encoding); - - // Skip lines contining only the newline character (e.g. the last line of the - // proportion table). - if (row_encoding.empty()) return table_encoding; - - // Get the row name (which must be specified like this: "chr:position") and - // parse it. - istringstream row_stream(row_encoding); - string row_name_encoding; - row_stream >> row_name_encoding; - - // Every row must start an identifier consisiting of genomic loci of the - // corresponding site. Here we check this identifier has the correct number - // of colons. - const size_t num_colon = - count(row_name_encoding.begin(), row_name_encoding.end(), ':'); - - if (num_colon != 3) - throw runtime_error( - "Each row in the count table must start with " - "a line chromosome:position:strand:context. Got \"" + - row_name_encoding + "\" instead."); - - // First parse the row identifier. - istringstream name_stream(row_name_encoding); - getline(name_stream, props.chrom, ':'); - - if (props.chrom.empty()) - throw runtime_error("Error parsing " + row_name_encoding + - ": chromosome name is missing."); - - string position_encoding; - - getline(name_stream, position_encoding, ':'); - props.position = parse_natural_number(position_encoding); - getline(name_stream, props.strand, ':'); - getline(name_stream, props.context, ':'); - - // After parsing the row identifier, parse count proportions. - size_t total_count, meth_count; - - while (row_stream >> total_count >> meth_count) { - props.total.push_back(total_count); - props.meth.push_back(meth_count); - } - - if (!row_stream.eof()) - throw runtime_error("Some row entries are not natural numbers: " + - row_stream.str()); - - if (props.total.size() != props.meth.size()) - throw runtime_error( - "This row does not encode proportions" - "correctly:\n" + - row_encoding); - return table_encoding; -} - -static bool -fit_regression_model(Regression &r) { - vector initial_params; - return fit_regression_model(r, initial_params); +remove_factor(Design &design, const std::size_t factor_idx) { + design.factor_names.erase(std::begin(design.factor_names) + factor_idx); + for (std::size_t i = 0; i < design.n_samples(); ++i) + design.matrix[i].erase(std::begin(design.matrix[i]) + factor_idx); } /***************** RADMETH ALGORITHM *****************/ static bool -consistent_sample_names(const Regression ®, const string &header) { - istringstream iss(header); - auto nm_itr(begin(reg.design.sample_names)); - const auto nm_end(end(reg.design.sample_names)); - string token; +consistent_sample_names(const Regression ®, const std::string &header) { + std::istringstream iss(header); + auto nm_itr(std::begin(reg.design.sample_names)); + const auto nm_end(std::end(reg.design.sample_names)); + std::string token; while (iss >> token && nm_itr != nm_end) - if (token != *nm_itr++) return false; + if (token != *nm_itr++) + return false; return true; } @@ -213,13 +147,13 @@ consistent_sample_names(const Regression ®, const string &header) { // function outputs the p-value of the log-likelihood ratio. *Note* that it is // assumed that the reduced model has one fewer factor than the reduced model. static double -loglikratio_test(double null_loglik, double full_loglik) { +llr_test(double null_loglik, double full_loglik) { // The log-likelihood ratio statistic. const double log_lik_stat = -2 * (null_loglik - full_loglik); // It is assumed that null model has one fewer factor than the full // model. Hence the number of degrees of freedom is 1. - const size_t degrees_of_freedom = 1; + const std::size_t degrees_of_freedom = 1; // Log-likelihood ratio statistic has a chi-sqare distribution. const double chisq_p = gsl_cdf_chisq_P(log_lik_stat, degrees_of_freedom); @@ -229,46 +163,249 @@ loglikratio_test(double null_loglik, double full_loglik) { } static bool -has_low_coverage(const Regression ®, const size_t test_fac) { +has_low_coverage(const Regression ®, const std::size_t test_fac) { bool cvrd_in_test_fact_smpls = false; - for (size_t i = 0; i < reg.n_samples() && !cvrd_in_test_fact_smpls; ++i) + for (std::size_t i = 0; i < reg.n_samples() && !cvrd_in_test_fact_smpls; ++i) cvrd_in_test_fact_smpls = - (reg.design.matrix[i][test_fac] == 1 && reg.props.total[i] != 0); + (reg.design.matrix[i][test_fac] == 1 && reg.props.mc[i].n_reads != 0); bool cvrd_in_other_smpls = false; - for (size_t i = 0; i < reg.n_samples() && !cvrd_in_other_smpls; ++i) + for (std::size_t i = 0; i < reg.n_samples() && !cvrd_in_other_smpls; ++i) cvrd_in_other_smpls = - (reg.design.matrix[i][test_fac] != 1 && reg.props.total[i] != 0); + (reg.design.matrix[i][test_fac] != 1 && reg.props.mc[i].n_reads != 0); return !cvrd_in_test_fact_smpls || !cvrd_in_other_smpls; } -static bool +[[nodiscard]] static bool has_extreme_counts(const Regression ®) { - bool is_maximally_methylated = true; - for (size_t i = 0; i < reg.n_samples() && is_maximally_methylated; ++i) - is_maximally_methylated = (reg.props.total[i] == reg.props.meth[i]); + const auto &mc = reg.props.mc; + + bool full_meth = true; + for (auto i = std::cbegin(mc); i != std::cend(mc) && full_meth; ++i) + full_meth = (i->n_reads == i->n_meth); + + bool no_meth = true; + for (auto i = std::cbegin(mc); i != std::cend(mc) && no_meth; ++i) + no_meth = (i->n_meth == 0.0); - bool is_unmethylated = true; - for (size_t i = 0; i < reg.n_samples() && is_unmethylated; ++i) - is_unmethylated = (reg.props.meth[i] == 0.0); + return full_meth || no_meth; +} - return is_maximally_methylated || is_unmethylated; +[[nodiscard]] static bool +has_two_values(const Regression ®, const std::size_t test_factor) { + const auto &v = reg.design.tmatrix[test_factor]; + for (auto i = std::cbegin(v); i != std::cend(v); ++i) + if (*i != v[0]) + return true; + return false; } +[[nodiscard]] static std::uint32_t +get_test_factor_idx(const Regression &model, const std::string &test_factor) { + const auto &factors = model.design.factor_names; + const auto itr = + std::find(std::cbegin(factors), std::cend(factors), test_factor); -static bool -verify_multiple_levels(const Regression &full_regression, - const size_t test_factor) { - const size_t n_samples = full_regression.design.n_samples(); - const auto first_level = full_regression.design.matrix[0][test_factor]; - bool test_fact_mult_levels = false; - for (size_t i = 1; i < n_samples; ++i) - if (full_regression.design.matrix[i][test_factor] != first_level) - test_fact_mult_levels = true; - return test_fact_mult_levels; + if (itr == std::cend(factors)) + throw std::runtime_error("Factor not part of design: " + test_factor); + + return std::distance(std::cbegin(factors), itr); } +static void +read_design(const bool verbose, const std::string &design_filename, + Design &design) { + if (verbose) + std::cerr << "design table filename: " << design_filename << std::endl; + std::ifstream design_file(design_filename); + if (!design_file) + throw std::runtime_error("could not open file: " + design_filename); + + // initialize full design matrix from file + design_file >> design; + + if (verbose) + std::cerr << design << std::endl; +} + +enum class row_status : std::uint8_t { + ok, + na, + na_low_cov, + na_extreme_cnt, +}; + +[[nodiscard]] static std::vector +drop_idx(const std::vector &v, const std::size_t to_drop) { + std::vector u; + u.reserve(std::size(v) - 1); + for (auto i = 0u; i < std::size(v); ++i) + if (i != to_drop) + u.push_back(v[i]); + return u; +} + +static void +radmeth(const bool show_progress, const bool more_na_info, + const std::uint32_t n_threads, const std::string &table_filename, + const std::string &outfile, const Regression &alt_model, + const Regression &null_model, const std::uint32_t test_factor_idx) { + static constexpr auto prefix_fmt = "%s\t%ld\t%c\t%s\t"; + static constexpr auto suffix_fmt = "\t%ld\t%ld\t%ld\t%ld\n"; + static constexpr auto buf_size = 1024; + static constexpr auto n_lines_at_once = 1024; + + // ADS: open the data table file + std::ifstream table_file(table_filename); + if (!table_file) + throw std::runtime_error("could not open file: " + table_filename); + + // Make sure that the first line of the proportion table file contains + // names of the samples. Throw an exception if the names or their order + // in the proportion table does not match those in the full design matrix. + std::string sample_names_header; + std::getline(table_file, sample_names_header); + + if (!consistent_sample_names(alt_model, sample_names_header)) + throw std::runtime_error( + "header:\n" + sample_names_header + "\n" + + "does not match factor names or their order in the\n" + "design matrix. Check that the design matrix and\n" + "the proportion table are correctly formatted."); + + const std::size_t n_samples = alt_model.design.n_samples(); + + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open output file: " + outfile); + + file_progress progress{table_filename}; + + std::vector> bufs(n_threads, + std::vector(buf_size, 0)); + std::vector n_bytes(n_threads, 0); + std::vector alt_models(n_threads, alt_model); + std::vector null_models(n_threads, null_model); + + // iterate over rows in the file and do llr test on proportions from each + std::vector lines(n_threads); + while (true) { + + std::uint32_t n_lines = 0; + while (n_lines < n_lines_at_once && + std::getline(table_file, lines[n_lines])) + ++n_lines; + + if (show_progress) + progress(table_file); + + std::vector threads; + for (auto th_id = 0u; th_id < std::min(n_lines, n_threads); ++th_id) + threads.emplace_back([&, th_id] { + auto &t_alt_model = alt_models[th_id]; + auto &t_null_model = null_models[th_id]; + t_alt_model.props.parse(lines[th_id]); + if (t_alt_model.props_size() != n_samples) + throw std::runtime_error("found row with wrong number of columns"); + + const auto [p_val, status] = [&]() -> std::tuple { + // Skip the test if (1) no coverage in all cases or in all controls, + // or (2) the site is completely methylated or completely + // unmethylated across all samples. + if (has_low_coverage(t_alt_model, test_factor_idx)) + return std::tuple{1.0, row_status::na_low_cov}; + + if (has_extreme_counts(t_alt_model)) + return std::tuple{1.0, row_status::na_extreme_cnt}; + + std::vector alternate_params; + fit_regression_model(t_alt_model, alternate_params); + t_null_model.props = t_alt_model.props; + + auto null_params = drop_idx(alternate_params, test_factor_idx); + + fit_regression_model(t_null_model, null_params); + const double p_value = + llr_test(t_null_model.max_loglik, t_alt_model.max_loglik); + + return (p_value != p_value) ? std::tuple{1.0, row_status::na} + : std::tuple{p_value, row_status::ok}; + }(); + + std::size_t n_reads_factor = 0; + std::size_t n_reads_others = 0; + std::size_t n_meth_factor = 0; + std::size_t n_meth_others = 0; + + const auto &mc = t_alt_model.props.mc; + const auto &vec = t_alt_model.design.tmatrix[test_factor_idx]; + for (std::size_t s = 0; s < n_samples; ++s) + if (vec[s] != 0) { + n_reads_factor += mc[s].n_reads; + n_meth_factor += mc[s].n_meth; + } + else { + n_reads_others += mc[s].n_reads; + n_meth_others += mc[s].n_meth; + } + + n_bytes[th_id] = [&] { + // clang-format off + const int n_prefix_bytes = std::sprintf(bufs[th_id].data(), prefix_fmt, + t_alt_model.props.chrom.data(), + t_alt_model.props.position, + t_alt_model.props.strand, + t_alt_model.props.context.data()); + // clang-format on + if (n_prefix_bytes < 0) + return n_prefix_bytes; + + auto cursor = bufs[th_id].data() + n_prefix_bytes; + + const int n_pval_bytes = [&] { + if (status == row_status::ok) + return std::sprintf(cursor, "%.6g", p_val); + if (!more_na_info || status == row_status::na) + return std::sprintf(cursor, "NA"); + if (status == row_status::na_extreme_cnt) + return std::sprintf(cursor, "NA_EXTREME_CNT"); + // if (status == row_status::na_low_cov) + return std::sprintf(cursor, "NA_LOW_COV"); + }(); + + if (n_pval_bytes < 0) + return n_pval_bytes; + + cursor += n_pval_bytes; + + // clang-format off + const int n_suffix_bytes = std::sprintf(cursor, suffix_fmt, + n_reads_factor, + n_meth_factor, + n_reads_others, + n_meth_others); + // clang-format on + if (n_suffix_bytes < 0) + return n_suffix_bytes; + + return n_prefix_bytes + n_pval_bytes + n_suffix_bytes; + }(); + }); + + for (auto &thread : threads) + thread.join(); + + for (auto i = 0u; i < std::min(n_lines, n_threads); ++i) { + if (n_bytes[i] < 0) + throw std::runtime_error("failed to write to output buffer"); + out.write(bufs[i].data(), n_bytes[i]); + } + + if (n_lines < n_lines_at_once) + break; + } +} /*********************************************************************** * Run beta-binoimial regression using the specified table with @@ -277,171 +414,83 @@ verify_multiple_levels(const Regression &full_regression, int main_radmeth(int argc, char *argv[]) { try { - static const string description = + static const std::string description = "calculate differential methylation scores"; - string outfile; - string test_factor_name; - bool VERBOSE = false; - bool more_na_info = false; + std::string outfile; + std::string test_factor; + bool verbose{false}; + bool show_progress{false}; + bool more_na_info{false}; + std::uint32_t n_threads{1}; /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(strip_path(argv[0]), description, " "); opt_parse.set_show_defaults(); - opt_parse.add_opt("out", 'o', "output file (default: stdout)", false, - outfile); - opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); + opt_parse.add_opt("out", 'o', "output file", true, outfile); + opt_parse.add_opt("threads", 't', "number of threads to use", false, + n_threads); + opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); + opt_parse.add_opt("progress", '\0', "show progress", false, show_progress); opt_parse.add_opt( "na-info", 'n', "if a p-value is not calculated, print NAs in more detail: " "low count (NA_LOW_COV) extreme values (NA_EXTREME_CNT) or " "numerical errors in likelihood ratios (NA)", false, more_na_info); - opt_parse.add_opt("factor", 'f', "a factor to test", true, - test_factor_name); + opt_parse.add_opt("factor", 'f', "a factor to test", true, test_factor); - vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + std::cerr << opt_parse.help_message() << std::endl + << opt_parse.about_message() << std::endl; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + std::cerr << opt_parse.about_message() << std::endl; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + std::cerr << opt_parse.option_missing_message() << std::endl; return EXIT_SUCCESS; } if (leftover_args.size() != 2) { - cerr << opt_parse.help_message() << endl; + std::cerr << opt_parse.help_message() << std::endl; return EXIT_SUCCESS; } - const string design_filename(leftover_args.front()); - const string table_filename(leftover_args.back()); + const std::string design_filename(leftover_args.front()); + const std::string table_filename(leftover_args.back()); /****************** END COMMAND LINE OPTIONS *****************/ - if (VERBOSE) cerr << "design table filename: " << design_filename << endl; - std::ifstream design_file(design_filename); - if (!design_file) - throw runtime_error("could not open file: " + design_filename); - // initialize full design matrix from file - Regression full_regression; - design_file >> full_regression.design; - - if (VERBOSE) cerr << full_regression.design << endl; + Regression orig_alt_model; + read_design(verbose, design_filename, orig_alt_model.design); // Check that provided test factor name exists and find its index. - // Identify test factors with their indexes to simplify naming - auto test_fact_it = - find(begin(full_regression.design.factor_names), - end(full_regression.design.factor_names), test_factor_name); - - if (test_fact_it == end(full_regression.design.factor_names)) - throw runtime_error(test_factor_name + - " factor not part of design specification"); - - const size_t test_factor = - distance(begin(full_regression.design.factor_names), test_fact_it); + const auto test_factor_idx = + get_test_factor_idx(orig_alt_model, test_factor); // verify that the design includes more than one level for the // test factor - if (!verify_multiple_levels(full_regression, test_factor)) { - const auto first_level = full_regression.design.matrix[0][test_factor]; - throw runtime_error("test factor only one level: " + - test_factor_name + ", " + - std::to_string(first_level)); + if (!has_two_values(orig_alt_model, test_factor_idx)) { + const auto first_level = orig_alt_model.design.matrix[0][test_factor_idx]; + throw std::runtime_error("test factor only one level: " + test_factor + + ", " + std::to_string(first_level)); } - Regression null_regression; - null_regression.design = full_regression.design; - remove_factor(null_regression.design, test_factor); + Regression orig_null_model; + orig_null_model.design = orig_alt_model.design; + remove_factor(orig_null_model.design, test_factor_idx); // ADS: done setup for the model - // ADS: open the data table file - std::ifstream table_file(table_filename); - if (!table_file) - throw runtime_error("could not open file: " + table_filename); - - // Make sure that the first line of the proportion table file contains - // names of the samples. Throw an exception if the names or their order - // in the proportion table does not match those in the full design matrix. - string sample_names_header; - getline(table_file, sample_names_header); - - if (!consistent_sample_names(full_regression, sample_names_header)) - throw runtime_error("header:\n" + sample_names_header + "\n" + - "does not match factor names or their order in the\n" - "design matrix. Check that the design matrix and\n" - "the proportion table are correctly formatted."); - - const size_t n_samples = full_regression.design.n_samples(); - - // ADS: now open the output file because each row is processed - // sequentially - std::ofstream of; - if (!outfile.empty()) of.open(outfile); - std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); - - // Performing the log-likelihood ratio test on proportions from each row - // of the proportion table. - while (table_file >> full_regression.props) { - if (full_regression.props.total.size() != n_samples) - throw runtime_error("found row with wrong number of columns"); - - size_t coverage_factor = 0, coverage_rest = 0, meth_factor = 0, - meth_rest = 0; - - for (size_t s = 0; s < n_samples; ++s) { - if (full_regression.design.matrix[s][test_factor] != 0) { - coverage_factor += full_regression.props.total[s]; - meth_factor += full_regression.props.meth[s]; - } - else { - coverage_rest += full_regression.props.total[s]; - meth_rest += full_regression.props.meth[s]; - } - } - - out << full_regression.props.chrom << "\t" - << full_regression.props.position << "\t" - << full_regression.props.strand << "\t" - << full_regression.props.context << "\t"; - - // Do not perform the test if there's no coverage in either all - // case or all control samples. Also do not test if the site is - // completely methylated or completely unmethylated across all - // samples. - if (has_low_coverage(full_regression, test_factor)) { - out << ((more_na_info) ? "NA_LOW_COV" : "NA"); - } - else if (has_extreme_counts(full_regression)) { - out << ((more_na_info) ? "NA_EXTREME_CNT" : "NA"); - } - else { - fit_regression_model(full_regression); - null_regression.props = full_regression.props; - fit_regression_model(null_regression); - const double p_value = loglikratio_test(null_regression.max_loglik, - full_regression.max_loglik); - - // If error occured in fitting (p-val = nan or -nan). - if (p_value != p_value) - out << "NA"; - else - out << p_value; - } - out << "\t" << coverage_factor << "\t" << meth_factor << "\t" - << coverage_rest << "\t" << meth_rest << endl; - } + radmeth(show_progress, more_na_info, n_threads, table_filename, outfile, + orig_alt_model, orig_null_model, test_factor_idx); } catch (const std::exception &e) { - cerr << "ERROR: " << e.what() << endl; - exit(EXIT_FAILURE); + std::cerr << "ERROR: " << e.what() << std::endl; + return EXIT_FAILURE; } return EXIT_SUCCESS; } From df6ec6a924a632c2bfbde2278098d1e52bec82f4 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 20 Jul 2025 20:23:44 -0700 Subject: [PATCH 07/24] src/radmeth/radmeth.cpp: removing some accidental code.. --- src/radmeth/radmeth.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index cb0a43e9..26d1f06a 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -254,7 +254,6 @@ radmeth(const bool show_progress, const bool more_na_info, static constexpr auto prefix_fmt = "%s\t%ld\t%c\t%s\t"; static constexpr auto suffix_fmt = "\t%ld\t%ld\t%ld\t%ld\n"; static constexpr auto buf_size = 1024; - static constexpr auto n_lines_at_once = 1024; // ADS: open the data table file std::ifstream table_file(table_filename); @@ -293,8 +292,7 @@ radmeth(const bool show_progress, const bool more_na_info, while (true) { std::uint32_t n_lines = 0; - while (n_lines < n_lines_at_once && - std::getline(table_file, lines[n_lines])) + while (n_lines < n_threads && std::getline(table_file, lines[n_lines])) ++n_lines; if (show_progress) @@ -402,7 +400,7 @@ radmeth(const bool show_progress, const bool more_na_info, out.write(bufs[i].data(), n_bytes[i]); } - if (n_lines < n_lines_at_once) + if (n_lines < n_threads) break; } } From d65e24dc7f9fd5c9b69aafdcac89fdee845c9d68 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 21 Jul 2025 15:27:51 -0700 Subject: [PATCH 08/24] src/radmeth/radmeth_optimize.hcpp: fixing includes --- src/radmeth/radmeth_optimize.cpp | 2 ++ src/radmeth/radmeth_optimize.hpp | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index e957cc6b..1e201100 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -16,6 +16,8 @@ * General Public License for more details. */ +#include "radmeth_optimize.hpp" + #include "radmeth_model.hpp" #include diff --git a/src/radmeth/radmeth_optimize.hpp b/src/radmeth/radmeth_optimize.hpp index 2bf115de..16f4a061 100644 --- a/src/radmeth/radmeth_optimize.hpp +++ b/src/radmeth/radmeth_optimize.hpp @@ -19,10 +19,11 @@ #ifndef RADMETH_OPTIMIZE_HPP #define RADMETH_OPTIMIZE_HPP +#include + struct Regression; bool fit_regression_model(Regression &r, std::vector &initial_params); - #endif From 53d9ba9f93716c29cbe21a41b485b1be8875c7a8 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 21 Jul 2025 16:22:39 -0700 Subject: [PATCH 09/24] src/radmeth/radmeth.cpp: changing strategies for parallelism to get chunks of data for each thread and avoid too much thread creation. Currently performance has problems. --- src/radmeth/radmeth.cpp | 204 ++++++++++++++++++++++------------------ 1 file changed, 115 insertions(+), 89 deletions(-) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index 26d1f06a..dd7b1c6f 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -246,6 +246,22 @@ drop_idx(const std::vector &v, const std::size_t to_drop) { return u; } +template +static inline std::vector> +get_chunk_bounds(const T n_elements, const T n_chunks) { + std::vector> chunks; + const T q = n_elements / n_chunks; + const T r = n_elements - q * n_chunks; + T block_start{}; + for (std::size_t i = 0; i < n_chunks; ++i) { + const auto sz = (i < r) ? q + 1 : q; + const auto block_end = block_start + sz; + chunks.emplace_back(block_start, block_end); + block_start = block_end; + } + return chunks; +} + static void radmeth(const bool show_progress, const bool more_na_info, const std::uint32_t n_threads, const std::string &table_filename, @@ -254,6 +270,7 @@ radmeth(const bool show_progress, const bool more_na_info, static constexpr auto prefix_fmt = "%s\t%ld\t%c\t%s\t"; static constexpr auto suffix_fmt = "\t%ld\t%ld\t%ld\t%ld\n"; static constexpr auto buf_size = 1024; + static constexpr auto max_lines = 256; // ADS: open the data table file std::ifstream table_file(table_filename); @@ -281,120 +298,129 @@ radmeth(const bool show_progress, const bool more_na_info, file_progress progress{table_filename}; - std::vector> bufs(n_threads, + std::vector> bufs(max_lines, std::vector(buf_size, 0)); - std::vector n_bytes(n_threads, 0); + std::vector n_bytes(max_lines, 0); + std::vector lines(max_lines); + std::vector alt_models(n_threads, alt_model); std::vector null_models(n_threads, null_model); - // iterate over rows in the file and do llr test on proportions from each - std::vector lines(n_threads); + // Iterate over rows in the file and do llr test on proportions from + // each. Do this in sets of rows to avoid having to spawn too many threads. while (true) { - std::uint32_t n_lines = 0; - while (n_lines < n_threads && std::getline(table_file, lines[n_lines])) + while (n_lines < max_lines && std::getline(table_file, lines[n_lines])) ++n_lines; + if (n_lines == 0) + break; if (show_progress) progress(table_file); + const auto chunks = get_chunk_bounds(n_lines, n_threads); + std::vector threads; - for (auto th_id = 0u; th_id < std::min(n_lines, n_threads); ++th_id) - threads.emplace_back([&, th_id] { - auto &t_alt_model = alt_models[th_id]; - auto &t_null_model = null_models[th_id]; - t_alt_model.props.parse(lines[th_id]); - if (t_alt_model.props_size() != n_samples) - throw std::runtime_error("found row with wrong number of columns"); - - const auto [p_val, status] = [&]() -> std::tuple { - // Skip the test if (1) no coverage in all cases or in all controls, - // or (2) the site is completely methylated or completely - // unmethylated across all samples. - if (has_low_coverage(t_alt_model, test_factor_idx)) - return std::tuple{1.0, row_status::na_low_cov}; - - if (has_extreme_counts(t_alt_model)) - return std::tuple{1.0, row_status::na_extreme_cnt}; - - std::vector alternate_params; - fit_regression_model(t_alt_model, alternate_params); - t_null_model.props = t_alt_model.props; - - auto null_params = drop_idx(alternate_params, test_factor_idx); - - fit_regression_model(t_null_model, null_params); - const double p_value = - llr_test(t_null_model.max_loglik, t_alt_model.max_loglik); - - return (p_value != p_value) ? std::tuple{1.0, row_status::na} - : std::tuple{p_value, row_status::ok}; - }(); - - std::size_t n_reads_factor = 0; - std::size_t n_reads_others = 0; - std::size_t n_meth_factor = 0; - std::size_t n_meth_others = 0; - - const auto &mc = t_alt_model.props.mc; - const auto &vec = t_alt_model.design.tmatrix[test_factor_idx]; - for (std::size_t s = 0; s < n_samples; ++s) - if (vec[s] != 0) { - n_reads_factor += mc[s].n_reads; - n_meth_factor += mc[s].n_meth; - } - else { - n_reads_others += mc[s].n_reads; - n_meth_others += mc[s].n_meth; - } - - n_bytes[th_id] = [&] { - // clang-format off - const int n_prefix_bytes = std::sprintf(bufs[th_id].data(), prefix_fmt, - t_alt_model.props.chrom.data(), - t_alt_model.props.position, - t_alt_model.props.strand, - t_alt_model.props.context.data()); - // clang-format on - if (n_prefix_bytes < 0) - return n_prefix_bytes; - - auto cursor = bufs[th_id].data() + n_prefix_bytes; - - const int n_pval_bytes = [&] { - if (status == row_status::ok) - return std::sprintf(cursor, "%.6g", p_val); - if (!more_na_info || status == row_status::na) - return std::sprintf(cursor, "NA"); - if (status == row_status::na_extreme_cnt) - return std::sprintf(cursor, "NA_EXTREME_CNT"); - // if (status == row_status::na_low_cov) - return std::sprintf(cursor, "NA_LOW_COV"); + for (auto thread_id = 0u; thread_id < n_threads; ++thread_id) { + threads.emplace_back([&, thread_id] { + const auto &[chunk_beg, chunk_end] = chunks[thread_id]; + auto &t_alt_model = alt_models[thread_id]; + auto &t_null_model = null_models[thread_id]; + for (auto b = chunk_beg; b != chunk_end; ++b) { + t_alt_model.props.parse(lines[b]); + if (t_alt_model.props_size() != n_samples) + throw std::runtime_error("found row with wrong number of columns"); + + const auto [p_val, status] = [&]() -> std::tuple { + // Skip the test if (1) no coverage in all cases or in all controls, + // or (2) the site is completely methylated or completely + // unmethylated across all samples. + if (has_low_coverage(t_alt_model, test_factor_idx)) + return std::tuple{1.0, row_status::na_low_cov}; + + if (has_extreme_counts(t_alt_model)) + return std::tuple{1.0, row_status::na_extreme_cnt}; + + std::vector alternate_params; + fit_regression_model(t_alt_model, alternate_params); + t_null_model.props = t_alt_model.props; + + auto null_params = drop_idx(alternate_params, test_factor_idx); + + fit_regression_model(t_null_model, null_params); + const double p_value = + llr_test(t_null_model.max_loglik, t_alt_model.max_loglik); + + return (p_value != p_value) ? std::tuple{1.0, row_status::na} + : std::tuple{p_value, row_status::ok}; }(); - if (n_pval_bytes < 0) - return n_pval_bytes; - - cursor += n_pval_bytes; - - // clang-format off + std::size_t n_reads_factor = 0; + std::size_t n_reads_others = 0; + std::size_t n_meth_factor = 0; + std::size_t n_meth_others = 0; + + const auto &mc = t_alt_model.props.mc; + const auto &vec = t_alt_model.design.tmatrix[test_factor_idx]; + for (std::size_t s = 0; s < n_samples; ++s) + if (vec[s] != 0) { + n_reads_factor += mc[s].n_reads; + n_meth_factor += mc[s].n_meth; + } + else { + n_reads_others += mc[s].n_reads; + n_meth_others += mc[s].n_meth; + } + + n_bytes[b] = [&] { + // clang-format off + const int n_prefix_bytes = std::sprintf(bufs[b].data(), prefix_fmt, + t_alt_model.props.chrom.data(), + t_alt_model.props.position, + t_alt_model.props.strand, + t_alt_model.props.context.data()); + // clang-format on + if (n_prefix_bytes < 0) + return n_prefix_bytes; + + auto cursor = bufs[b].data() + n_prefix_bytes; + + const int n_pval_bytes = [&] { + if (status == row_status::ok) + return std::sprintf(cursor, "%.6g", p_val); + if (!more_na_info || status == row_status::na) + return std::sprintf(cursor, "NA"); + if (status == row_status::na_extreme_cnt) + return std::sprintf(cursor, "NA_EXTREME_CNT"); + // if (status == row_status::na_low_cov) + return std::sprintf(cursor, "NA_LOW_COV"); + }(); + + if (n_pval_bytes < 0) + return n_pval_bytes; + + cursor += n_pval_bytes; + + // clang-format off const int n_suffix_bytes = std::sprintf(cursor, suffix_fmt, n_reads_factor, n_meth_factor, n_reads_others, n_meth_others); - // clang-format on - if (n_suffix_bytes < 0) - return n_suffix_bytes; + // clang-format on + if (n_suffix_bytes < 0) + return n_suffix_bytes; - return n_prefix_bytes + n_pval_bytes + n_suffix_bytes; - }(); + return n_prefix_bytes + n_pval_bytes + n_suffix_bytes; + }(); + } }); + } for (auto &thread : threads) thread.join(); - for (auto i = 0u; i < std::min(n_lines, n_threads); ++i) { + for (auto i = 0u; i < n_lines; ++i) { if (n_bytes[i] < 0) throw std::runtime_error("failed to write to output buffer"); out.write(bufs[i].data(), n_bytes[i]); From 5b8c1d3e5201721785cc4932acac0b0fbdeb9d77 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 21 Jul 2025 18:10:26 -0700 Subject: [PATCH 10/24] src/radmeth/radmeth_model.hcpp: changing the design matrix to uint8_t because it's really just binary. Moving the functions that do IO on design matrices here also because this is where they belong --- src/radmeth/radmeth_model.cpp | 66 +++++++++++++++++++++++++++++++++++ src/radmeth/radmeth_model.hpp | 11 ++++-- 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/src/radmeth/radmeth_model.cpp b/src/radmeth/radmeth_model.cpp index 53778283..1f7dc6d7 100644 --- a/src/radmeth/radmeth_model.cpp +++ b/src/radmeth/radmeth_model.cpp @@ -18,8 +18,11 @@ #include #include #include +#include +#include #include #include +#include double Regression::tolerance = 1e-4; double Regression::stepsize = 0.001; @@ -101,3 +104,66 @@ SiteProportions::parse(const std::string &line) { if (failed) throw std::runtime_error("failed to parse counts from:\n" + line); } + +std::istream & +operator>>(std::istream &is, Design &design) { + std::string header_encoding; + std::getline(is, header_encoding); + + std::istringstream header_is(header_encoding); + std::string header_name; + while (header_is >> header_name) + design.factor_names.push_back(header_name); + + std::string row; + while (std::getline(is, row)) { + if (row.empty()) + continue; + + std::istringstream row_is(row); + std::string token; + row_is >> token; + design.sample_names.push_back(token); + + std::vector matrix_row; + while (row_is >> token) { + if (std::size(token) != 1 || (token != "0" && token != "1")) + throw std::runtime_error("Must use binary factor levels:\n" + row); + matrix_row.push_back(token == "1"); + } + + if (std::size(matrix_row) != design.n_factors()) + throw std::runtime_error( + "each row must have as many columns as factors:\n" + row); + + design.matrix.push_back(std::vector()); + swap(design.matrix.back(), matrix_row); + } + + const auto n_row = std::size(design.matrix); + const auto n_col = std::size(design.matrix.front()); + design.tmatrix.resize(n_col, std::vector(n_row, 0.0)); + for (auto row_idx = 0u; row_idx < n_row; ++row_idx) + for (auto col_idx = 0u; col_idx < n_col; ++col_idx) + design.tmatrix[col_idx][row_idx] = design.matrix[row_idx][col_idx]; + + return is; +} + +std::ostream & +operator<<(std::ostream &out, const Design &design) { + for (std::size_t factor = 0; factor < design.factor_names.size(); ++factor) { + if (factor != 0) + out << '\t'; + out << design.factor_names[factor]; + } + out << '\n'; + + for (std::size_t i = 0; i < design.n_samples(); ++i) { + out << design.sample_names[i]; + for (std::size_t j = 0; j < design.n_factors(); ++j) + out << "\t" << design.matrix[i][j]; + out << "\n"; + } + return out; +} diff --git a/src/radmeth/radmeth_model.hpp b/src/radmeth/radmeth_model.hpp index 5ff20fb0..3189ecd0 100644 --- a/src/radmeth/radmeth_model.hpp +++ b/src/radmeth/radmeth_model.hpp @@ -21,14 +21,15 @@ #include #include +#include #include #include struct Design { std::vector factor_names; std::vector sample_names; - std::vector> matrix; - std::vector> tmatrix; + std::vector> matrix; + std::vector> tmatrix; std::size_t n_factors() const { return factor_names.size(); @@ -39,6 +40,12 @@ struct Design { } }; +std::istream & +operator>>(std::istream &is, Design &design); + +std::ostream & +operator<<(std::ostream &os, const Design &design); + struct mcounts { std::uint32_t n_reads{}; std::uint32_t n_meth{}; From 6093554b51629e95fe518ef2e1c23b068cbcde3b Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 21 Jul 2025 18:18:27 -0700 Subject: [PATCH 11/24] src/radmeth/radmeth_optimize.cpp: changes to allow the desig matrix to have uint8_t for its binary values rather than the full double --- src/radmeth/radmeth_optimize.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index 1e201100..380ddeaf 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -29,8 +29,9 @@ #include #include +template [[nodiscard]] static double -pi(const std::vector &v, const gsl_vector *params) { +pi(const std::vector &v, const gsl_vector *params) { // ADS: this function doesn't have a very helpful name const auto a = v.data(); const double dot = std::inner_product(a, a + std::size(v), params->data, 0.0); From 07b45d0ab164f0af34bf1d2a5e67624775168e94 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 21 Jul 2025 18:27:11 -0700 Subject: [PATCH 12/24] src/radmeth/radmeth.cpp: moving code for IO for Design into radmeth_model sources and setting the number of lines to read/write at once to 4096 --- src/radmeth/radmeth.cpp | 66 +---------------------------------------- 1 file changed, 1 insertion(+), 65 deletions(-) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index dd7b1c6f..342db53b 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -60,69 +60,6 @@ struct file_progress { } }; -static std::istream & -operator>>(std::istream &is, Design &design) { - std::string header_encoding; - std::getline(is, header_encoding); - - std::istringstream header_is(header_encoding); - std::string header_name; - while (header_is >> header_name) - design.factor_names.push_back(header_name); - - std::string row; - while (std::getline(is, row)) { - if (row.empty()) - continue; - - std::istringstream row_is(row); - std::string token; - row_is >> token; - design.sample_names.push_back(token); - - std::vector matrix_row; - while (row_is >> token) { - if (std::size(token) != 1 || (token != "0" && token != "1")) - throw std::runtime_error("Must use binary factor levels:\n" + row); - matrix_row.push_back(token == "1"); - } - - if (std::size(matrix_row) != design.n_factors()) - throw std::runtime_error( - "each row must have as many columns as factors:\n" + row); - - design.matrix.push_back(std::vector()); - swap(design.matrix.back(), matrix_row); - } - - const auto n_row = std::size(design.matrix); - const auto n_col = std::size(design.matrix.front()); - design.tmatrix.resize(n_col, std::vector(n_row, 0.0)); - for (auto row_idx = 0u; row_idx < n_row; ++row_idx) - for (auto col_idx = 0u; col_idx < n_col; ++col_idx) - design.tmatrix[col_idx][row_idx] = design.matrix[row_idx][col_idx]; - - return is; -} - -static std::ostream & -operator<<(std::ostream &out, const Design &design) { - for (std::size_t factor = 0; factor < design.factor_names.size(); ++factor) { - if (factor != 0) - out << '\t'; - out << design.factor_names[factor]; - } - out << '\n'; - - for (std::size_t i = 0; i < design.n_samples(); ++i) { - out << design.sample_names[i]; - for (std::size_t j = 0; j < design.n_factors(); ++j) - out << "\t" << design.matrix[i][j]; - out << "\n"; - } - return out; -} - static void remove_factor(Design &design, const std::size_t factor_idx) { design.factor_names.erase(std::begin(design.factor_names) + factor_idx); @@ -270,7 +207,7 @@ radmeth(const bool show_progress, const bool more_na_info, static constexpr auto prefix_fmt = "%s\t%ld\t%c\t%s\t"; static constexpr auto suffix_fmt = "\t%ld\t%ld\t%ld\t%ld\n"; static constexpr auto buf_size = 1024; - static constexpr auto max_lines = 256; + static constexpr auto max_lines = 4096; // ADS: open the data table file std::ifstream table_file(table_filename); @@ -507,7 +444,6 @@ main_radmeth(int argc, char *argv[]) { Regression orig_null_model; orig_null_model.design = orig_alt_model.design; remove_factor(orig_null_model.design, test_factor_idx); - // ADS: done setup for the model radmeth(show_progress, more_na_info, n_threads, table_filename, outfile, orig_alt_model, orig_null_model, test_factor_idx); From f063a4ff532e35a817c14c7877a09f09f1096c50 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 21 Jul 2025 18:36:03 -0700 Subject: [PATCH 13/24] src/radmeth/radmeth.cpp: showing progress after it has been done, preallocating the chunks vector, and changing the default number of lines to read at once to 16k --- src/radmeth/radmeth.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index 342db53b..509ab76b 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -183,20 +183,18 @@ drop_idx(const std::vector &v, const std::size_t to_drop) { return u; } -template -static inline std::vector> -get_chunk_bounds(const T n_elements, const T n_chunks) { - std::vector> chunks; - const T q = n_elements / n_chunks; - const T r = n_elements - q * n_chunks; - T block_start{}; +static inline void +get_chunk_bounds(const std::uint32_t n_elements, const std::uint32_t n_chunks, + std::vector> &chunks) { + const std::uint32_t q = n_elements / n_chunks; + const std::uint32_t r = n_elements - q * n_chunks; + std::uint32_t block_start{}; for (std::size_t i = 0; i < n_chunks; ++i) { const auto sz = (i < r) ? q + 1 : q; const auto block_end = block_start + sz; - chunks.emplace_back(block_start, block_end); + chunks[i] = {block_start, block_end}; block_start = block_end; } - return chunks; } static void @@ -207,7 +205,7 @@ radmeth(const bool show_progress, const bool more_na_info, static constexpr auto prefix_fmt = "%s\t%ld\t%c\t%s\t"; static constexpr auto suffix_fmt = "\t%ld\t%ld\t%ld\t%ld\n"; static constexpr auto buf_size = 1024; - static constexpr auto max_lines = 4096; + static constexpr auto max_lines = 16384; // ADS: open the data table file std::ifstream table_file(table_filename); @@ -243,6 +241,8 @@ radmeth(const bool show_progress, const bool more_na_info, std::vector alt_models(n_threads, alt_model); std::vector null_models(n_threads, null_model); + std::vector> chunks(n_threads); + // Iterate over rows in the file and do llr test on proportions from // each. Do this in sets of rows to avoid having to spawn too many threads. while (true) { @@ -252,10 +252,7 @@ radmeth(const bool show_progress, const bool more_na_info, if (n_lines == 0) break; - if (show_progress) - progress(table_file); - - const auto chunks = get_chunk_bounds(n_lines, n_threads); + get_chunk_bounds(n_lines, n_threads, chunks); std::vector threads; for (auto thread_id = 0u; thread_id < n_threads; ++thread_id) { @@ -363,6 +360,9 @@ radmeth(const bool show_progress, const bool more_na_info, out.write(bufs[i].data(), n_bytes[i]); } + if (show_progress) + progress(table_file); + if (n_lines < n_threads) break; } From a2185642628a909ea24b601eb949ce8d007df078 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 21 Jul 2025 18:56:20 -0700 Subject: [PATCH 14/24] docs/content/radmeth.md: updating docs for radmeth --- docs/content/radmeth.md | 380 ++++++++++++++++++++++------------------ 1 file changed, 210 insertions(+), 170 deletions(-) diff --git a/docs/content/radmeth.md b/docs/content/radmeth.md index 54d03292..96f595bd 100644 --- a/docs/content/radmeth.md +++ b/docs/content/radmeth.md @@ -1,62 +1,65 @@ ## radmeth - Differential CpG methylation ## Synopsis + ```console -$ dnmtools radmeth -factor case design_matrix.txt proportion_table.txt >output.bed +dnmtools radmeth -factor case -o output.txt design_matrix.txt proportion_table.txt ``` ## Description -The DM detection method described in this section is based on the -beta-binomial regression. We recommend using this method when more -than three replicates are available in each group. For rapid -differential methylation analysis the regression-based method should -be run on a computing cluster with a few hundred available nodes, in -which case it takes approximately a few hours to process a dataset -consisting of 30-50 WGBS samples. The analysis can be also performed -on a personal workstation, but it will take substantially longer. Note -that the actual processing time depends on the coverage of each -methylome, the number of sites analyzed, and the number of methylomes -in the dataset. +The differential-methylation detection method described in this section is +based on the beta-binomial regression. We recommend using this method when +more than three replicates are available in each group. For rapid differential +methylation analysis the regression-based method should be run on a large +machine with many cores. Note that the actual processing time depends on the +coverage of each methylome, the number of sites analyzed, and the number of +methylomes in the dataset. ### Generating proportion tables -The first step in the differential methylation analysis is to assemble -a proportion table containing read proportions for all target -methylomes. Dnmtools includes a program [merge](../merge) to generate -a proportion table from the given collection of methylomes. Suppose -that we want to compare methylomes named `control-a.meth`, -`control-b.meth`, `control-c.meth` to the methylomes `case-a.meth`, -`case-b.meth`, `case-c.meth`. The proportion table can be created with -the following command: +The first step in the differential methylation analysis is to assemble a +proportion table containing read proportions for all target +methylomes. Dnmtools includes a program [merge](../merge) to generate a +proportion table from the given collection of methylomes. Suppose that we want +to compare methylomes named `control-a.meth`, `control-b.meth`, +`control-c.meth` to the methylomes `case-a.meth`, `case-b.meth`, +`case-c.meth`. The proportion table can be created with the following +command: + ```console $ dnmtools merge -t -radmeth \ control-a.meth control-b.meth control-c.meth \ case-a.meth case-b.meth case-c.meth > proportion-table.txt ``` + This is what `proportion-table.txt` looks like: + ```txt - control_a control_b control_c case_a case_b case_c +control_a control_b control_c case_a case_b case_c chr1:108:+:CpG 9 6 10 8 1 1 2 2 2 1 14 1 chr1:114:+:CpG 17 7 10 0 14 3 5 1 9 1 7 1 chr1:160:+:CpG 12 8 10 5 17 4 15 14 13 6 4 4 chr1:309:+:CpG 1 1 1 0 17 12 12 8 2 1 19 8 ``` -As indicated in the header, this proportion table contains information -about 6 methylomes: 3 controls and 3 cases. Each row of the table -contains information about a CpG site and a proportion of reads -mapping over this site in each methylome. For example, the first row -describes a cytosine within a CpG site located on chromosome 1 at -position 108. This site is present in 9 reads in the methylome control -a and is methylated in 6 of them. Note that the dnmtools `merge` -command adds methylomes into the proportion table in the order in -which they are listed on the command line. + +As indicated in the header, this proportion table contains information about 6 +methylomes: 3 controls and 3 cases. Each row of the table contains +information about a CpG site and a proportion of reads mapping over this site +in each methylome. For example, the first row describes a cytosine within a +CpG site located on chromosome 1 at position 108. This site is present in 9 +reads in the methylome control a and is methylated in 6 of them. Note that the +dnmtools `merge` command adds methylomes into the proportion table in the +order in which they are listed on the command line. + +Note: the header conforms to a traditional "data frame" from R, which does not +include a column name for the row names, which are the first column. ### Design matrix -The next step is to specify the design matrix, which describes the -structure of the experiment. For our running example, the design -matrix looks like this: +The next step is to specify the design matrix, which describes the structure +of the experiment. For our running example, the design matrix looks like this: + ```txt base case control_a 1 0 @@ -66,59 +69,59 @@ case_a 1 1 case_b 1 1 case_c 1 1 ``` -The design matrix shows that samples in this dataset are associated -with two factors: base and case. The first column corresponds to the -base factor and will always be present in the design matrix. Think of -it as stating that all samples have the same baseline mean methylation -level. To distinguish cases from controls we add another factor case -(second column). The `1`s in this column correspond to the samples -which belong to the cases group. You can use this design matrix as a -template to create design matrices for two-group comparisons involving -arbitrary many methylomes. - -After creating the proportion table and the design matrix, we are now -ready to start the differential methylation analysis. It consists of -(1) regression, (2) combining significance, and (3) multiple testing -adjustment steps. + +The design matrix shows that samples in this dataset are associated with two +factors: base and case. The first column corresponds to the base factor and +will always be present in the design matrix. Think of it as stating that all +samples have the same baseline mean methylation level. To distinguish cases +from controls we add another factor case (second column). The `1`s in this +column correspond to the samples which belong to the cases group. You can use +this design matrix as a template to create design matrices for two-group +comparisons involving arbitrary many methylomes. + +After creating the proportion table and the design matrix, we are now ready to +start the differential methylation analysis. It consists of (1) regression, +(2) combining significance, and (3) multiple testing adjustment steps. ### Regression Suppose that the `proportion-table.txt` and `design-matrix.txt` are as described above. The regression step is run with the command + ```console -$ dnmtools radmeth -factor case design-matrix.txt proportion-table.txt > output.bed +$ dnmtools radmeth -factor case -o output.txt design-matrix.txt proportion-table.txt ``` -The `-factor` parameter specifies the factor with respect to which we -want to test for differential methylation. The test factor is case, -meaning that we are testing for differential methylation between cases -and controls. In the output file `output.bed`, the last four columns -correspond to the total read counts and methylated read counts of the -case group and control group, respectively. The p-value (5-th column) -is given by the log-likelihood ratio test. - -Depending on the distribution of methylated and unmethylated CpGs, -some p-values may not be calculated. If there are no reads that cover -a given CpG, the value `NA_LOW_COV` will be printed instead of a -p-value. If all CpGs are fully methylated and fully methylated in both -case and control, we say the CpG has an "extreme distribution". In -these cases, we cannot reject the null and output `NA_EXTREME_CNT`. -Finally, if there are numerical errors in the log-likelihood ratio -test, we simply print `NA`. You can opt out of these additional NA -informations by not using the `-n` flag. - -We do not recommend using the individual p-values of CpGs from -radmeth, as they often do not contain sufficient coverage for -significant biological interpretations. Instead, we recommend merging -p-values of neighboring CpGs using [radadjust](../radadjust). + +The `-factor` parameter specifies the factor with respect to which we want to +test for differential methylation. The test factor is 'case', meaning that we +are testing for differential methylation between cases and controls. In the +output file `output.txt`, the last four columns correspond to the total read +counts and methylated read counts of the case group and control group, +respectively. The p-value (5-th column) is given by the log-likelihood ratio +test. + +Depending on the distribution of methylated and unmethylated CpGs, some +p-values may not be calculated. If there are no reads that cover a given CpG, +the value `NA_LOW_COV` will be printed instead of a p-value. If all CpGs are +fully methylated and fully methylated in both case and control, we say the CpG +has an "extreme distribution". In these cases, we cannot reject the null and +output `NA_EXTREME_CNT`. Finally, if there are numerical errors in the +log-likelihood ratio test, we simply print `NA`. You can opt out of these +additional NA informations by not using the `-n` flag. + +We do not recommend relying too much on the individual p-values of CpGs from +radmeth, as they often do not contain sufficient coverage for significant +biological interpretations. Instead, we recommend merging p-values of +neighboring CpGs using [radadjust](../radadjust). ## Tutorial -This tutorial aims to answer questions users often have about the -assumptions of radmeth and how to interepret the results. We will -assume a factor of interest, with levels A and B. We will also assume -a potentially confounding factor, sex, with levels M and F. Further, -we have 8 total samples, 2 samples for each of the 4 combinations of -{A,B}x{M,F}. The design matrix should look like this: +This tutorial aims to answer questions users often have about the assumptions +of radmeth and how to interepret the results. We will assume a factor of +interest, with levels A and B. We will also assume a potentially confounding +factor, sex, with levels M and F. Further, we have 8 total samples, 2 samples +for each of the 4 combinations of {A,B}x{M,F}. The design matrix should look +like this: ```txt base sex factor @@ -131,28 +134,28 @@ sample_MA2 1 0 1 sample_MB1 1 0 0 sample_MB2 1 0 0 ``` -Note that the 3 headings correspond with the three binary columns, but -need not be aligned over them. The first sample in the design matrix, -`sample_FA1`, has an indicator in the other columns for both the "F" -and the "A", both encoded with a "1" in their respective columns. But -the replicate number 1 in the `_FA1` is just a replicate, and if it -had meaning we would consider encoding it using some factor. -Designating it as a replicate with no meaning is a choice here, -aligning with the idea that the replicates are statistically -identical. If all replicates identified with a "2" were done on "day -2", for example, then we might reconsider. - -For our example we will use a genome file `genome.fa` that is very -small and not a real genome. We will also use a file `features.bed` -that will be the starting point for generating data that fit with our -hypotheses. We will be generating data that *should* show us the -differences we want to see -- assuming we use the tools properly. I -will refer to each of the intervals in `features.bed` as a "feature" -but just think of it as a genomic interval with a chromosome name, a -start and an end. We need to first obtain some features that are -associated specifically with each of the levels of our factors: A, B, + +Note that the 3 headings correspond with the three binary columns, but need +not be aligned over them. The first sample in the design matrix, `sample_FA1`, +has an indicator in the other columns for both the "F" and the "A", both +encoded with a "1" in their respective columns. But the replicate number 1 in +the `_FA1` is just a replicate, and if it had meaning we would consider +encoding it using some factor. Designating it as a replicate with no meaning +is a choice here, aligning with the idea that the replicates are statistically +identical. If all replicates identified with a "2" were done on "day 2", for +example, then we might reconsider. + +For our example we will use a genome file `genome.fa` that is very small and +not a real genome. We will also use a file `features.bed` that will be the +starting point for generating data that fit with our hypotheses. We will be +generating data that *should* show us the differences we want to see -- +assuming we use the tools properly. I will refer to each of the intervals in +`features.bed` as a "feature" but just think of it as a genomic interval with +a chromosome name, a start and an end. We need to first obtain some features +that are associated specifically with each of the levels of our factors: A, B, M and F. We can do this starting with our features file and using the `bedtools` shuffle command: + ```console $ bedtools shuffle -i features.bed -g genome.chrom.sizes > sim/features_M.bed; $ cat sim/features_*.bed | sort -k 1,1 -k 2,2g -o excl.bed; @@ -162,10 +165,11 @@ $ bedtools shuffle -excl excl.bed -i features.bed -g genome.chrom.sizes > sim/fe $ cat sim/features_*.bed | sort -k 1,1 -k 2,2g -o excl.bed; $ bedtools shuffle -excl excl.bed -i features.bed -g genome.chrom.sizes > sim/features_B.bed; ``` -This ensures none of the "features" are overlapping between A, B, M or -F, and all are similar in number and size distribution. Using each of -these, I then made files of features corresponding to the -*combinations* of factors: + +This ensures none of the "features" are overlapping between A, B, M or F, and +all are similar in number and size distribution. Using each of these, I then +made files of features corresponding to the *combinations* of factors: + ```shell for i in M F; do for j in A B; do @@ -174,28 +178,30 @@ for i in M F; do done; done; ``` + Now I have files named: + ```txt sim/features_MA.bed sim/features_MB.bed sim/features_FA.bed sim/features_FB.bed ``` -The files named with an "M" include all intervals in -`sim/features_M.bed`, and similarly the files named with a "B" include -all intervals in `sim/features_B.bed`. So the file -`sim/features_FA.bed` is all the features for "A" and all the features -for "F". It will differ from `sim/features_FB.bed` less than it will -from `sim/features_MB.bed`. - -Using these files, I simulated methylation levels at every CpG site in -the fake genome `genome.fa` with CpGs outside the intervals receiving -a methylation level of 0.8 and CpGs inside the intervals receiving a -methylation level of 0.15. At each CpG site, a number of reads was -sampled (Poisson with mean 10) and then the methylation levels were -sampled with a coin flip weighed as either 0.8 or 0.15, depending on -the CpG site. I did this twice for each combination of {M,F}x{A,B}. -The result is 8 files named as follows: + +The files named with an "M" include all intervals in `sim/features_M.bed`, and +similarly the files named with a "B" include all intervals in +`sim/features_B.bed`. So the file `sim/features_FA.bed` is all the features +for "A" and all the features for "F". It will differ from +`sim/features_FB.bed` less than it will from `sim/features_MB.bed`. + +Using these files, I simulated methylation levels at every CpG site in the +fake genome `genome.fa` with CpGs outside the intervals receiving a +methylation level of 0.8 and CpGs inside the intervals receiving a methylation +level of 0.15. At each CpG site, a number of reads was sampled (Poisson with +mean 10) and then the methylation levels were sampled with a coin flip weighed +as either 0.8 or 0.15, depending on the CpG site. I did this twice for each +combination of {M,F}x{A,B}. The result is 8 files named as follows: + ```console $ ls -1 sim/*.counts sim/sample_FA1.counts @@ -207,12 +213,14 @@ sim/sample_MA2.counts sim/sample_MB1.counts sim/sample_MB2.counts ``` -The program I used for the simulation above is not currently -available. In this simulation, the only difference between replicates -(e.g., `sim/sample_FA1.counts` vs. `sim/sample_FA2.counts`) is random -noise due to sampling. + +The program I used for the simulation above is not currently available. In +this simulation, the only difference between replicates (e.g., +`sim/sample_FA1.counts` vs. `sim/sample_FA2.counts`) is random noise due to +sampling. Here is a look inside one of these files: + ```console $ head sim/sample_FA1.counts chr1 163 + CG 0.857143 7 @@ -226,12 +234,16 @@ chr1 324 + CG 0.555556 9 chr1 350 + CG 0.933333 15 chr1 356 + CG 0.923077 13 ``` + Then I used the `merge` command from dnmtools to make a data matrix as follows: + ```console $ dnmtools merge -t -radmeth -v -remove .counts -o sim/table.txt sim/sample_*.counts ``` + And this is what it gave me: + ```console $ head sim/table.txt sample_FA1 sample_FA2 sample_FB1 sample_FB2 sample_MA1 sample_MA2 sample_MB1 sample_MB2 @@ -245,18 +257,21 @@ chr1:322:+:CG 6 5 6 6 9 8 19 14 13 11 13 11 9 6 10 7 chr1:324:+:CG 9 5 10 9 8 5 10 9 7 6 14 9 7 6 6 6 chr1:350:+:CG 15 14 8 6 6 5 9 6 7 6 6 4 9 8 11 10 ``` -With this simulated data, in which each column has the influence of -two different factors, we can use `radmeth` with the design matrix -shown above (in a file named `desgin.txt`) to get p-values for -differential methylation between levels A and B of our factor of -interest -- while controling for the effect of M/F: + +With this simulated data, in which each column has the influence of two +different factors, we can use `radmeth` with the design matrix shown above (in +a file named `desgin.txt`) to get p-values for differential methylation +between levels A and B of our factor of interest -- while controling for the +effect of M/F: + ```console $ dnmtools radmeth -o sim/samples.radmeth -f factor design.txt sim/table.txt ``` -As explained already, the 5th column of the output gives the p-values -for differential methylation between levels for our factor of interest -(named "factor" in the design matrix). Here is a look at part of the -output: + +As explained already, the 5th column of the output gives the p-values for +differential methylation between levels for our factor of interest (named +"factor" in the design matrix). Here is a look at part of the output: + ```console $ head sim/samples.radmeth chr1 163 + CG 0.0864953 35 22 45 36 @@ -270,19 +285,23 @@ chr1 324 + CG 0.257851 40 29 31 26 chr1 350 + CG 0.893476 36 30 35 29 chr1 356 + CG 0.0527032 37 35 42 33 ``` -Focusing on the 5th column, we see values between 0 and 1, but none of -them are very extreme. This particular output file has 17903 rows. We -can use the dnmtools command `selectsites` to pull out the rows in -this file that correspond to each of the original feature sets. Here -is how we would do it for all 4 different original sets of features: + +Focusing on the 5th column, we see values between 0 and 1, but none of them +are very extreme. This particular output file has 17903 rows. We can use the +dnmtools command `selectsites` to pull out the rows in this file that +correspond to each of the original feature sets. Here is how we would do it +for all 4 different original sets of features: + ```shell for i in M F A B; do dnmtools selectsites -o sim/features_${i}.txt sim/features_${i}.bed sim/samples.radmeth; done ``` -The result allows us to verify that `radmeth` with the design matrix -given above identifies sites as significant when they differ between -levels A and B of the factor of interest: + +The result allows us to verify that `radmeth` with the design matrix given +above identifies sites as significant when they differ between levels A and B +of the factor of interest: + ```console $ for i in sim/features_*.counts; do echo $i `awk 'BEGIN{k=0;c=0}{k+=$5;c+=1}END{print k,c,k/c}' $i`; done | column -t sim/features_A.counts 0.255448 430 0.000594064 @@ -290,22 +309,23 @@ sim/features_B.counts 0.114256 372 0.00030714 sim/features_F.counts 196.093 400 0.490232 sim/features_M.counts 295.234 588 0.502099 ``` -The p-values in features associated with either M or F are averaging -around 0.5, which is what we want if we are trying to control this -possible confounding variable. In the features associated either with -A or B, the p-values are very small on average. Of course this is -literally by design. Knowing that we simulated just as much difference -between M and F as between A and B, if we had used `-f sex` then -`radmeth` would have correctly assigned low p-values to sites in the -intervals of `features_M.bed` and `features_F.bed`. - -We can use the same simulated data to verify that we do need the -intercept in our design. In other words, the column "base" in -`design.txt` needs to be present (you can give it any name you want; -make sure it's all 1s). - -We change the design matrix to make the following, which has the -intercept column removed: + +The p-values in features associated with either M or F are averaging around +0.5, which is what we want if we are trying to control this possible +confounding variable. In the features associated either with A or B, the +p-values are very small on average. Of course this is literally by +design. Knowing that we simulated just as much difference between M and F as +between A and B, if we had used `-f sex` then `radmeth` would have correctly +assigned low p-values to sites in the intervals of `features_M.bed` and +`features_F.bed`. + +We can use the same simulated data to verify that we do need the intercept in +our design. In other words, the column "base" in `design.txt` needs to be +present (you can give it any name you want; make sure it's all 1s). + +We change the design matrix to make the following, which has the intercept +column removed: + ```console $ cat design_noint.txt sex factor @@ -318,8 +338,10 @@ sample_MA2 0 1 sample_MB1 0 0 sample_MB2 0 0 ``` -Then we re-run all the above commands (I won't exaplain each) to get -results analogous but with this modified design matrix: + +Then we re-run all the above commands (I won't exaplain each) to get results +analogous but with this modified design matrix: + ```console $ dnmtools radmeth -o sim/samples_noint.radmeth -f factor design_noint.txt sim/table.txt $ for i in M F A B; do dnmtools selectsites -o sim/features_noint_${i}.txt sim/features_${i}.bed sim/samples_noint.radmeth; done @@ -329,33 +351,34 @@ sim/features_noint_B.txt 15.2391 372 0.0409653 sim/features_noint_F.txt 77.2438 400 0.193109 sim/features_noint_M.txt 93.7181 588 0.159385 ``` -Although the sites within features corresponding to A and B are much -lower on average than those for M or F, the distinction has been -reduced. More importantly, we know that the sites in -`sim/features_M.bed` and `sim/features_F.bed` are not influenced by -the level A/B of our factor of interest. The p-values for sites in the -last two rows above should have mean of 0.5. But without the intercept -included in the design matrix, the outcomes to not match our -statistical assumptions. I cannot predict what will happen if you use -the wrong design matrix. In analyzing bisulfite sequencing data for -differential methylation, we become aware of uncertainty through the -counts, and we leverage this to try and be more accurate. So unlike -other regression settings, if you try to normalize the data first, in -order to avoid having to deal with an intercept, you will not arrive -at an equivalent procedure -- if you want to normalize the data -yourself (i.e. subtract mu and divide by sigma) then you are probably -better off using a general regression package. + +Although the sites within features corresponding to A and B are much lower on +average than those for M or F, the distinction has been reduced. More +importantly, we know that the sites in `sim/features_M.bed` and +`sim/features_F.bed` are not influenced by the level A/B of our factor of +interest. The p-values for sites in the last two rows above should have mean +of 0.5. But without the intercept included in the design matrix, the outcomes +to not match our statistical assumptions. I cannot predict what will happen if +you use the wrong design matrix. In analyzing bisulfite sequencing data for +differential methylation, we become aware of uncertainty through the counts, +and we leverage this to try and be more accurate. So unlike other regression +settings, if you try to normalize the data first, in order to avoid having to +deal with an intercept, you will not arrive at an equivalent procedure -- if +you want to normalize the data yourself (i.e. subtract mu and divide by sigma) +then you are probably better off using a general regression package. ## Options ```txt -o, -out ``` + The name of the output file (default: stdout). ```txt -n -na-info ``` + If a p-value is not calculated, print NAs in more detail: low coverage (`NA_LOW_COV`), extreme values (`NA_EXTREME`) or numerical errors in likelihood ratios (`NA`). @@ -363,10 +386,27 @@ likelihood ratios (`NA`). ```txt -v, -verbose ``` + Print more information while the command is running. +```txt + -progress +``` + +Show progress while radmeth runs. + ```txt -f, -factor ``` + The name of the factor on which to test differences. This must be the name of one of the columns in file design matrix. This is required. + +```txt + -t, -threads +``` + +Use multiple threads. This gives very good speedup as long as you do not +exceed the number of available physical cores. Most CPUs support 2 hardware +threads per physical core via SMT/Hyper-Threading, so check your number of +cores if you are unsure. From 006bdacb7f7ee3363352bd55d7ecc8e434df429e Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 21 Jul 2025 19:58:33 -0700 Subject: [PATCH 15/24] src/radmeth/radmeth_model.cpp: changing default parameter values: tolerance from 1e-4 to 1e-4 and max_iter from 700 to 250 --- src/radmeth/radmeth_model.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/radmeth/radmeth_model.cpp b/src/radmeth/radmeth_model.cpp index 1f7dc6d7..89ed0943 100644 --- a/src/radmeth/radmeth_model.cpp +++ b/src/radmeth/radmeth_model.cpp @@ -24,9 +24,9 @@ #include #include -double Regression::tolerance = 1e-4; +double Regression::tolerance = 1e-3; double Regression::stepsize = 0.001; -std::uint32_t Regression::max_iter = 700; +std::uint32_t Regression::max_iter = 250; void SiteProportions::parse(const std::string &line) { From a88951dc528b12c168216bfd5f2fb7b1f329ef68 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 21 Jul 2025 19:59:10 -0700 Subject: [PATCH 16/24] src/radmeth/radmeth.cpp: adding cli args for tolerance and max-iter in the GSL iterative estimation of model parameters --- src/radmeth/radmeth.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index 509ab76b..0fa8fcbe 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -401,6 +401,12 @@ main_radmeth(int argc, char *argv[]) { "numerical errors in likelihood ratios (NA)", false, more_na_info); opt_parse.add_opt("factor", 'f', "a factor to test", true, test_factor); + opt_parse.add_opt("tolerance", '\0', + "numerical tolerance to test convergence", false, + Regression::tolerance); + opt_parse.add_opt("max-iter", '\0', + "max iterations when estimating parameters", false, + Regression::max_iter); std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); From 439cdccc5c211c8955d622a50acae38b5b7ae79f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 21 Jul 2025 19:59:30 -0700 Subject: [PATCH 17/24] docs/content/radmeth.md: adding description of the cli args for tolerance and max-iter in the GSL iterative estimation of model parameters --- docs/content/radmeth.md | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/docs/content/radmeth.md b/docs/content/radmeth.md index 96f595bd..ce1eec98 100644 --- a/docs/content/radmeth.md +++ b/docs/content/radmeth.md @@ -375,6 +375,15 @@ then you are probably better off using a general regression package. The name of the output file (default: stdout). +```txt + -t, -threads +``` + +Use multiple threads. This gives very good speedup as long as you do not +exceed the number of available physical cores. Most CPUs support 2 hardware +threads per physical core via SMT/Hyper-Threading, so check your number of +cores if you are unsure. + ```txt -n -na-info ``` @@ -403,10 +412,15 @@ The name of the factor on which to test differences. This must be the name of one of the columns in file design matrix. This is required. ```txt - -t, -threads + -tolerance ``` -Use multiple threads. This gives very good speedup as long as you do not -exceed the number of available physical cores. Most CPUs support 2 hardware -threads per physical core via SMT/Hyper-Threading, so check your number of -cores if you are unsure. +The numerical tolerance for convergence when estimating model parameters. +This cutoff is applied to the norm of the gradient of the log-likelihood +function for MLE parameter estimation. + +```txt + -max-iter +``` + +The maximum iterations to allow when estimating model parameters. From 175a078eb2d312b1554e0c5cec961e3cec9b1ab1 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 21 Jul 2025 20:23:35 -0700 Subject: [PATCH 18/24] src/radmeth/radmeth_optimize.cpp: using a tolerance that is a multiplied by the sqrt of the number of params and by the number of obs/samples --- src/radmeth/radmeth_optimize.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index 380ddeaf..b787b964 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -166,7 +166,6 @@ neg_loglik_and_grad(const gsl_vector *params, void *object, double *loglik_val, bool fit_regression_model(Regression &r, std::vector ¶ms_init) { - const auto tolerance = Regression::tolerance; const auto stepsize = Regression::stepsize; const auto max_iter = Regression::max_iter; @@ -177,6 +176,9 @@ fit_regression_model(Regression &r, std::vector ¶ms_init) { params_init.back() = -2.5; } + const double tolerance = + std::sqrt(n_params) * r.n_samples() * Regression::tolerance; + if (params_init.size() != n_params) throw std::runtime_error("Wrong number of initial parameters."); From 8c38ac296444473bef6436f83f4337a96e9dcecc Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 21 Jul 2025 20:45:25 -0700 Subject: [PATCH 19/24] docs/content/radmeth.md: updating the explanation of the tolerance parameter --- docs/content/radmeth.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/content/radmeth.md b/docs/content/radmeth.md index ce1eec98..4bbcdc37 100644 --- a/docs/content/radmeth.md +++ b/docs/content/radmeth.md @@ -417,7 +417,9 @@ name of one of the columns in file design matrix. This is required. The numerical tolerance for convergence when estimating model parameters. This cutoff is applied to the norm of the gradient of the log-likelihood -function for MLE parameter estimation. +function for MLE parameter estimation, and is multiplied by both the square +root of the number of parameters and by the number of methylomes in the +analysis. ```txt -max-iter From 549f31a7d69667ba9eea3c1d7acd1ef5e3e0beab Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Tue, 22 Jul 2025 15:44:59 -0700 Subject: [PATCH 20/24] src/radmeth/radmeth_model.cpp: refactored transpose --- src/radmeth/radmeth_model.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/radmeth/radmeth_model.cpp b/src/radmeth/radmeth_model.cpp index 89ed0943..faa4a8bb 100644 --- a/src/radmeth/radmeth_model.cpp +++ b/src/radmeth/radmeth_model.cpp @@ -105,6 +105,18 @@ SiteProportions::parse(const std::string &line) { throw std::runtime_error("failed to parse counts from:\n" + line); } +template +static void +transpose(const std::vector> &mat, + std::vector> &tmat) { + const auto n_row = std::size(mat); + const auto n_col = std::size(mat.front()); + tmat.resize(n_col, std::vector(n_row, 0.0)); + for (auto row_idx = 0u; row_idx < n_row; ++row_idx) + for (auto col_idx = 0u; col_idx < n_col; ++col_idx) + tmat[col_idx][row_idx] = mat[row_idx][col_idx]; +} + std::istream & operator>>(std::istream &is, Design &design) { std::string header_encoding; @@ -136,16 +148,10 @@ operator>>(std::istream &is, Design &design) { throw std::runtime_error( "each row must have as many columns as factors:\n" + row); - design.matrix.push_back(std::vector()); - swap(design.matrix.back(), matrix_row); + design.matrix.push_back(matrix_row); } - const auto n_row = std::size(design.matrix); - const auto n_col = std::size(design.matrix.front()); - design.tmatrix.resize(n_col, std::vector(n_row, 0.0)); - for (auto row_idx = 0u; row_idx < n_row; ++row_idx) - for (auto col_idx = 0u; col_idx < n_col; ++col_idx) - design.tmatrix[col_idx][row_idx] = design.matrix[row_idx][col_idx]; + transpose(design.matrix, design.tmatrix); return is; } From 15ca070221124820b6074cfb578638afe075c19d Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Tue, 22 Jul 2025 15:45:36 -0700 Subject: [PATCH 21/24] src/radmeth/radmeth_model.hpp: declaring scratch space for the p_v vector to avoid reallocation in the gradient --- src/radmeth/radmeth_model.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/radmeth/radmeth_model.hpp b/src/radmeth/radmeth_model.hpp index 3189ecd0..d7a01d63 100644 --- a/src/radmeth/radmeth_model.hpp +++ b/src/radmeth/radmeth_model.hpp @@ -75,6 +75,7 @@ struct Regression { Design design; SiteProportions props; double max_loglik{}; + std::vector p_v; // scratch space [[nodiscard]] std::size_t n_factors() const { From dd2e264afa29b75008214c3e67b6d3925eead0db Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Tue, 22 Jul 2025 15:46:48 -0700 Subject: [PATCH 22/24] src/radmeth/radmeth_optimize.cpp: slight changes to some numerical calculations and modifying the p_v in neg_gradient to be held in Regression to avoid reallocating --- src/radmeth/radmeth_optimize.cpp | 77 +++++++++++++++++--------------- 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index b787b964..8affe87e 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -35,8 +35,8 @@ pi(const std::vector &v, const gsl_vector *params) { // ADS: this function doesn't have a very helpful name const auto a = v.data(); const double dot = std::inner_product(a, a + std::size(v), params->data, 0.0); - // const double p = -1.0 / std::expm1(-dot_prod); ? - return std::exp(dot) / (1.0 + std::exp(dot)); + // const double p = std::exp(dot) / (1.0 + std::exp(dot)); + return 1.0 / (1.0 / std::exp(dot) + 1.0); } [[nodiscard]] static double @@ -46,29 +46,32 @@ neg_loglik(const gsl_vector *params, void *object) { // ADS: the dispersion parameter phi is the last element of // parameter vector const double disp_param = gsl_vector_get(params, reg->design.n_factors()); - const double phi = std::exp(disp_param) / (1.0 + std::exp(disp_param)); + // const double phi = std::exp(disp_param) / (1.0 + std::exp(disp_param)); + const double phi = 1.0 / (1.0 / std::exp(disp_param) + 1.0); const double one_minus_phi = 1.0 - phi; const auto &mc = reg->props.mc; + const auto &mat = reg->design.matrix; double log_lik = 0; const std::size_t n_samples = reg->design.n_samples(); for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { - const double n = mc[col_idx].n_reads; - const double y = mc[col_idx].n_meth; - const double p = pi(reg->design.matrix[col_idx], params); + const auto n = mc[col_idx].n_reads; + const auto y = mc[col_idx].n_meth; + const double p = pi(mat[col_idx], params); const double one_minus_p = 1.0 - p; const double term1 = one_minus_phi * p; - for (int k = 0; k < y; ++k) + for (auto k = 0u; k < y; ++k) log_lik += std::log(term1 + phi * k); const double term2 = one_minus_phi * one_minus_p; - for (int k = 0; k < n - y; ++k) + for (auto k = 0u; k < n - y; ++k) log_lik += std::log(term2 + phi * k); - for (int k = 0; k < n; ++k) - log_lik -= std::log(1.0 + phi * (k - 1.0)); + for (auto k = 0u; k < n; ++k) + // log_lik -= std::log(1.0 + phi * (k - 1.0)); + log_lik -= std::log1p(phi * (k - 1.0)); } return -log_lik; } @@ -79,25 +82,27 @@ neg_gradient(const gsl_vector *params, void *object, gsl_vector *output) { const std::size_t n_samples = reg->design.n_samples(); const std::size_t n_factors = reg->design.n_factors(); - std::vector p_v(n_samples, 0.0); + /// ADS: using scratch space held in reg instead of allocating here + // std::vector p_v(n_samples, 0.0); + auto &p_v = reg->p_v; for (auto col_idx = 0u; col_idx < n_samples; ++col_idx) p_v[col_idx] = pi(reg->design.matrix[col_idx], params); const double disp_param = gsl_vector_get(params, n_factors); - const auto &tmatrix = reg->design.tmatrix; - const double phi = std::exp(disp_param) / (1.0 + std::exp(disp_param)); - // const double phi = -1.0 / std::expm1(-disp_param); + const auto &tmat = reg->design.tmatrix; + // const double phi = std::exp(disp_param) / (1.0 + std::exp(disp_param)); + const double phi = 1.0 / (1.0 / std::exp(disp_param) + 1.0); const double one_minus_phi = 1.0 - phi; const auto &mc = reg->props.mc; - for (std::size_t f = 0; f < n_factors; ++f) { + for (std::size_t factor_idx = 0; factor_idx < n_factors; ++factor_idx) { double deriv = 0; - const auto &vec = tmatrix[f]; + const auto &vec = tmat[factor_idx]; for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { - const double n = mc[col_idx].n_reads; - const double y = mc[col_idx].n_meth; + const auto n = mc[col_idx].n_reads; + const auto y = mc[col_idx].n_meth; const double p = p_v[col_idx]; const double one_minus_p = 1.0 - p; @@ -108,48 +113,47 @@ neg_gradient(const gsl_vector *params, void *object, gsl_vector *output) { double term = 0; - const auto accumulate_term = [&](int k, const int lim, - const double denom_base) { + const auto accum_term = [&](auto k, const auto lim, + const double denom_base) { for (; k < lim; ++k) term += 1.0 / (denom_base + phi * k); }; - const auto accumulate_subtract_term = [&](int k, const int lim, - const double denom_base) { + const auto accum_subtract_term = [&](auto k, const auto lim, + const double denom_base) { for (; k < lim; ++k) term -= 1.0 / (denom_base + phi * k); }; - accumulate_term(0, y, denom_term1); - accumulate_subtract_term(0, n - y, one_minus_phi * one_minus_p); + accum_term(0u, y, denom_term1); + accum_subtract_term(0u, n - y, one_minus_phi * one_minus_p); deriv += term * factor; } - gsl_vector_set(output, f, deriv); + gsl_vector_set(output, factor_idx, deriv); } double deriv = 0; for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { - const double n = mc[col_idx].n_reads; - const double y = mc[col_idx].n_meth; + const auto n = mc[col_idx].n_reads; + const auto y = mc[col_idx].n_meth; const double p = p_v[col_idx]; const double one_minus_p = 1.0 - p; double term = 0; - const auto accumulate_term = [&](int k, const int lim, - const double num_shift, - const double denom_base) { + const auto accum_term = [&](auto k, const auto lim, const double num_shift, + const double denom_base) { for (; k < lim; ++k) term += (k - num_shift) / (denom_base + phi * k); }; - const auto accumulate_subtract_term = [&](int k, const int lim) { + const auto accum_subtract_term = [&](auto k, const auto lim) { for (; k < lim; ++k) term -= (k - 1.0) / (1.0 + phi * (k - 1.0)); }; - accumulate_term(0, y, p, one_minus_phi * p); - accumulate_term(0, n - y, one_minus_p, one_minus_phi * one_minus_p); - accumulate_subtract_term(0, n); + accum_term(0u, y, p, one_minus_phi * p); + accum_term(0u, n - y, one_minus_p, one_minus_phi * one_minus_p); + accum_subtract_term(0u, n); deriv += term * phi * one_minus_phi; } @@ -176,6 +180,8 @@ fit_regression_model(Regression &r, std::vector ¶ms_init) { params_init.back() = -2.5; } + r.p_v.resize(r.n_samples()); + const double tolerance = std::sqrt(n_params) * r.n_samples() * Regression::tolerance; @@ -219,8 +225,7 @@ fit_regression_model(Regression &r, std::vector ¶ms_init) { // check status from gradient status = gsl_multimin_test_gradient(s->gradient, tolerance); } while (status == GSL_CONTINUE && ++iter < max_iter); - // It it reasonable to reduce the number of iterations to 500? - // ADS: 700 vs. 500? what's the difference? + // ADS: What is a reasonable number of iterations? r.max_loglik = -1.0 * neg_loglik(s->x, &r); From a767a6ef97bd134699b7cbfd189a5300a8b6970a Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Tue, 22 Jul 2025 16:30:52 -0700 Subject: [PATCH 23/24] src/utils/selectsites.cpp: adding an option to read sites with additional fields -- noticed when analysing output from radmeth --- src/utils/selectsites.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/utils/selectsites.cpp b/src/utils/selectsites.cpp index 43b288a7..b0389ac4 100644 --- a/src/utils/selectsites.cpp +++ b/src/utils/selectsites.cpp @@ -294,6 +294,7 @@ main_selectsites(int argc, char *argv[]) { bool VERBOSE = false; bool keep_file_on_disk = false; bool compress_output = false; + bool allow_extra_fields = false; std::string outfile("-"); std::string summary_file; @@ -315,6 +316,8 @@ main_selectsites(int argc, char *argv[]) { opt_parse.add_opt("summary", 'S', "write summary to this file", false, summary_file); opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output); + opt_parse.add_opt("relaxed", '\0', "input has extra fields", false, + allow_extra_fields); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); opt_parse.set_show_defaults(); std::vector leftover_args; @@ -340,6 +343,8 @@ main_selectsites(int argc, char *argv[]) { const std::string sites_file = leftover_args.back(); /****************** END COMMAND LINE OPTIONS *****************/ + MSite::no_extra_fields = (allow_extra_fields == false); + selectsites_summary summary; summary.command_line = get_command_line(argc, argv); From 2595624a279042e6821188887c80148e2733affc Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Tue, 22 Jul 2025 16:32:00 -0700 Subject: [PATCH 24/24] docs/content/radmeth.md: updating numbers for changes in radmeth code --- docs/content/radmeth.md | 52 ++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/docs/content/radmeth.md b/docs/content/radmeth.md index 4bbcdc37..d46dc108 100644 --- a/docs/content/radmeth.md +++ b/docs/content/radmeth.md @@ -116,7 +116,15 @@ neighboring CpGs using [radadjust](../radadjust). ## Tutorial -This tutorial aims to answer questions users often have about the assumptions +The data used here is available at: + +```txt +https://github.com/andrewdavidsmith/radmeth-demo-simulated-data +``` + +And the results below were generated with v1.5.1 on Linux. + +This tutorial should answer questions users often have about the assumptions of radmeth and how to interepret the results. We will assume a factor of interest, with levels A and B. We will also assume a potentially confounding factor, sex, with levels M and F. Further, we have 8 total samples, 2 samples @@ -274,16 +282,16 @@ differential methylation between levels for our factor of interest (named ```console $ head sim/samples.radmeth -chr1 163 + CG 0.0864953 35 22 45 36 -chr1 206 + CG 0.444024 37 30 43 38 -chr1 232 + CG 0.756591 28 20 35 26 -chr1 278 + CG 0.780385 43 35 37 31 -chr1 296 + CG 0.64704 33 29 38 32 -chr1 310 + CG 0.640826 37 28 31 22 -chr1 322 + CG 0.0971364 38 33 47 35 -chr1 324 + CG 0.257851 40 29 31 26 -chr1 350 + CG 0.893476 36 30 35 29 -chr1 356 + CG 0.0527032 37 35 42 33 +chr1 163 + CG 0.0877822 35 22 45 36 +chr1 206 + CG 0.442745 37 30 43 38 +chr1 232 + CG 0.756925 28 20 35 26 +chr1 278 + CG 0.794626 43 35 37 31 +chr1 296 + CG 0.635986 33 29 38 32 +chr1 310 + CG 0.641986 37 28 31 22 +chr1 322 + CG 0.0973903 38 33 47 35 +chr1 324 + CG 0.260703 40 29 31 26 +chr1 350 + CG 0.888242 36 30 35 29 +chr1 356 + CG 0.0535286 37 35 42 33 ``` Focusing on the 5th column, we see values between 0 and 1, but none of them @@ -294,7 +302,7 @@ for all 4 different original sets of features: ```shell for i in M F A B; do - dnmtools selectsites -o sim/features_${i}.txt sim/features_${i}.bed sim/samples.radmeth; + dnmtools selectsites -relaxed -o sim/features_${i}.txt sim/features_${i}.bed sim/samples.radmeth; done ``` @@ -303,11 +311,11 @@ above identifies sites as significant when they differ between levels A and B of the factor of interest: ```console -$ for i in sim/features_*.counts; do echo $i `awk 'BEGIN{k=0;c=0}{k+=$5;c+=1}END{print k,c,k/c}' $i`; done | column -t -sim/features_A.counts 0.255448 430 0.000594064 -sim/features_B.counts 0.114256 372 0.00030714 -sim/features_F.counts 196.093 400 0.490232 -sim/features_M.counts 295.234 588 0.502099 +$ for i in sim/features_[ABFM].txt; do echo $i `awk 'BEGIN{k=0;c=0}{k+=$5;c+=1}END{print k,c,k/c}' $i`; done | column -t +sim/features_A.txt 0.256072 430 0.000595517 +sim/features_B.txt 0.114786 372 0.000308564 +sim/features_F.txt 197.332 400 0.493329 +sim/features_M.txt 297.728 588 0.50634 ``` The p-values in features associated with either M or F are averaging around @@ -344,12 +352,12 @@ analogous but with this modified design matrix: ```console $ dnmtools radmeth -o sim/samples_noint.radmeth -f factor design_noint.txt sim/table.txt -$ for i in M F A B; do dnmtools selectsites -o sim/features_noint_${i}.txt sim/features_${i}.bed sim/samples_noint.radmeth; done +$ for i in M F A B; do dnmtools selectsites -relaxed -o sim/features_noint_${i}.txt sim/features_${i}.bed sim/samples_noint.radmeth; done $ for i in sim/*_noint_*.txt; do echo $i `awk 'BEGIN{k=0;c=0}{k+=$5;c+=1}END{print k,c,k/c}' $i`; done | column -t -sim/features_noint_A.txt 8.27788 430 0.0192509 -sim/features_noint_B.txt 15.2391 372 0.0409653 -sim/features_noint_F.txt 77.2438 400 0.193109 -sim/features_noint_M.txt 93.7181 588 0.159385 +sim/features_noint_A.txt 8.27931 430 0.0192542 +sim/features_noint_B.txt 15.2397 372 0.040967 +sim/features_noint_F.txt 77.2656 400 0.193164 +sim/features_noint_M.txt 93.7312 588 0.159407 ``` Although the sites within features corresponding to A and B are much lower on