From 7f0db95d4d85294ff7057c361c57d4420c39db53 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Aug 2025 14:55:34 -0700 Subject: [PATCH 1/2] src/radmeth/radmeth.cpp: some cleanup --- src/radmeth/radmeth.cpp | 98 ++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 54 deletions(-) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index 0fa8fcbe..b5b5ba09 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -1,9 +1,7 @@ -/* Copyright (C) 2013-2023 University of Southern California and - * Egor Dolzhenko - * Andrew D Smith - * Guilherme Sena +/* Copyright (C) 2013-2025 Andrew D Smith * - * Authors: Andrew D. Smith and Egor Dolzhenko and Guilherme Sena + * Author: Andrew D. Smith + * Contributors: Andrew D. Smith and Egor Dolzhenko and Guilherme Sena * * This program is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -16,6 +14,15 @@ * General Public License for more details. */ +#include "radmeth_model.hpp" +#include "radmeth_optimize.hpp" + +// smithlab headers +#include "GenomicRegion.hpp" +#include "OptionParser.hpp" +#include "smithlab_os.hpp" +#include "smithlab_utils.hpp" + #include // GSL header for chisqrd distribution #include @@ -29,20 +36,6 @@ #include #include -// smithlab headers -#include "GenomicRegion.hpp" -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" - -#include "radmeth_model.hpp" -#include "radmeth_optimize.hpp" - -using std::begin; -using std::cout; -using std::distance; -using std::end; - struct file_progress { double one_hundred_over_filesize{}; std::size_t prev_offset{}; @@ -67,7 +60,6 @@ remove_factor(Design &design, const std::size_t factor_idx) { design.matrix[i].erase(std::begin(design.matrix[i]) + factor_idx); } -/***************** RADMETH ALGORITHM *****************/ static bool consistent_sample_names(const Regression ®, const std::string &header) { std::istringstream iss(header); @@ -84,7 +76,7 @@ consistent_sample_names(const Regression ®, const std::string &header) { // function outputs the p-value of the log-likelihood ratio. *Note* that it is // assumed that the reduced model has one fewer factor than the reduced model. static double -llr_test(double null_loglik, double full_loglik) { +llr_test(const double null_loglik, const double full_loglik) { // The log-likelihood ratio statistic. const double log_lik_stat = -2 * (null_loglik - full_loglik); @@ -100,16 +92,15 @@ llr_test(double null_loglik, double full_loglik) { } static bool -has_low_coverage(const Regression ®, const std::size_t test_fac) { +has_low_coverage(const Regression ®, const std::size_t test_factor) { bool cvrd_in_test_fact_smpls = false; + const auto &tcol = reg.design.tmatrix[test_factor]; for (std::size_t i = 0; i < reg.n_samples() && !cvrd_in_test_fact_smpls; ++i) - cvrd_in_test_fact_smpls = - (reg.design.matrix[i][test_fac] == 1 && reg.props.mc[i].n_reads != 0); + cvrd_in_test_fact_smpls = (tcol[i] == 1 && reg.props.mc[i].n_reads != 0); bool cvrd_in_other_smpls = false; for (std::size_t i = 0; i < reg.n_samples() && !cvrd_in_other_smpls; ++i) - cvrd_in_other_smpls = - (reg.design.matrix[i][test_fac] != 1 && reg.props.mc[i].n_reads != 0); + cvrd_in_other_smpls = (tcol[i] != 1 && reg.props.mc[i].n_reads != 0); return !cvrd_in_test_fact_smpls || !cvrd_in_other_smpls; } @@ -131,9 +122,9 @@ has_extreme_counts(const Regression ®) { [[nodiscard]] static bool has_two_values(const Regression ®, const std::size_t test_factor) { - const auto &v = reg.design.tmatrix[test_factor]; - for (auto i = std::cbegin(v); i != std::cend(v); ++i) - if (*i != v[0]) + const auto &tcol = reg.design.tmatrix[test_factor]; + for (const auto x : tcol) + if (x != tcol[0]) return true; return false; } @@ -145,7 +136,7 @@ get_test_factor_idx(const Regression &model, const std::string &test_factor) { std::find(std::cbegin(factors), std::cend(factors), test_factor); if (itr == std::cend(factors)) - throw std::runtime_error("Factor not part of design: " + test_factor); + throw std::runtime_error("factor not part of design: " + test_factor); return std::distance(std::cbegin(factors), itr); } @@ -154,16 +145,15 @@ static void read_design(const bool verbose, const std::string &design_filename, Design &design) { if (verbose) - std::cerr << "design table filename: " << design_filename << std::endl; + std::cerr << "design table filename: " << design_filename << '\n'; std::ifstream design_file(design_filename); if (!design_file) throw std::runtime_error("could not open file: " + design_filename); // initialize full design matrix from file design_file >> design; - if (verbose) - std::cerr << design << std::endl; + std::cerr << design << '\n'; } enum class row_status : std::uint8_t { @@ -174,11 +164,11 @@ enum class row_status : std::uint8_t { }; [[nodiscard]] static std::vector -drop_idx(const std::vector &v, const std::size_t to_drop) { +drop_idx(const std::vector &v, const std::size_t idx_to_drop) { std::vector u; u.reserve(std::size(v) - 1); for (auto i = 0u; i < std::size(v); ++i) - if (i != to_drop) + if (i != idx_to_drop) u.push_back(v[i]); return u; } @@ -218,12 +208,16 @@ radmeth(const bool show_progress, const bool more_na_info, std::string sample_names_header; std::getline(table_file, sample_names_header); - if (!consistent_sample_names(alt_model, sample_names_header)) - throw std::runtime_error( - "header:\n" + sample_names_header + "\n" + - "does not match factor names or their order in the\n" - "design matrix. Check that the design matrix and\n" - "the proportion table are correctly formatted."); + if (!consistent_sample_names(alt_model, sample_names_header)) { + // clang-format off + const auto message = + R"( +does not match factor names or their order in the design matrix. Check +that the design matrix and the proportion table are correctly formatted. +)"; + // clang-format on + throw std::runtime_error("header:\n" + sample_names_header + message); + } const std::size_t n_samples = alt_model.design.n_samples(); @@ -266,9 +260,9 @@ radmeth(const bool show_progress, const bool more_na_info, throw std::runtime_error("found row with wrong number of columns"); const auto [p_val, status] = [&]() -> std::tuple { - // Skip the test if (1) no coverage in all cases or in all controls, - // or (2) the site is completely methylated or completely - // unmethylated across all samples. + // Skip the test if (1) no coverage in all cases or in all + // controls, or (2) the site is completely methylated or + // completely unmethylated across all samples. if (has_low_coverage(t_alt_model, test_factor_idx)) return std::tuple{1.0, row_status::na_low_cov}; @@ -336,11 +330,11 @@ radmeth(const bool show_progress, const bool more_na_info, cursor += n_pval_bytes; // clang-format off - const int n_suffix_bytes = std::sprintf(cursor, suffix_fmt, - n_reads_factor, - n_meth_factor, - n_reads_others, - n_meth_others); + const int n_suffix_bytes = std::sprintf(cursor, suffix_fmt, + n_reads_factor, + n_meth_factor, + n_reads_others, + n_meth_others); // clang-format on if (n_suffix_bytes < 0) return n_suffix_bytes; @@ -368,10 +362,6 @@ radmeth(const bool show_progress, const bool more_na_info, } } -/*********************************************************************** - * Run beta-binoimial regression using the specified table with - * proportions and design matrix - */ int main_radmeth(int argc, char *argv[]) { try { @@ -455,7 +445,7 @@ main_radmeth(int argc, char *argv[]) { orig_alt_model, orig_null_model, test_factor_idx); } catch (const std::exception &e) { - std::cerr << "ERROR: " << e.what() << std::endl; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From b366af136a867fa390073165c6a3be91c5ae1b32 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Aug 2025 19:29:00 -0700 Subject: [PATCH 2/2] src/utils/merge-methcounts.cpp: modernizing some code --- src/utils/merge-methcounts.cpp | 429 ++++++++++++++++----------------- 1 file changed, 204 insertions(+), 225 deletions(-) diff --git a/src/utils/merge-methcounts.cpp b/src/utils/merge-methcounts.cpp index 08470b83..8aec4083 100644 --- a/src/utils/merge-methcounts.cpp +++ b/src/utils/merge-methcounts.cpp @@ -1,9 +1,9 @@ /* merge-methcounts: a program for merging methcounts files * - * Copyright (C) 2011-2022 University of Southern California and + * Copyright (C) 2011-2025 University of Southern California and * Andrew D. Smith * - * Authors: Benjamin E Decato, Meng Zhou and Andrew D Smith + * Authors: Andrew D Smith * * This program is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -16,64 +16,59 @@ * General Public License for more details. */ +#include "MSite.hpp" +#include "OptionParser.hpp" +#include "smithlab_os.hpp" +#include "smithlab_utils.hpp" + +#include + +#include #include -#include -#include +#include #include #include -#include -#include +#include #include +#include #include -#include +#include #include -#include - -#include - -#include "OptionParser.hpp" -#include "smithlab_utils.hpp" -#include "smithlab_os.hpp" -#include "MSite.hpp" - -using std::string; -using std::vector; -using std::cout; -using std::cerr; -using std::endl; -using std::runtime_error; -using std::numeric_limits; -using std::unordered_set; -using std::unordered_map; - -using bamxx::bgzf_file; +#include +#include static void -set_invalid(MSite &s) {s.pos = numeric_limits::max();} +set_invalid(MSite &s) { + s.pos = std::numeric_limits::max(); +} static bool -is_valid(const MSite &s) {return s.pos != numeric_limits::max();} +is_valid(const MSite &s) { + return s.pos != std::numeric_limits::max(); +} -static bool -any_sites_unprocessed(const vector &filenames, - const vector &infiles, - vector &outdated, vector &sites) { +[[nodiscard]] static bool +any_sites_unprocessed( + const std::vector &filenames, + const std::vector> &infiles, + std::vector &outdated, std::vector &sites) { - const size_t n_files = sites.size(); + const std::size_t n_files = std::size(sites); bool sites_remain = false; - for (size_t i = 0; i < n_files; ++i) { + for (std::size_t i = 0; i < n_files; ++i) { if (outdated[i]) { outdated[i] = false; MSite tmp_site; if (read_site(*infiles[i], tmp_site)) { // ADS: chrom order within a file already tested if (tmp_site.pos <= sites[i].pos && tmp_site.chrom == sites[i].chrom) - throw runtime_error("sites not sorted in " + filenames[i]); + throw std::runtime_error("sites not sorted in " + filenames[i]); sites_remain = true; sites[i] = tmp_site; } - else set_invalid(sites[i]); + else + set_invalid(sites[i]); } else if (is_valid(sites[i])) sites_remain = true; @@ -81,42 +76,41 @@ any_sites_unprocessed(const vector &filenames, return sites_remain; } - -static bool -site_precedes(const vector &sites, - const unordered_map &chroms_order, - const size_t a, const size_t b) { +[[nodiscard]] static bool +site_precedes(const std::vector &sites, + const std::unordered_map &chroms_order, + const std::size_t a, const std::size_t b) { if (chroms_order.empty()) return sites[a] < sites[b]; - const size_t a_chr = chroms_order.find(sites[a].chrom)->second; - const size_t b_chr = chroms_order.find(sites[b].chrom)->second; + const std::size_t a_chr = chroms_order.find(sites[a].chrom)->second; + const std::size_t b_chr = chroms_order.find(sites[b].chrom)->second; - return ((a_chr < b_chr) || - (a_chr == b_chr && sites[a].pos < sites[b].pos)); + return a_chr < b_chr || (a_chr == b_chr && sites[a].pos < sites[b].pos); } +[[nodiscard]] static std::size_t +find_minimum_site( + const std::vector &sites, + const std::unordered_map &chroms_order, + const std::vector &outdated) { -static size_t -find_minimum_site(const vector &sites, - const unordered_map &chroms_order, - const vector &outdated) { - - const size_t n_files = sites.size(); + const std::size_t n_files = std::size(sites); // ms_id is the id of the minimum site, the next one to print - size_t ms_id = numeric_limits::max(); + std::size_t ms_id = std::numeric_limits::max(); // make sure there is at least one remaining site to print - for (size_t i = 0; i < n_files && ms_id == numeric_limits::max(); ++i) + for (std::size_t i = 0; + i < n_files && ms_id == std::numeric_limits::max(); ++i) if (is_valid(sites[i]) && !outdated[i]) ms_id = i; - if (ms_id == numeric_limits::max()) - throw runtime_error("failed in a next site to print"); + if (ms_id == std::numeric_limits::max()) + throw std::runtime_error("failed in a next site to print"); // now find the earliest site to print among those that could be - for (size_t i = 0; i < n_files; ++i) + for (std::size_t i = 0; i < n_files; ++i) if (!outdated[i] && is_valid(sites[i]) && site_precedes(sites, chroms_order, i, ms_id)) ms_id = i; @@ -124,23 +118,23 @@ find_minimum_site(const vector &sites, return ms_id; } - -static bool +[[nodiscard]] static bool same_location(const MSite &a, const MSite &b) { return a.chrom == b.chrom && a.pos == b.pos; } -static size_t -collect_sites_to_print(const vector &sites, - const unordered_map &chroms_order, - const vector &outdated, - vector &to_print) { +[[nodiscard]] static std::size_t +collect_sites_to_print( + const std::vector &sites, + const std::unordered_map &chroms_order, + const std::vector &outdated, std::vector &to_print) { - const size_t n_files = sites.size(); + const std::size_t n_files = std::size(sites); - const size_t min_site_idx = find_minimum_site(sites, chroms_order, outdated); + const std::size_t min_site_idx = + find_minimum_site(sites, chroms_order, outdated); - for (size_t i = 0; i < n_files; ++i) + for (std::size_t i = 0; i < n_files; ++i) // condition below covers "is_valid(sites[i])" if (same_location(sites[min_site_idx], sites[i])) to_print[i] = true; @@ -148,41 +142,35 @@ collect_sites_to_print(const vector &sites, return min_site_idx; } - -static bool -any_mutated(const vector &to_print, - const vector &sites) { - size_t i = 0; - while (i < sites.size() && !(to_print[i] && sites[i].is_mutated())) +[[nodiscard]] static bool +any_mutated(const std::vector &to_print, + const std::vector &sites) { + std::size_t i = 0; + while (i < std::size(sites) && !(to_print[i] && sites[i].is_mutated())) ++i; - return (i < sites.size()); + return (i < std::size(sites)); } - static void write_line_for_tabular(const bool write_fractional, const bool report_any_mutated, - const size_t min_reads, - std::ostream &out, - const vector &to_print, - const vector &sites, - MSite min_site) { + const std::size_t min_reads, std::ostream &out, + const std::vector &to_print, + const std::vector &sites, MSite min_site) { - const size_t n_files = sites.size(); + const std::size_t n_files = std::size(sites); min_site.set_unmutated(); if (report_any_mutated && any_mutated(to_print, sites)) min_site.set_mutated(); // ADS: is this the format we want for the row names? - out << min_site.chrom << ':' - << min_site.pos << ':' - << min_site.strand << ':' + out << min_site.chrom << ':' << min_site.pos << ':' << min_site.strand << ':' << min_site.context; if (write_fractional) { - for (size_t i = 0; i < n_files; ++i) { - const size_t r = sites[i].n_reads; + for (std::size_t i = 0; i < n_files; ++i) { + const std::size_t r = sites[i].n_reads; if (to_print[i] && r >= min_reads) out << '\t' << sites[i].meth; else @@ -190,22 +178,21 @@ write_line_for_tabular(const bool write_fractional, } } else - for (size_t i = 0; i < n_files; ++i) { + for (std::size_t i = 0; i < n_files; ++i) { if (to_print[i]) out << '\t' << sites[i].n_reads << '\t' << sites[i].n_meth(); - else out << '\t' << 0 << '\t'<< 0; + else + out << '\t' << 0 << '\t' << 0; } out << '\n'; } static void -write_line_for_merged_counts(std::ostream &out, - const bool report_any_mutated, - const vector &to_print, - const vector &sites, - MSite min_site) { +write_line_for_merged_counts(std::ostream &out, const bool report_any_mutated, + const std::vector &to_print, + const std::vector &sites, MSite min_site) { - const size_t n_files = sites.size(); + const std::size_t n_files = std::size(sites); min_site.set_unmutated(); if (report_any_mutated && any_mutated(to_print, sites)) @@ -213,49 +200,50 @@ write_line_for_merged_counts(std::ostream &out, double meth_sum = 0; min_site.n_reads = 0; - for (size_t i = 0; i < n_files; ++i) + for (std::size_t i = 0; i < n_files; ++i) if (to_print[i]) { meth_sum += sites[i].n_meth(); min_site.n_reads += sites[i].n_reads; } - min_site.meth = meth_sum/std::max(1ul, min_site.n_reads); + min_site.meth = meth_sum / std::max(1ul, min_site.n_reads); out << min_site << '\n'; } - -static string +[[nodiscard]] static std::string remove_extension(const std::string &filename) { - const size_t last_dot = filename.find_last_of("."); - if (last_dot == std::string::npos) return filename; - else return filename.substr(0, last_dot); + const std::size_t last_dot = filename.find_last_of("."); + if (last_dot == std::string::npos) + return filename; + else + return filename.substr(0, last_dot); } - -static string -remove_suffix(const string &suffix, const std::string &filename) { - if (filename.substr(filename.size() - suffix.size(), suffix.size()) == suffix) - return filename.substr(0, filename.size() - suffix.size()); +[[nodiscard]] static std::string +remove_suffix(const std::string &suffix, const std::string &filename) { + if (filename.substr(std::size(filename) - std::size(suffix), + std::size(suffix)) == suffix) + return filename.substr(0, std::size(filename) - std::size(suffix)); return filename; } - static void -get_orders_by_file(const string &filename, vector &chroms_order) { - - bgzf_file in(filename, "r"); - if (!in) throw runtime_error("bad file: " + filename); +get_orders_by_file(const std::string &filename, + std::vector &chroms_order) { + bamxx::bgzf_file in(filename, "r"); + if (!in) + throw std::runtime_error("bad file: " + filename); chroms_order.clear(); - unordered_set chroms_seen; - string line; - string prev_chrom; + std::unordered_set chroms_seen; + std::string line; + std::string prev_chrom; while (getline(in, line)) { line.resize(line.find_first_of(" \t")); if (line != prev_chrom) { - if (chroms_seen.find(line) != end(chroms_seen)) - throw runtime_error("chroms out of order in: " + filename); + if (chroms_seen.find(line) != std::cend(chroms_seen)) + throw std::runtime_error("chroms out of order in: " + filename); chroms_seen.insert(line); chroms_order.push_back(line); std::swap(line, prev_chrom); @@ -263,94 +251,88 @@ get_orders_by_file(const string &filename, vector &chroms_order) { } } - static void -get_chroms_order(const vector &filenames, - unordered_map &chroms_order) { +get_chroms_order(const std::vector &filenames, + std::unordered_map &chroms_order) { // get order of chroms in each file - vector> orders(filenames.size()); - for (size_t i = 0; i < filenames.size(); ++i) + std::vector> orders(std::size(filenames)); + for (std::size_t i = 0; i < std::size(filenames); ++i) get_orders_by_file(filenames[i], orders[i]); // get the union of chrom sets - unordered_set the_union; - for (size_t i = 0; i < orders.size(); ++i) - for (size_t j = 0; j < orders[i].size(); ++j) + std::unordered_set the_union; + for (std::size_t i = 0; i < std::size(orders); ++i) + for (std::size_t j = 0; j < std::size(orders[i]); ++j) the_union.insert(orders[i][j]); // get an adjacency list and in-degree for each node - unordered_map> adj_lists; - unordered_map in_degree; + std::unordered_map> adj_lists; + std::unordered_map in_degree; for (auto &&i : the_union) { in_degree[i] = 0; - adj_lists[i] = vector(); + adj_lists[i] = std::vector(); } for (auto &&i : orders) { - auto j = begin(i); - for (auto k = j + 1; k != end(i); ++j, ++k) { + auto j = std::cbegin(i); + for (auto k = j + 1; k != std::cend(i); ++j, ++k) { adj_lists[*j].push_back(*k); ++in_degree[*k]; } } - std::queue q; // invariant: nodes with no incoming edge + std::queue q; // invariant: nodes with no incoming edge for (auto &&i : the_union) if (in_degree[i] == 0) q.push(i); while (!q.empty()) { - const string u = q.front(); + const std::string u = q.front(); q.pop(); // iterate over the edges (u, v) for (auto &&v : adj_lists[u]) { - --in_degree[v]; // degree should not appear here as 0 - if (in_degree[v] == 0) // this should only happen once per v + --in_degree[v]; // degree should not appear here as 0 + if (in_degree[v] == 0) // this should only happen once per v q.push(v); } - adj_lists[u].clear(); // delete node; already had in_degree 0 + adj_lists[u].clear(); // delete node; already had in_degree 0 - chroms_order.insert(make_pair(u, chroms_order.size())); + chroms_order.insert(std::make_pair(u, std::size(chroms_order))); } // finally, make sure we found a consistent order for (auto &&i : adj_lists) if (!i.second.empty()) - throw runtime_error("inconsistent order of chroms between files"); + throw std::runtime_error("inconsistent order of chroms between files"); } - - /* - This utility does two things, and they are grouped together here - because of how they are done, not because the uses are related. (1) - merge-methcounts can take a set of methcounts output files and - combine them into one. There are several reasons a user might want - to do this. An example is when technical replicates are performed, - and analyzed separately to understand technical variance (e.g., - between sequencing runs or library preps). After examining the - technical variation, subsequent analyses might be best conducted on - all the data together. So all the methcounts files can be combined - into one using merge-methcounts. In this case, the coverage at any - site is the sum of the coverages in the original methcounts files, - and the methylation level at any site is the weighted mean. (2) - merge-methcounts can take a set of methcounts output files, and - create a table that contains all the same information. The table - format is helpful if subsequent analyses are best done using a data - table, for example in R. When producing a tabular format, - merge-methcounts allows the user to select whether the desired + This utility does two things, and they are grouped together here because of + how they are done, not because the uses are related. (1) merge-methcounts + can take a set of methcounts output files and combine them into one. There + are several reasons a user might want to do this. An example is when + technical replicates are performed, and analyzed separately to understand + technical variance (e.g., between sequencing runs or library preps). After + examining the technical variation, subsequent analyses might be best + conducted on all the data together. So all the methcounts files can be + combined into one using merge-methcounts. In this case, the coverage at any + site is the sum of the coverages in the original methcounts files, and the + methylation level at any site is the weighted mean. (2) merge-methcounts can + take a set of methcounts output files, and create a table that contains all + the same information. The table format is helpful if subsequent analyses are + best done using a data table, for example in R. When producing a tabular + format, merge-methcounts allows the user to select whether the desired output is in counts or fractions. */ int main_merge_methcounts(int argc, char *argv[]) { - try { - static const string description = "merge multiple methcounts files"; + static const std::string description = "merge multiple methcounts files"; - string outfile; - bool VERBOSE; + std::string outfile; + bool verbose = false; bool report_any_mutated = false; bool write_tabular_format = false; bool write_fractional = false; @@ -358,25 +340,27 @@ main_merge_methcounts(int argc, char *argv[]) { bool ignore_chroms_order = false; bool add_first_column_header = false; - string header_info; - string column_name_suffix = "RM"; - string suffix_to_remove; + std::string header_info; + std::string column_name_suffix = "RM"; + std::string suffix_to_remove; - size_t min_reads = 1; + std::size_t min_reads = 1; /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(strip_path(argv[0]), description, ""); opt_parse.add_opt("output", 'o', "output file name (default: stdout)", false, outfile); - opt_parse.add_opt("header", 'H',"header to print (ignored for tabular)", + opt_parse.add_opt("header", 'H', "header to print (ignored for tabular)", false, header_info); - opt_parse.add_opt("tabular", 't', "output as table", - false, write_tabular_format); - opt_parse.add_opt("radmeth", '\0', "make radmeth input " + opt_parse.add_opt("tabular", 't', "output as table", false, + write_tabular_format); + opt_parse.add_opt("radmeth", '\0', + "make radmeth input " "(use with tabular and without fractional)", false, radmeth_format); - opt_parse.add_opt("remove", '\0', "remove this suffix from filenames when " + opt_parse.add_opt("remove", '\0', + "remove this suffix from filenames when " "making column names; default removes from final dot", false, suffix_to_remove); opt_parse.add_opt("suff", 's', @@ -386,117 +370,119 @@ main_merge_methcounts(int argc, char *argv[]) { false, column_name_suffix); opt_parse.add_opt("fractional", 'f', "output fractions (requires tabular)", false, write_fractional); - opt_parse.add_opt("reads", 'r', "min reads (for fractional)", - false, min_reads); - opt_parse.add_opt("ignore", '\0',"Do not attempt to determine chromosome. " + opt_parse.add_opt("reads", 'r', "min reads (for fractional)", false, + min_reads); + opt_parse.add_opt("ignore", '\0', + "Do not attempt to determine chromosome. " "Lexicographic order will be used.", false, ignore_chroms_order); - opt_parse.add_opt("mut", 'm',"If any of the sites being merged indicates " + opt_parse.add_opt("mut", 'm', + "If any of the sites being merged indicates " "mutated, mark the result has mutated.", false, report_any_mutated); - opt_parse.add_opt("1st-col-header", '\0',"add name for 1st col in header", + opt_parse.add_opt("1st-col-header", '\0', "add name for 1st col in header", false, add_first_column_header); - opt_parse.add_opt("verbose", 'v',"print more run info", false, VERBOSE); + opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); opt_parse.set_show_defaults(); - vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (write_fractional && !write_tabular_format) { - cerr << "fractional output only available for tabular format" << endl; + std::cerr << "fractional output only available for tabular format\n"; return EXIT_SUCCESS; } if (leftover_args.empty()) { - cerr << opt_parse.help_message() << endl; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } - if (column_name_suffix.size() != 2) { - cerr << "column name suffix must be 2 letters" << endl; + if (std::size(column_name_suffix) != 2) { + std::cerr << "column name suffix must be 2 letters\n"; return EXIT_SUCCESS; } - vector meth_files(leftover_args); + std::vector meth_files(leftover_args); /****************** END COMMAND LINE OPTIONS *****************/ - unordered_map chroms_order; + std::unordered_map chroms_order; if (!ignore_chroms_order) { - if (VERBOSE) - cerr << "resolving chromosome order" << endl; + if (verbose) + std::cerr << "resolving chromosome order\n"; get_chroms_order(meth_files, chroms_order); - if (VERBOSE) { - cerr << "chromosome order" << endl; - vector v(chroms_order.size()); + if (verbose) { + std::cerr << "chromosome order\n"; + std::vector v(std::size(chroms_order)); for (auto &&i : chroms_order) v[i.second] = i.first; for (auto &&i : v) - cerr << i << endl; + std::cerr << i << '\n'; } } - const size_t n_files = meth_files.size(); + const std::size_t n_files = std::size(meth_files); - vector infiles(n_files); - for (size_t i = 0; i < n_files; ++i) { - infiles[i] = new bgzf_file(meth_files[i], "r"); + std::vector> infiles(n_files); + for (std::size_t i = 0; i < n_files; ++i) { + infiles[i] = std::make_unique(meth_files[i], "r"); if (!(*infiles[i])) - throw runtime_error("cannot open file: " + meth_files[i]); + throw std::runtime_error("cannot open file: " + meth_files[i]); } std::ofstream of; - if (!outfile.empty()) of.open(outfile); - std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); + if (!outfile.empty()) + of.open(outfile); + std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); // print header if user specifies or if tabular output format if (write_tabular_format) { - vector colnames; + std::vector colnames; for (const auto &i : meth_files) colnames.push_back(strip_path(i)); for (auto &i : colnames) - i = suffix_to_remove.empty() ? - remove_extension(i) : remove_suffix(suffix_to_remove, i); + i = suffix_to_remove.empty() ? remove_extension(i) + : remove_suffix(suffix_to_remove, i); if (!write_fractional && !radmeth_format) { - vector tmp; + std::vector tmp; for (auto &&i : colnames) { tmp.push_back(i + "_" + column_name_suffix[0]); tmp.push_back(i + "_" + column_name_suffix[1]); } - swap(tmp, colnames); + std::swap(tmp, colnames); } if (add_first_column_header) out << "site\t"; - copy(begin(colnames), end(colnames), - std::ostream_iterator(out, "\t")); - out << endl; + std::copy(std::cbegin(colnames), std::cend(colnames), + std::ostream_iterator(out, "\t")); + out << '\n'; } else if (!header_info.empty()) - out << "#" << header_info << endl; + out << "#" << header_info << '\n'; - vector sites(n_files); - vector outdated(n_files, true); - vector sites_to_print; // declared here to keep allocation - vector> chroms_seen(n_files); + std::vector sites(n_files); + std::vector outdated(n_files, true); + std::vector sites_to_print; // declared here to keep allocation + std::vector> chroms_seen(n_files); while (any_sites_unprocessed(meth_files, infiles, outdated, sites)) { - sites_to_print.clear(); sites_to_print.resize(n_files, false); // below idx is one index among the sites to print - const size_t idx = + const std::size_t idx = collect_sites_to_print(sites, chroms_order, outdated, sites_to_print); // output the appropriate sites' data @@ -504,21 +490,14 @@ main_merge_methcounts(int argc, char *argv[]) { write_line_for_tabular(write_fractional, report_any_mutated, min_reads, out, sites_to_print, sites, sites[idx]); else - write_line_for_merged_counts(out, report_any_mutated, - sites_to_print, sites, sites[idx]); + write_line_for_merged_counts(out, report_any_mutated, sites_to_print, + sites, sites[idx]); - swap(outdated, sites_to_print); + std::swap(outdated, sites_to_print); } - - for (auto &&f : infiles) - delete f; - } - catch (const runtime_error &e) { - cerr << e.what() << endl; - return EXIT_FAILURE; } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; + catch (const std::runtime_error &e) { + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS;