From ec1323df5f19d70667e3ff921a007c0ae35f2b16 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Thu, 5 Jun 2025 21:32:24 -0700 Subject: [PATCH 01/13] src/common/bam_record_utils.cpp: fixing a use after free bug --- src/common/bam_record_utils.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/common/bam_record_utils.cpp b/src/common/bam_record_utils.cpp index ec92f811..69f041cd 100644 --- a/src/common/bam_record_utils.cpp +++ b/src/common/bam_record_utils.cpp @@ -980,7 +980,8 @@ to_string(const bam_header &hdr, const bam_rec &aln) { if (ret < 0) { throw runtime_error("Can't format record: " + to_string(hdr, aln)); } + const std::string s = string(ks.s); if (ks.s != nullptr) free(ks.s); - return string(ks.s); + return s; } From a26ce482fdc8ba1cffd515087a9e644d58e2bf11 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Thu, 5 Jun 2025 21:33:02 -0700 Subject: [PATCH 02/13] src/analysis/nanopore.cpp: adding a source file to do the same thing as counts but with nanopore data --- src/analysis/nanopore.cpp | 649 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 649 insertions(+) create mode 100644 src/analysis/nanopore.cpp diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp new file mode 100644 index 00000000..5a8b49a3 --- /dev/null +++ b/src/analysis/nanopore.cpp @@ -0,0 +1,649 @@ +/* nanopore: counts for nanopore data + * + * Copyright (C) 2011-2025 University of Southern California and + * Andrew D. Smith + * + * Author: Andrew D. Smith + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include // for [u]int[0-9]+_t +#include +#include +#include +#include +#include +#include +#include +#include + +#include "OptionParser.hpp" +#include "bam_record_utils.hpp" +#include "bsutils.hpp" +#include "counts_header.hpp" +#include "dnmt_error.hpp" + +/* HTSlib */ +#include + +using std::cerr; +using std::endl; +using std::string; +using std::unordered_map; +using std::unordered_set; +using std::vector; + +using bamxx::bam_header; +using bamxx::bam_rec; +using bamxx::bgzf_file; + +struct quick_buf : public std::ostringstream, + public std::basic_stringbuf { + // ADS: By user ecatmur on SO; very fast. Seems to work... + quick_buf() { + // ...but this seems to depend on data layout + static_cast &>(*this).rdbuf(this); + } + void + clear() { + // reset buffer pointers (member functions) + setp(pbase(), pbase()); + } + char const * + c_str() { + /* between c_str and insertion make sure to clear() */ + *pptr() = '\0'; + return pbase(); + } +}; + +// ADS: we should never have to worry about coverage over > 32767 in +// any downstream analysis, so using "int16_t" here would allow to +// detect wrap around and report it as some kind of weird thing, maybe +// zeroing it and flagging the output. As it is now, if we really need +// >65535-fold coverage, we can make the change here. +typedef uint16_t count_type; + +static inline bool +eats_ref(const uint32_t c) { + return bam_cigar_type(bam_cigar_op(c)) & 2; +} + +static inline bool +eats_query(const uint32_t c) { + return bam_cigar_type(bam_cigar_op(c)) & 1; +} + +/* The three functions below here should probably be moved into + bsutils.hpp. I am not sure if the DDG function is needed, but it + seems like if one considers strand, and the CHH is not symmetric, + then one needs this. Also, Qiang should be consulted on this + because he spent much time thinking about it in the context of + plants. */ +static bool +is_chh(const std::string &s, size_t i) { + return (i < (s.length() - 2)) && is_cytosine(s[i]) && !is_guanine(s[i + 1]) && + !is_guanine(s[i + 2]); +} + +static bool +is_ddg(const std::string &s, size_t i) { + return (i < (s.length() - 2)) && !is_cytosine(s[i]) && + !is_cytosine(s[i + 1]) && is_guanine(s[i + 2]); +} + +static bool +is_c_at_g(const std::string &s, size_t i) { + return (i < (s.length() - 2)) && is_cytosine(s[i]) && + !is_cytosine(s[i + 1]) && !is_guanine(s[i + 1]) && + is_guanine(s[i + 2]); +} + +/* Right now the CountSet objects below are much larger than they need + to be, for the things we are computing. However, it's not clear + that the minimum information would really put the memory + requirement of the program into a more reasonable range, so keeping + all the information seems reasonable. */ +struct CountSet { + + string + tostring() const { + std::ostringstream oss; + oss << pos_prob << '\t' << neg_prob << '\t' << pos_n << '\t' << neg_n; + return oss.str(); + } + void + add_count_pos(const std::uint8_t x) { + pos_prob += x; + ++pos_n; + } + void + add_count_neg(const std::uint8_t x) { + neg_prob += x; + ++neg_n; + } + count_type + pos_total() const { + return pos_n; + } + count_type + neg_total() const { + return neg_n; + } + + count_type pos_prob{0}, neg_prob{0}; + count_type pos_n{0}, neg_n{0}; +}; + +/* The "tag" returned by this function should be exclusive, so that + * the order of checking conditions doesn't matter. There is also a + * bit of a hack in that the unsigned "pos" could wrap, but this still + * works as long as the chromosome size is not the maximum size of a + * size_t. + */ +static uint32_t +get_tag_from_genome(const string &s, const size_t pos) { + if (is_cytosine(s[pos])) { + if (is_cpg(s, pos)) + return 0; + else if (is_chh(s, pos)) + return 1; + else if (is_c_at_g(s, pos)) + return 2; + else + return 3; + } + if (is_guanine(s[pos])) { + if (is_cpg(s, pos - 1)) + return 0; + else if (is_ddg(s, pos - 2)) + return 1; + else if (is_c_at_g(s, pos - 2)) + return 2; + else + return 3; + } + return 4; // shouldn't be used for anything +} + +static const char *tag_values[] = { + "CpG", // 0 + "CHH", // 1 + "CXG", // 2 + "CCG", // 3 + "N", // 4 + "CpGx", // 5 <---- MUT_OFFSET + "CHHx", // 6 + "CXGx", // 7 + "CCGx", // 8 + "Nx" // 9 +}; +static const uint32_t MUT_OFFSET = 5; + +static inline uint32_t +tag_with_mut(const uint32_t tag, const bool mut) { + return tag + (mut ? MUT_OFFSET : 0); +} + +template +static void +write_output(const bam_header &hdr, bgzf_file &out, const int32_t tid, + const string &chrom, const vector &counts, + bool CPG_ONLY) { + + quick_buf buf; // keep underlying buffer space? + for (size_t i = 0; i < std::size(chrom); ++i) { + const char base = chrom[i]; + if (is_cytosine(base) || is_guanine(base)) { + + const uint32_t the_tag = get_tag_from_genome(chrom, i); + if (CPG_ONLY && the_tag != 0) + continue; + + const bool is_c = is_cytosine(base); + const double n_reads = is_c ? counts[i].pos_n : counts[i].neg_n; + double n_meth = is_c ? counts[i].pos_prob : counts[i].neg_prob; + n_meth = n_meth > 0.0 ? (n_meth / 256.0) : 0.0; + + if (require_covered && n_reads == 0) + continue; + + const bool mut = false; + buf.clear(); + // ADS: here is where we make an MSite, but not using MSite + buf << sam_hdr_tid2name(hdr, tid) << '\t' << i << '\t' + << (is_c ? '+' : '-') << '\t' + << tag_values[tag_with_mut(the_tag, mut)] << '\t' + << (n_reads > 0.0 ? n_meth / n_reads : 0.0) << '\t' << n_reads + << '\n'; + if (!out.write(buf.c_str(), buf.tellp())) + throw dnmt_error("error writing output"); + } + } +} + +static inline std::vector +get_ml_values(const bamxx::bam_rec &aln) { + static constexpr auto min_ml_size = 7; // ML:B:C + // Get the ML values + kstring_t s = {0, 0, nullptr}; + if (bam_aux_get_str(aln.b, "ML", &s) <= 0) { + ks_free(&s); + return {}; + } + std::string ml(ks_str(&s)); + ks_free(&s); + if (std::size(ml) < min_ml_size) + return {}; + ml = ml.substr(min_ml_size); + std::vector parts = smithlab::split(ml, ",", false); + std::vector probs; + for (const auto &p : parts) + probs.push_back(atoi(p.data())); + + auto prob_itr = std::cbegin(probs) + std::size(probs) / 2; + std::vector vals(get_l_qseq(aln), 0); + const auto seq = bam_get_seq(aln); + for (std::int32_t i = 0; i + 1 < get_l_qseq(aln); ++i) + if (seq_nt16_str[bam_seqi(seq, i)] == 'C' && + seq_nt16_str[bam_seqi(seq, i + 1)] == 'G') + vals[i] += *prob_itr++; + assert(prob_itr == std::cend(probs)); + + return vals; +} + +static inline std::vector +get_ml_values_rev(const bamxx::bam_rec &aln) { + static constexpr auto min_ml_size = 7; // ML:B:C + // Get the ML values + kstring_t s = {0, 0, nullptr}; + if (bam_aux_get_str(aln.b, "ML", &s) <= 0) { + ks_free(&s); + return {}; + } + std::string ml(ks_str(&s)); + ks_free(&s); + if (std::size(ml) < min_ml_size) + return {}; + ml = ml.substr(min_ml_size); + std::vector parts = smithlab::split(ml, ",", false); + std::vector probs; + for (const auto &p : parts) + probs.push_back(atoi(p.data())); + + auto prob_itr = std::cbegin(probs) + std::size(probs) / 2; + const std::uint32_t n = get_l_qseq(aln); + std::vector vals(n, 0); + const auto seq = bam_get_seq(aln); + for (std::int32_t i = 0; i + 1 < get_l_qseq(aln); ++i) + if (seq_nt16_str[bam_seqi(seq, i)] == 'C' && + seq_nt16_str[bam_seqi(seq, i + 1)] == 'G') + vals[n - i - 2] += *prob_itr++; + assert(prob_itr == std::cend(probs)); + + return vals; +} + +static void +count_states_pos(const bam_rec &aln, vector &counts) { + /* Move through cigar, reference and read positions without + inflating cigar or read sequence */ + const auto beg_cig = bam_get_cigar(aln); + const auto end_cig = beg_cig + get_n_cigar(aln); + + auto rpos = get_pos(aln); + auto qpos = 0; // to match type with b->core.l_qseq + + const auto probs = get_ml_values(aln); + if (probs.empty()) + return; + + auto prob_itr = std::cbegin(probs); + + for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { + const char op = bam_cigar_op(*c_itr); + const uint32_t n = bam_cigar_oplen(*c_itr); + if (eats_ref(op) && eats_query(op)) { + const decltype(qpos) end_qpos = qpos + n; + for (; qpos < end_qpos; ++qpos) + counts[rpos++].add_count_pos(*prob_itr++); + } + else if (eats_query(op)) { + qpos += n; + prob_itr += n; + } + else if (eats_ref(op)) { + rpos += n; + } + } + // ADS: somehow previous code included a correction for rpos going + // past the end of the chromosome; this should result at least in a + // soft-clip by any mapper. I'm not checking it here as even if it + // happens I don't want to terminate. + // assert(qpos == get_l_qseq(aln)); + if (qpos != get_l_qseq(aln)) { + std::cerr << qpos << '\t' << get_l_qseq(aln) << std::endl; + } +} + +[[maybe_unused]] static void +count_states_neg(const bam_rec &aln, vector &counts) { + /* Move through cigar, reference and (*backward*) through read + positions without inflating cigar or read sequence */ + const auto beg_cig = bam_get_cigar(aln); + const auto end_cig = beg_cig + get_n_cigar(aln); + size_t rpos = get_pos(aln); + size_t qpos = get_l_qseq(aln); // to match type with b->core.l_qseq + + const auto probs = get_ml_values_rev(aln); + if (probs.empty()) + return; + assert(std::size(probs) == qpos); + + auto prob_itr = std::cend(probs); + + if (qpos == 0) + return; + for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { + const char op = bam_cigar_op(*c_itr); + const uint32_t n = bam_cigar_oplen(*c_itr); + if (eats_ref(op) && eats_query(op)) { + assert(qpos >= n); + const size_t end_qpos = qpos - n; // to match type with qpos + for (; qpos > end_qpos; --qpos) + counts[rpos++].add_count_neg(*--prob_itr); + } + else if (eats_query(op)) { + qpos -= n; + prob_itr -= n; + } + else if (eats_ref(op)) { + rpos += n; + } + } + /* qpos is unsigned; would wrap around if < 0 */ + // ADS: Same as count_states_pos; see comment there + assert(qpos == 0); +} + +static unordered_map +get_tid_to_idx(const bam_header &hdr, + const unordered_map name_to_idx) { + unordered_map tid_to_idx; + for (int32_t i = 0; i < hdr.h->n_targets; ++i) { + // "curr_name" gives a "tid_to_name" mapping allowing to jump + // through "name_to_idx" and get "tid_to_idx" + const string curr_name(hdr.h->target_name[i]); + const auto name_itr(name_to_idx.find(curr_name)); + if (name_itr == end(name_to_idx)) + throw dnmt_error("failed to find chrom: " + curr_name); + tid_to_idx[i] = name_itr->second; + } + return tid_to_idx; +} + +template +static void +output_skipped_chromosome(const bool CPG_ONLY, const int32_t tid, + const unordered_map &tid_to_idx, + const bam_header &hdr, + const vector::const_iterator chroms_beg, + const vector &chrom_sizes, vector &counts, + bgzf_file &out) { + + // get the index of the next chrom sequence + const auto chrom_idx = tid_to_idx.find(tid); + if (chrom_idx == cend(tid_to_idx)) + throw dnmt_error("chrom not found: " + sam_hdr_tid2name(hdr, tid)); + + const auto chrom_itr = chroms_beg + chrom_idx->second; + + // reset the counts + counts.clear(); + counts.resize(chrom_sizes[chrom_idx->second]); + + write_output(hdr, out, tid, *chrom_itr, counts, CPG_ONLY); +} + +static bool +consistent_targets(const bam_header &hdr, + const unordered_map &tid_to_idx, + const vector &names, const vector &sizes) { + const size_t n_targets = hdr.h->n_targets; + if (n_targets != names.size()) + return false; + + for (size_t tid = 0; tid < n_targets; ++tid) { + const string tid_name_sam = sam_hdr_tid2name(hdr, tid); + const size_t tid_size_sam = sam_hdr_tid2len(hdr, tid); + const auto idx_itr = tid_to_idx.find(tid); + if (idx_itr == cend(tid_to_idx)) + return false; + const auto idx = idx_itr->second; + if (tid_name_sam != names[idx] || tid_size_sam != sizes[idx]) + return false; + } + return true; +} + +template +static void +process_reads(const bool VERBOSE, const bool show_progress, + const bool compress_output, const bool include_header, + const size_t n_threads, const string &infile, + const string &outfile, const string &chroms_file, + const bool CPG_ONLY) { + // first get the chromosome names and sequences from the FASTA file + vector chroms, names; + read_fasta_file_short_names(chroms_file, names, chroms); + for (auto &i : chroms) + transform(cbegin(i), cend(i), begin(i), + [](const char c) { return std::toupper(c); }); + if (VERBOSE) + cerr << "[n chroms in reference: " << chroms.size() << "]" << endl; + const auto chroms_beg = cbegin(chroms); + + unordered_map name_to_idx; + vector chrom_sizes(chroms.size(), 0); + for (size_t i = 0; i < chroms.size(); ++i) { + name_to_idx[names[i]] = i; + chrom_sizes[i] = chroms[i].size(); + } + + bamxx::bam_tpool tp(n_threads); // Must be destroyed after hts + + // open the hts SAM/BAM input file and get the header + bamxx::bam_in hts(infile); + if (!hts) + throw dnmt_error("failed to open input file"); + // load the input file's header + bam_header hdr(hts); + if (!hdr) + throw dnmt_error("failed to read header"); + + unordered_map tid_to_idx = get_tid_to_idx(hdr, name_to_idx); + + if (!consistent_targets(hdr, tid_to_idx, names, chrom_sizes)) + throw dnmt_error("inconsistent reference genome information"); + + // open the output file + const string output_mode = compress_output ? "w" : "wu"; + bgzf_file out(outfile, output_mode); + if (!out) + throw dnmt_error("error opening output file: " + outfile); + + // set the threads for the input file decompression + if (n_threads > 1) { + tp.set_io(hts); + tp.set_io(out); + } + + if (include_header) + write_counts_header_from_bam_header(hdr, out); + + // now iterate over the reads, switching chromosomes and writing + // output as needed + bam_rec aln; + int32_t prev_tid = -1; + + // this is where all the counts are accumulated + vector counts; + + vector::const_iterator chrom_itr{}; + + while (hts.read(hdr, aln)) { + const int32_t tid = get_tid(aln); + if (get_l_qseq(aln) == 0) + continue; + + if (tid == -1) // ADS: skip reads that have no tid -- they are not mapped + continue; + if (tid == prev_tid) { + if (bam_is_rev(aln)) + count_states_neg(aln, counts); + else + count_states_pos(aln, counts); + } + else { // chrom has changed, so output results and get the next chrom + + // write output if there is any; counts is empty only for first chrom + if (!counts.empty()) + write_output(hdr, out, prev_tid, *chrom_itr, counts, + CPG_ONLY); + // make sure reads are sorted chrom tid number in header + if (tid < prev_tid) { + const std::string message = "SAM file is not sorted " + "previous tid: " + + std::to_string(prev_tid) + + " current tid: " + std::to_string(tid); + throw dnmt_error(message); + } + + if (!require_covered) + for (auto i = prev_tid + 1; i < tid; ++i) + output_skipped_chromosome(CPG_ONLY, i, tid_to_idx, hdr, chroms_beg, + chrom_sizes, counts, out); + + // get the next chrom to process + auto chrom_idx(tid_to_idx.find(tid)); + if (chrom_idx == end(tid_to_idx)) + throw dnmt_error("chromosome not found: " + + string(sam_hdr_tid2name(hdr, tid))); + if (show_progress) + cerr << "processing " << sam_hdr_tid2name(hdr, tid) << endl; + + prev_tid = tid; + chrom_itr = chroms_beg + chrom_idx->second; + + // reset the counts + counts.clear(); + counts.resize(chrom_sizes[chrom_idx->second]); + } + } + if (!counts.empty()) + write_output(hdr, out, prev_tid, *chrom_itr, counts, + CPG_ONLY); + + // ADS: if some chroms might not be covered by reads, we have to + // iterate over what remains + if (!require_covered) + for (auto i = prev_tid + 1; i < hdr.h->n_targets; ++i) + output_skipped_chromosome(CPG_ONLY, i, tid_to_idx, hdr, chroms_beg, + chrom_sizes, counts, out); +} + +int +main_nanopore(int argc, const char **argv) { + + try { + + bool VERBOSE = false; + bool show_progress = false; + bool CPG_ONLY = false; + bool require_covered = false; + bool compress_output = false; + bool include_header = false; + + string chroms_file; + string outfile; + int n_threads = 1; + + /****************** COMMAND LINE OPTIONS ********************/ + OptionParser opt_parse(strip_path(argv[0]), + "get methylation levels from mapped nanopore reads", + "-c "); + opt_parse.add_opt("threads", 't', "threads to use (few needed)", false, + n_threads); + opt_parse.add_opt("output", 'o', "output file name (default: stdout)", + false, outfile); + opt_parse.add_opt("chrom", 'c', "reference genome file (FASTA format)", + true, chroms_file); + opt_parse.add_opt("cpg-only", 'n', "print only CpG context cytosines", + false, CPG_ONLY); + opt_parse.add_opt("require-covered", 'r', "only output covered sites", + false, require_covered); + opt_parse.add_opt("header", 'H', "add a header to identify the reference", + false, include_header); + opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output); + opt_parse.add_opt("progress", '\0', "show progress", false, show_progress); + opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); + vector leftover_args; + opt_parse.parse(argc, argv, leftover_args); + if (opt_parse.about_requested() || opt_parse.help_requested() || + leftover_args.empty()) { + cerr << opt_parse.help_message() << endl + << opt_parse.about_message() << endl; + return EXIT_SUCCESS; + } + if (opt_parse.option_missing()) { + cerr << opt_parse.option_missing_message() << endl; + return EXIT_SUCCESS; + } + const string mapped_reads_file = leftover_args.front(); + /****************** END COMMAND LINE OPTIONS *****************/ + + if (n_threads < 0) + throw dnmt_error("thread count cannot be negative"); + + std::ostringstream cmd; + copy(argv, argv + argc, std::ostream_iterator(cmd, " ")); + + // file types from HTSlib use "-" for the filename to go to stdout + if (outfile.empty()) + outfile = "-"; + + if (VERBOSE) + cerr << "[input BAM/SAM file: " << mapped_reads_file << "]" << endl + << "[output file: " << outfile << "]" << endl + << "[output format: " << (compress_output ? "bgzf" : "text") << "]" + << endl + << "[genome file: " << chroms_file << "]" << endl + << "[threads requested: " << n_threads << "]" << endl + << "[CpG only mode: " << (CPG_ONLY ? "yes" : "no") << "]" << endl + << "[command line: \"" << cmd.str() << "\"]" << endl; + + if (require_covered) + process_reads(VERBOSE, show_progress, compress_output, + include_header, n_threads, mapped_reads_file, outfile, + chroms_file, CPG_ONLY); + else + process_reads(VERBOSE, show_progress, compress_output, include_header, + n_threads, mapped_reads_file, outfile, chroms_file, + CPG_ONLY); + } + catch (const std::exception &e) { + cerr << e.what() << endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} From 6e432a8df18e33c831643ac6d834194f10fae5ac Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Thu, 5 Jun 2025 21:33:30 -0700 Subject: [PATCH 03/13] src/dnmtools.cpp: adding the nano command to do the nanopore counts; likely will be changed --- src/dnmtools.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/dnmtools.cpp b/src/dnmtools.cpp index 5ba90743..30638ac1 100644 --- a/src/dnmtools.cpp +++ b/src/dnmtools.cpp @@ -24,8 +24,8 @@ #include using std::begin; -using std::end; using std::cout; +using std::end; using std::endl; using std::pair; using std::string; @@ -39,7 +39,8 @@ struct dnmtools_command { string description; std::function fun; - auto operator()(const int argc, const char **argv) const -> int { + auto + operator()(const int argc, const char **argv) const -> int { return fun(argc - 1, argv + 1); } }; @@ -64,6 +65,8 @@ simreads(int argc, const char **argv); int main_counts(int argc, const char **argv); int +main_nanopore(int argc, const char **argv); +int main_allelicmeth(int argc, const char **argv); int main_amrfinder(int argc, const char **argv); @@ -145,7 +148,8 @@ print_help( << "Commands:" << endl; for (auto &&g : command_groups) { cout << " " << g.first << ":" << endl; - for (auto &&c : g.second) cout << c << endl; + for (auto &&c : g.second) + cout << c << endl; cout << endl; } } @@ -154,7 +158,7 @@ int main(int argc, const char **argv) { try { vector>> command_groups = { -// clang-format off + // clang-format off {{"mapping", {{{"abismal", "map FASTQ reads to a FASTA reference genome or an index", abismal}, {"abismalidx", "convert a FASTA reference genome to an abismal index", abismalidx}, @@ -165,6 +169,7 @@ main(int argc, const char **argv) { {"uniq", "remove duplicate reads from sorted mapped reads", main_uniq}, {"bsrate", "compute the BS conversion rate from BS-seq reads mapped to a genome", main_bsrate}, {"counts", "get methylation levels from mapped WGBS reads", main_counts}, + {"nano", "get methylation levels from mapped nanopore reads", main_nanopore}, {"sym", "get CpG sites and make methylation levels symmetric", main_symmetric_cpgs}, {"levels", "compute methylation summary statistics from a counts file", main_levels}}}}, @@ -208,7 +213,7 @@ main(int argc, const char **argv) { {"unxcounts", "reverse the xcounts process yielding a counts file", main_unxcounts}, {"selectsites", "sites inside a set of genomic intervals", main_selectsites}, {"kmersites", "make track file for sites matching kmer", kmersites}}}}}}; -// clang-format on + // clang-format on if (argc < 2) { print_help(command_groups); @@ -221,11 +226,11 @@ main(int argc, const char **argv) { for (auto &g : command_groups) { const auto the_cmd = find_if(begin(g.second), end(g.second), has_tag); - if (the_cmd != end(g.second)) return (*the_cmd)(argc, argv); + if (the_cmd != end(g.second)) + return (*the_cmd)(argc, argv); } std::cerr << "ERROR: invalid command " << argv[1] << std::endl; - } catch (const std::exception &e) { std::cerr << "ERROR:\t" << e.what() << endl; From f023d493d0b5f8c59c8f7b4ad34bef587543c96c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Thu, 5 Jun 2025 21:33:52 -0700 Subject: [PATCH 04/13] Makefile.am: adding the source for the nano command --- Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.am b/Makefile.am index c79401c4..9e832e16 100644 --- a/Makefile.am +++ b/Makefile.am @@ -205,6 +205,7 @@ dnmtools_SOURCES += src/analysis/methstates.cpp dnmtools_SOURCES += src/analysis/bsrate.cpp dnmtools_SOURCES += src/analysis/methentropy.cpp dnmtools_SOURCES += src/analysis/methcounts.cpp +dnmtools_SOURCES += src/analysis/nanopore.cpp dnmtools_SOURCES += src/analysis/roimethstat.cpp dnmtools_SOURCES += src/analysis/cpgbins.cpp dnmtools_SOURCES += src/analysis/multimethstat.cpp From 5c33954ddb5c5b65cce88ba95328ca1a92c5342f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 6 Jun 2025 21:47:53 -0700 Subject: [PATCH 05/13] src/analysis/nanopore.cpp: reworking code for efficiency and to cross-check CpG site locations with the indicated sites of modifications --- src/analysis/nanopore.cpp | 389 ++++++++++++++++++++++++-------------- 1 file changed, 250 insertions(+), 139 deletions(-) diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index 5a8b49a3..288cfba6 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -16,14 +16,14 @@ * General Public License for more details. */ +#include #include // for [u]int[0-9]+_t +#include #include #include -#include #include #include #include -#include #include #include "OptionParser.hpp" @@ -39,7 +39,6 @@ using std::cerr; using std::endl; using std::string; using std::unordered_map; -using std::unordered_set; using std::vector; using bamxx::bam_header; @@ -74,12 +73,12 @@ struct quick_buf : public std::ostringstream, typedef uint16_t count_type; static inline bool -eats_ref(const uint32_t c) { +eats_ref(const std::uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 2; } static inline bool -eats_query(const uint32_t c) { +eats_query(const std::uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 1; } @@ -114,34 +113,27 @@ is_c_at_g(const std::string &s, size_t i) { requirement of the program into a more reasonable range, so keeping all the information seems reasonable. */ struct CountSet { - string tostring() const { std::ostringstream oss; - oss << pos_prob << '\t' << neg_prob << '\t' << pos_n << '\t' << neg_n; + oss << prob_pos << '\t' << n_reads_pos << '\t' << prob_neg << '\t' + << n_reads_neg << '\n'; return oss.str(); } void add_count_pos(const std::uint8_t x) { - pos_prob += x; - ++pos_n; + prob_pos += x; + ++n_reads_pos; } void add_count_neg(const std::uint8_t x) { - neg_prob += x; - ++neg_n; - } - count_type - pos_total() const { - return pos_n; + prob_neg += x; + ++n_reads_neg; } - count_type - neg_total() const { - return neg_n; - } - - count_type pos_prob{0}, neg_prob{0}; - count_type pos_n{0}, neg_n{0}; + count_type prob_pos{0}; + count_type prob_neg{0}; + count_type n_reads_pos{0}; + count_type n_reads_neg{0}; }; /* The "tag" returned by this function should be exclusive, so that @@ -150,7 +142,7 @@ struct CountSet { * works as long as the chromosome size is not the maximum size of a * size_t. */ -static uint32_t +static std::uint32_t get_tag_from_genome(const string &s, const size_t pos) { if (is_cytosine(s[pos])) { if (is_cpg(s, pos)) @@ -176,27 +168,16 @@ get_tag_from_genome(const string &s, const size_t pos) { } static const char *tag_values[] = { - "CpG", // 0 - "CHH", // 1 - "CXG", // 2 - "CCG", // 3 - "N", // 4 - "CpGx", // 5 <---- MUT_OFFSET - "CHHx", // 6 - "CXGx", // 7 - "CCGx", // 8 - "Nx" // 9 + "CpG", // 0 + "CHH", // 1 + "CXG", // 2 + "CCG", // 3 + "N", // 4 }; -static const uint32_t MUT_OFFSET = 5; - -static inline uint32_t -tag_with_mut(const uint32_t tag, const bool mut) { - return tag + (mut ? MUT_OFFSET : 0); -} template static void -write_output(const bam_header &hdr, bgzf_file &out, const int32_t tid, +write_output(const bam_header &hdr, bgzf_file &out, const std::int32_t tid, const string &chrom, const vector &counts, bool CPG_ONLY) { @@ -205,24 +186,24 @@ write_output(const bam_header &hdr, bgzf_file &out, const int32_t tid, const char base = chrom[i]; if (is_cytosine(base) || is_guanine(base)) { - const uint32_t the_tag = get_tag_from_genome(chrom, i); + const std::uint32_t the_tag = get_tag_from_genome(chrom, i); if (CPG_ONLY && the_tag != 0) continue; const bool is_c = is_cytosine(base); - const double n_reads = is_c ? counts[i].pos_n : counts[i].neg_n; - double n_meth = is_c ? counts[i].pos_prob : counts[i].neg_prob; + const double n_reads = + is_c ? counts[i].n_reads_pos : counts[i].n_reads_neg; + double n_meth = is_c ? counts[i].prob_pos : counts[i].prob_neg; n_meth = n_meth > 0.0 ? (n_meth / 256.0) : 0.0; if (require_covered && n_reads == 0) continue; - const bool mut = false; + // const bool mut = false; buf.clear(); // ADS: here is where we make an MSite, but not using MSite buf << sam_hdr_tid2name(hdr, tid) << '\t' << i << '\t' - << (is_c ? '+' : '-') << '\t' - << tag_values[tag_with_mut(the_tag, mut)] << '\t' + << (is_c ? '+' : '-') << '\t' << tag_values[the_tag] << '\t' << (n_reads > 0.0 ? n_meth / n_reads : 0.0) << '\t' << n_reads << '\n'; if (!out.write(buf.c_str(), buf.tellp())) @@ -231,92 +212,204 @@ write_output(const bam_header &hdr, bgzf_file &out, const int32_t tid, } } -static inline std::vector -get_ml_values(const bamxx::bam_rec &aln) { - static constexpr auto min_ml_size = 7; // ML:B:C - // Get the ML values - kstring_t s = {0, 0, nullptr}; - if (bam_aux_get_str(aln.b, "ML", &s) <= 0) { - ks_free(&s); - return {}; - } - std::string ml(ks_str(&s)); - ks_free(&s); - if (std::size(ml) < min_ml_size) - return {}; - ml = ml.substr(min_ml_size); - std::vector parts = smithlab::split(ml, ",", false); - std::vector probs; - for (const auto &p : parts) - probs.push_back(atoi(p.data())); - - auto prob_itr = std::cbegin(probs) + std::size(probs) / 2; - std::vector vals(get_l_qseq(aln), 0); +static inline bool +confirm_mm_vals(const bamxx::bam_rec &aln, std::vector &vals) { + const char hydroxy_tag[] = "C+h?"; + const auto hydroxy_tag_size = 4; + const char methyl_tag[] = "C+m?"; + const auto methyl_tag_size = 4; + + const auto mm_aux = bam_aux_get(aln.b, "MM"); + if (mm_aux == nullptr) + return false; + + auto mm_beg = bam_aux2Z(mm_aux); + auto mm_end = mm_beg + std::strlen(mm_beg); + + auto hydroxy_beg = strstr(mm_beg, hydroxy_tag); + auto hydroxy_end = std::find(hydroxy_beg, mm_end, ';'); + + auto methyl_beg = strstr(mm_beg, methyl_tag); + // auto methyl_end = std::find(methyl_beg, mm_end, ';'); + + hydroxy_beg += hydroxy_tag_size; + methyl_beg += methyl_tag_size; + + if (!std::equal(hydroxy_beg, hydroxy_end, methyl_beg)) + return false; + + const auto isdig = [](const auto x) { + return std::isdigit(static_cast(x)); + }; + + const auto get_next = [&](auto &b, auto &e) -> std::int32_t { + b = std::find_if(b, e, isdig); + if (b == e) + return -1; + auto r = atoi(b); + b = std::find_if_not(b, e, isdig); + return r; + }; + + const auto ml_aux = bam_aux_get(aln.b, "ML"); + if (ml_aux == nullptr) + return false; + const auto ml_aux_len = bam_auxB_len(ml_aux); + + // number of commas is number of hydroxy substrates = CpG sites + const auto n_cpgs = std::count(hydroxy_beg, hydroxy_end, ','); + + const auto n = get_l_qseq(aln); const auto seq = bam_get_seq(aln); - for (std::int32_t i = 0; i + 1 < get_l_qseq(aln); ++i) - if (seq_nt16_str[bam_seqi(seq, i)] == 'C' && - seq_nt16_str[bam_seqi(seq, i + 1)] == 'G') - vals[i] += *prob_itr++; - assert(prob_itr == std::cend(probs)); + vals = std::vector(n, 0); + std::int32_t delta = get_next(hydroxy_beg, hydroxy_end); + + auto ml_idx = n_cpgs; // start methyl after hydroxy + + if (bam_is_rev(aln)) { + for (auto i = 1; i <= n; ++i) { + const auto nuc = seq_nt16_str[bam_seqi(seq, n - i)]; + if (nuc == 'G') { + if (seq_nt16_str[bam_seqi(seq, n - i - 1)] == 'C') { + if (delta != 0) + return false; + vals[i - 1] = bam_auxB2i(ml_aux, ml_idx++); + } + --delta; + if (delta < 0) + delta = get_next(hydroxy_beg, hydroxy_end); + } + } + } + else { + for (auto i = 0; i + 1 < n; ++i) { + const auto nuc = seq_nt16_str[bam_seqi(seq, i)]; + if (nuc == 'C') { + if (seq_nt16_str[bam_seqi(seq, i + 1)] == 'G') { + if (delta != 0) + return false; + vals[i] = bam_auxB2i(ml_aux, ml_idx++); + } + --delta; + if (delta < 0) + delta = get_next(hydroxy_beg, hydroxy_end); + } + } + } - return vals; + return true; } -static inline std::vector -get_ml_values_rev(const bamxx::bam_rec &aln) { - static constexpr auto min_ml_size = 7; // ML:B:C - // Get the ML values - kstring_t s = {0, 0, nullptr}; - if (bam_aux_get_str(aln.b, "ML", &s) <= 0) { - ks_free(&s); - return {}; +std::uint8_t enc[] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 16 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 32 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 48 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 64 + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 80 + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 96 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 112 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 128 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 144 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 160 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 176 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 192 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 208 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 224 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 240 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 // 256 +}; + +struct match_counter { + std::array c{}; + + void + add_pos(const std::uint8_t r1, const std::uint8_t r2, const std::uint8_t q1, + const std::uint8_t q2) { + const auto r1e = enc[r1]; + const auto r2e = enc[r2]; + const auto q1e = enc[q1]; + const auto q2e = enc[q2]; + if ((r1e | r2e | q1e | q2e) & 4u) + return; + c[(r1e * 64) + (r2e * 16) + (q1e * 4) + (q2e * 1)]++; } - std::string ml(ks_str(&s)); - ks_free(&s); - if (std::size(ml) < min_ml_size) - return {}; - ml = ml.substr(min_ml_size); - std::vector parts = smithlab::split(ml, ",", false); - std::vector probs; - for (const auto &p : parts) - probs.push_back(atoi(p.data())); - - auto prob_itr = std::cbegin(probs) + std::size(probs) / 2; - const std::uint32_t n = get_l_qseq(aln); - std::vector vals(n, 0); - const auto seq = bam_get_seq(aln); - for (std::int32_t i = 0; i + 1 < get_l_qseq(aln); ++i) - if (seq_nt16_str[bam_seqi(seq, i)] == 'C' && - seq_nt16_str[bam_seqi(seq, i + 1)] == 'G') - vals[n - i - 2] += *prob_itr++; - assert(prob_itr == std::cend(probs)); - return vals; -} + std::string + tostring() const { + static constexpr auto n_dinucs = 16u; + const auto dinucs = std::vector{ + "AA", "AC", "AG", "AT", "CA", "CC", "CG", "CT", + "GA", "GC", "GG", "GT", "TA", "TC", "TG", "TT", + }; + std::ostringstream oss; + for (auto i = 0u; i < n_dinucs; ++i) { + oss << dinucs[i]; + for (auto j = 0u; j < n_dinucs; ++j) + oss << '\t' << c[i * n_dinucs + j]; + oss << '\n'; + } + return oss.str(); + } + + std::string + tostring_frac() const { + static constexpr auto width = 8; + static constexpr auto prec = 3; + static constexpr auto n_dinucs = 16u; + const auto dinucs = std::vector{ + "AA", "AC", "AG", "AT", "CA", "CC", "CG", "CT", + "GA", "GC", "GG", "GT", "TA", "TC", "TG", "TT", + }; + std::ostringstream oss; + for (auto i = 0u; i < n_dinucs; ++i) { + oss << dinucs[i]; + const auto tot = + std::accumulate(std::cbegin(c) + (i * n_dinucs), + std::cbegin(c) + ((i + 1) * n_dinucs), 0.0); + for (auto j = 0u; j < n_dinucs; ++j) { + oss << std::setw(width) << std::fixed << std::setprecision(prec) + << c[i * n_dinucs + j] / tot; + } + oss << '\n'; + } + return oss.str(); + } +}; static void -count_states_pos(const bam_rec &aln, vector &counts) { +count_states_pos(const bam_rec &aln, vector &counts, + const std::string &chrom, match_counter &mc) { /* Move through cigar, reference and read positions without inflating cigar or read sequence */ + const auto seq = bam_get_seq(aln); const auto beg_cig = bam_get_cigar(aln); const auto end_cig = beg_cig + get_n_cigar(aln); auto rpos = get_pos(aln); - auto qpos = 0; // to match type with b->core.l_qseq + auto r_itr = std::cbegin(chrom) + rpos; + const auto r_itr_lim = std::cend(chrom); - const auto probs = get_ml_values(aln); - if (probs.empty()) + std::vector probs; + if (!confirm_mm_vals(aln, probs)) return; + auto qpos = 0; // to match type with b->core.l_qseq + auto q_lim = get_l_qseq(aln) - 1; // if here, this is >= 0 + auto prob_itr = std::cbegin(probs); for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { const char op = bam_cigar_op(*c_itr); - const uint32_t n = bam_cigar_oplen(*c_itr); + const std::uint32_t n = bam_cigar_oplen(*c_itr); if (eats_ref(op) && eats_query(op)) { const decltype(qpos) end_qpos = qpos + n; - for (; qpos < end_qpos; ++qpos) + for (; qpos < end_qpos; ++qpos) { + const auto r_nuc = *r_itr++; + mc.add_pos(r_nuc, r_itr == r_itr_lim ? 'N' : *r_itr, + seq_nt16_str[bam_seqi(seq, qpos)], + qpos == q_lim ? 'N' : seq_nt16_str[bam_seqi(seq, qpos + 1)]); counts[rpos++].add_count_pos(*prob_itr++); + } } else if (eats_query(op)) { qpos += n; @@ -324,31 +417,35 @@ count_states_pos(const bam_rec &aln, vector &counts) { } else if (eats_ref(op)) { rpos += n; + r_itr += n; } } // ADS: somehow previous code included a correction for rpos going // past the end of the chromosome; this should result at least in a // soft-clip by any mapper. I'm not checking it here as even if it // happens I don't want to terminate. - // assert(qpos == get_l_qseq(aln)); - if (qpos != get_l_qseq(aln)) { - std::cerr << qpos << '\t' << get_l_qseq(aln) << std::endl; - } + assert(qpos == get_l_qseq(aln)); } [[maybe_unused]] static void -count_states_neg(const bam_rec &aln, vector &counts) { +count_states_neg(const bam_rec &aln, vector &counts, + const std::string &chrom, match_counter &mc) { /* Move through cigar, reference and (*backward*) through read positions without inflating cigar or read sequence */ + const auto seq = bam_get_seq(aln); const auto beg_cig = bam_get_cigar(aln); const auto end_cig = beg_cig + get_n_cigar(aln); size_t rpos = get_pos(aln); + auto r_itr = std::cbegin(chrom) + rpos; + const auto r_lim = std::cend(chrom); + size_t qpos = get_l_qseq(aln); // to match type with b->core.l_qseq + size_t q_idx = 0; + size_t l_qseq = get_l_qseq(aln); - const auto probs = get_ml_values_rev(aln); - if (probs.empty()) + std::vector probs; // vals; + if (!confirm_mm_vals(aln, probs)) return; - assert(std::size(probs) == qpos); auto prob_itr = std::cend(probs); @@ -356,31 +453,40 @@ count_states_neg(const bam_rec &aln, vector &counts) { return; for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { const char op = bam_cigar_op(*c_itr); - const uint32_t n = bam_cigar_oplen(*c_itr); + const std::uint32_t n = bam_cigar_oplen(*c_itr); if (eats_ref(op) && eats_query(op)) { assert(qpos >= n); const size_t end_qpos = qpos - n; // to match type with qpos - for (; qpos > end_qpos; --qpos) + for (; qpos > end_qpos; --qpos) { + const auto r_nuc = *r_itr++; + const auto q_nuc = seq_nt16_str[bam_seqi(seq, q_idx)]; + ++q_idx; + mc.add_pos(r_nuc, r_itr == r_lim ? 'N' : *r_itr, q_nuc, + q_idx == l_qseq ? 'N' : seq_nt16_str[bam_seqi(seq, q_idx)]); counts[rpos++].add_count_neg(*--prob_itr); + } } else if (eats_query(op)) { qpos -= n; + q_idx += n; prob_itr -= n; } else if (eats_ref(op)) { rpos += n; + r_itr += n; } } + /* qpos is unsigned; would wrap around if < 0 */ // ADS: Same as count_states_pos; see comment there assert(qpos == 0); } -static unordered_map +static std::unordered_map get_tid_to_idx(const bam_header &hdr, - const unordered_map name_to_idx) { - unordered_map tid_to_idx; - for (int32_t i = 0; i < hdr.h->n_targets; ++i) { + const std::unordered_map name_to_idx) { + std::unordered_map tid_to_idx; + for (std::int32_t i = 0; i < hdr.h->n_targets; ++i) { // "curr_name" gives a "tid_to_name" mapping allowing to jump // through "name_to_idx" and get "tid_to_idx" const string curr_name(hdr.h->target_name[i]); @@ -394,16 +500,15 @@ get_tid_to_idx(const bam_header &hdr, template static void -output_skipped_chromosome(const bool CPG_ONLY, const int32_t tid, - const unordered_map &tid_to_idx, - const bam_header &hdr, - const vector::const_iterator chroms_beg, - const vector &chrom_sizes, vector &counts, - bgzf_file &out) { +output_skipped_chromosome( + const bool CPG_ONLY, const std::int32_t tid, + const std::unordered_map &tid_to_idx, + const bam_header &hdr, const vector::const_iterator chroms_beg, + const vector &chrom_sizes, vector &counts, bgzf_file &out) { // get the index of the next chrom sequence const auto chrom_idx = tid_to_idx.find(tid); - if (chrom_idx == cend(tid_to_idx)) + if (chrom_idx == std::cend(tid_to_idx)) throw dnmt_error("chrom not found: " + sam_hdr_tid2name(hdr, tid)); const auto chrom_itr = chroms_beg + chrom_idx->second; @@ -417,7 +522,7 @@ output_skipped_chromosome(const bool CPG_ONLY, const int32_t tid, static bool consistent_targets(const bam_header &hdr, - const unordered_map &tid_to_idx, + const std::unordered_map &tid_to_idx, const vector &names, const vector &sizes) { const size_t n_targets = hdr.h->n_targets; if (n_targets != names.size()) @@ -427,7 +532,7 @@ consistent_targets(const bam_header &hdr, const string tid_name_sam = sam_hdr_tid2name(hdr, tid); const size_t tid_size_sam = sam_hdr_tid2len(hdr, tid); const auto idx_itr = tid_to_idx.find(tid); - if (idx_itr == cend(tid_to_idx)) + if (idx_itr == std::cend(tid_to_idx)) return false; const auto idx = idx_itr->second; if (tid_name_sam != names[idx] || tid_size_sam != sizes[idx]) @@ -447,13 +552,13 @@ process_reads(const bool VERBOSE, const bool show_progress, vector chroms, names; read_fasta_file_short_names(chroms_file, names, chroms); for (auto &i : chroms) - transform(cbegin(i), cend(i), begin(i), + transform(std::cbegin(i), std::cend(i), std::begin(i), [](const char c) { return std::toupper(c); }); if (VERBOSE) cerr << "[n chroms in reference: " << chroms.size() << "]" << endl; - const auto chroms_beg = cbegin(chroms); + const auto chroms_beg = std::cbegin(chroms); - unordered_map name_to_idx; + std::unordered_map name_to_idx; vector chrom_sizes(chroms.size(), 0); for (size_t i = 0; i < chroms.size(); ++i) { name_to_idx[names[i]] = i; @@ -471,7 +576,8 @@ process_reads(const bool VERBOSE, const bool show_progress, if (!hdr) throw dnmt_error("failed to read header"); - unordered_map tid_to_idx = get_tid_to_idx(hdr, name_to_idx); + std::unordered_map tid_to_idx = + get_tid_to_idx(hdr, name_to_idx); if (!consistent_targets(hdr, tid_to_idx, names, chrom_sizes)) throw dnmt_error("inconsistent reference genome information"); @@ -494,15 +600,18 @@ process_reads(const bool VERBOSE, const bool show_progress, // now iterate over the reads, switching chromosomes and writing // output as needed bam_rec aln; - int32_t prev_tid = -1; + std::int32_t prev_tid = -1; // this is where all the counts are accumulated vector counts; vector::const_iterator chrom_itr{}; + match_counter mc_pos; + match_counter mc_neg; + while (hts.read(hdr, aln)) { - const int32_t tid = get_tid(aln); + const std::int32_t tid = get_tid(aln); if (get_l_qseq(aln) == 0) continue; @@ -510,9 +619,9 @@ process_reads(const bool VERBOSE, const bool show_progress, continue; if (tid == prev_tid) { if (bam_is_rev(aln)) - count_states_neg(aln, counts); + count_states_neg(aln, counts, *chrom_itr, mc_neg); else - count_states_pos(aln, counts); + count_states_pos(aln, counts, *chrom_itr, mc_pos); } else { // chrom has changed, so output results and get the next chrom @@ -560,6 +669,8 @@ process_reads(const bool VERBOSE, const bool show_progress, for (auto i = prev_tid + 1; i < hdr.h->n_targets; ++i) output_skipped_chromosome(CPG_ONLY, i, tid_to_idx, hdr, chroms_beg, chrom_sizes, counts, out); + std::cout << mc_pos.tostring_frac() << std::endl; + std::cout << mc_neg.tostring_frac() << std::endl; } int From c54bb55135b92b05bbbb24644f638a3dbbfe373d Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 7 Jun 2025 20:05:28 -0700 Subject: [PATCH 06/13] src/utils/symmetric-cpgs.cpp: adding a flag to allow extra fields in the input, which are ignored, when making symmetric CpG counts files. This is for nanopore, which currently has extra fields in the counts files by default --- src/utils/symmetric-cpgs.cpp | 38 ++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/src/utils/symmetric-cpgs.cpp b/src/utils/symmetric-cpgs.cpp index b1d0cd9f..5b7b9620 100644 --- a/src/utils/symmetric-cpgs.cpp +++ b/src/utils/symmetric-cpgs.cpp @@ -1,8 +1,7 @@ /* symmetric-cpgs: extract the CpG sites from a methcounts output * file and produce a new one with the CpGs treated unstranded. * - * Copyright (C) 2014 University of Southern California and - * Andrew D. Smith + * Copyright (C) 2014-2025 Andrew D. Smith * * Authors: Andrew D. Smith * @@ -56,7 +55,8 @@ ensure_positive_cpg(MSite &s) { s.strand = '+'; } -template static std::tuple +template +static std::tuple get_first_site(T &in, T &out) { bool prev_is_cpg = false; MSite prev_site; @@ -68,14 +68,16 @@ get_first_site(T &in, T &out) { } else { prev_site.initialize(line.data(), line.data() + size(line)); - if (prev_site.is_cpg()) prev_is_cpg = true; + if (prev_site.is_cpg()) + prev_is_cpg = true; within_header = false; } } return {prev_site, prev_is_cpg}; } -template static bool +template +static bool process_sites(const bool verbose, T &in, T &out) { // get the first site while dealing with the header @@ -90,7 +92,8 @@ process_sites(const bool verbose, T &in, T &out) { while (read_site(in, curr_site)) { const bool same_chrom = prev_site.chrom == curr_site.chrom; if (same_chrom) { - if (curr_site.pos <= prev_site.pos) return false; + if (curr_site.pos <= prev_site.pos) + return false; if (prev_is_cpg) { if (curr_site.is_mate_of(prev_site)) curr_site.add(prev_site); @@ -103,7 +106,8 @@ process_sites(const bool verbose, T &in, T &out) { else { if (verbose) cerr << "processing: " << curr_site.chrom << endl; - if (chroms_seen.find(curr_site.chrom) != cend(chroms_seen)) return false; + if (chroms_seen.find(curr_site.chrom) != cend(chroms_seen)) + return false; chroms_seen.insert(curr_site.chrom); if (prev_is_cpg) { @@ -129,6 +133,7 @@ main_symmetric_cpgs(int argc, const char **argv) { string outfile{"-"}; bool verbose = false; bool compress_output = false; + bool allow_extra_fields = false; int32_t n_threads = 1; const string description = @@ -141,6 +146,8 @@ main_symmetric_cpgs(int argc, const char **argv) { outfile); opt_parse.add_opt("threads", 't', "number of threads", false, n_threads); opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output); + opt_parse.add_opt("relaxed", '\0', "allow extra fields in input", false, + allow_extra_fields); opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); @@ -164,20 +171,26 @@ main_symmetric_cpgs(int argc, const char **argv) { const string filename(leftover_args.front()); /****************** END COMMAND LINE OPTIONS *****************/ - if (n_threads <= 0) throw runtime_error("threads must be positive"); + MSite::no_extra_fields = (allow_extra_fields == false); + + if (n_threads <= 0) + throw runtime_error("threads must be positive"); bamxx::bam_tpool tp(n_threads); // const bool show_progress = VERBOSE && isatty(fileno(stderr)); bgzf_file in(filename, "r"); - if (!in) throw runtime_error("could not open file: " + filename); + if (!in) + throw runtime_error("could not open file: " + filename); // open the output file const string output_mode = compress_output ? "w" : "wu"; bamxx::bgzf_file out(outfile, output_mode); - if (!out) throw runtime_error("error opening output file: " + outfile); + if (!out) + throw runtime_error("error opening output file: " + outfile); if (n_threads > 1) { - if (in.is_bgzf()) tp.set_io(in); + if (in.is_bgzf()) + tp.set_io(in); tp.set_io(out); } @@ -187,7 +200,8 @@ main_symmetric_cpgs(int argc, const char **argv) { cerr << "sites are not sorted in: " << filename << endl; namespace fs = std::filesystem; const fs::path outpath{outfile}; - if (fs::exists(outpath)) fs::remove(outpath); + if (fs::exists(outpath)) + fs::remove(outpath); return EXIT_FAILURE; } } From bfff2d868b52c1a3fe93ac60a3687c1f61db9e17 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 7 Jun 2025 20:06:22 -0700 Subject: [PATCH 07/13] src/dnmtools.cpp: the command for counts from nanopore data is now nanocount --- src/dnmtools.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dnmtools.cpp b/src/dnmtools.cpp index 30638ac1..cd77fb89 100644 --- a/src/dnmtools.cpp +++ b/src/dnmtools.cpp @@ -65,7 +65,7 @@ simreads(int argc, const char **argv); int main_counts(int argc, const char **argv); int -main_nanopore(int argc, const char **argv); +main_nanocount(int argc, const char **argv); int main_allelicmeth(int argc, const char **argv); int @@ -169,7 +169,7 @@ main(int argc, const char **argv) { {"uniq", "remove duplicate reads from sorted mapped reads", main_uniq}, {"bsrate", "compute the BS conversion rate from BS-seq reads mapped to a genome", main_bsrate}, {"counts", "get methylation levels from mapped WGBS reads", main_counts}, - {"nano", "get methylation levels from mapped nanopore reads", main_nanopore}, + {"nano", "get methylation levels from mapped nanopore reads", main_nanocount}, {"sym", "get CpG sites and make methylation levels symmetric", main_symmetric_cpgs}, {"levels", "compute methylation summary statistics from a counts file", main_levels}}}}, From e1d939d4a21ab117f570d7b35916f7c2b722e63f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 7 Jun 2025 20:08:26 -0700 Subject: [PATCH 08/13] src/common/MSite.hcpp: adding a static bool member to the MSite class that when set allows parsing of MSite / counts files that have extra fields that are to be ignored. The 'add' function for making symmetric CpGs by adding info from both strands now also does not assume integer values for the amount of methylation. Also code formatting. --- src/common/MSite.cpp | 95 +++++++++++++++++++------------------- src/common/MSite.hpp | 107 +++++++++++++++++++++++++------------------ 2 files changed, 109 insertions(+), 93 deletions(-) diff --git a/src/common/MSite.cpp b/src/common/MSite.cpp index 27bd1a22..a9275929 100644 --- a/src/common/MSite.cpp +++ b/src/common/MSite.cpp @@ -15,26 +15,27 @@ #include "MSite.hpp" -#include -#include +#include #include +#include +#include #include #include -#include -#include +#include -#include "smithlab_utils.hpp" #include "counts_header.hpp" +#include "smithlab_utils.hpp" -using std::string; -using std::runtime_error; -using std::regex_match; -using std::from_chars; -using std::find_if; using std::cbegin; using std::cend; using std::end; +using std::find_if; +using std::from_chars; +using std::regex_match; +using std::runtime_error; +using std::string; +bool MSite::no_extra_fields = true; bool MSite::initialize(const char *c, const char *c_end) { @@ -45,7 +46,8 @@ MSite::initialize(const char *c, const char *c_end) { auto field_s = c; auto field_e = find_if(field_s + 1, c_end, is_sep); - if (field_e == c_end) failed = true; + if (field_e == c_end) + failed = true; { const uint32_t d = std::distance(field_s, field_e); @@ -95,7 +97,7 @@ MSite::initialize(const char *c, const char *c_end) { { const auto [ptr, ec] = from_chars(field_s, c_end, n_reads); - failed = failed || (ptr != c_end); + failed = failed || (no_extra_fields && ptr != c_end); } // ADS: the value below would work for a flag // pos = std::numeric_limits::max(); @@ -103,36 +105,28 @@ MSite::initialize(const char *c, const char *c_end) { return !failed; } - MSite::MSite(const string &line) { if (!initialize(line.data(), line.data() + size(line))) throw runtime_error("bad count line: " + line); } - MSite::MSite(const char *line, const int n) { if (!initialize(line, line + n)) throw runtime_error("bad count line: " + string(line)); } - string MSite::tostring() const { std::ostringstream oss; - oss << chrom << '\t' - << pos << '\t' - << strand << '\t' - << context << '\t' - << meth << '\t' - << n_reads; + oss << chrom << '\t' << pos << '\t' << strand << '\t' << context << '\t' + << meth << '\t' << n_reads; return oss.str(); } size_t distance(const MSite &a, const MSite &b) { - return a.chrom == b.chrom ? - std::max(a.pos, b.pos) - std::min(a.pos, b.pos) : - std::numeric_limits::max(); + return a.chrom == b.chrom ? std::max(a.pos, b.pos) - std::min(a.pos, b.pos) + : std::numeric_limits::max(); } using std::ifstream; @@ -152,10 +146,8 @@ move_to_start_of_line(ifstream &in) { in.clear(); } - void -find_offset_for_msite(const std::string &chr, - const size_t idx, +find_offset_for_msite(const std::string &chr, const size_t idx, std::ifstream &site_in) { site_in.seekg(0, ios_base::beg); @@ -166,7 +158,7 @@ find_offset_for_msite(const std::string &chr, if (end_pos - begin_pos < 2) throw runtime_error("empty counts file"); - size_t step_size = (end_pos - begin_pos)/2; + size_t step_size = (end_pos - begin_pos) / 2; site_in.seekg(0, ios_base::beg); string low_chr; @@ -185,7 +177,7 @@ find_offset_for_msite(const std::string &chr, size_t pos = step_size; site_in.seekg(pos, ios_base::beg); move_to_start_of_line(site_in); - size_t prev_pos = 0; // keep track of previous position in file + size_t prev_pos = 0; // keep track of previous position in file // binary search inside sorted file on disk // iterate until step size is 0 or positions are identical @@ -194,8 +186,8 @@ find_offset_for_msite(const std::string &chr, // track (mid) position in file to make sure it keeps moving prev_pos = pos; - string mid_chr; // chromosome name at mid point - size_t mid_idx = 0; // position at mid point + string mid_chr; // chromosome name at mid point + size_t mid_idx = 0; // position at mid point if (!(site_in >> mid_chr >> mid_idx)) throw runtime_error("failed navigating inside file"); @@ -203,10 +195,10 @@ find_offset_for_msite(const std::string &chr, // might catch an unsorted file if (mid_chr < low_chr || mid_chr > high_chr) throw runtime_error("chromosomes unsorted inside file: " - "low=" + low_chr + ",mid=" + mid_chr + - ",high=" + high_chr); + "low=" + + low_chr + ",mid=" + mid_chr + ",high=" + high_chr); - step_size /= 2; // cut the range in half + step_size /= 2; // cut the range in half if (chr < mid_chr || (chr == mid_chr && idx <= mid_idx)) { // move to the left @@ -229,8 +221,7 @@ find_offset_for_msite(const std::string &chr, using std::unordered_map; void find_offset_for_msite(const unordered_map &chrom_order, - const std::string &chr, - const size_t idx, + const std::string &chr, const size_t idx, std::ifstream &site_in) { site_in.seekg(0, ios_base::beg); @@ -241,7 +232,7 @@ find_offset_for_msite(const unordered_map &chrom_order, if (end_pos - begin_pos < 2) throw runtime_error("empty counts file"); - size_t step_size = (end_pos - begin_pos)/2; + size_t step_size = (end_pos - begin_pos) / 2; const auto chr_idx_itr = chrom_order.find(chr); if (chr_idx_itr == end(chrom_order)) { @@ -277,7 +268,7 @@ find_offset_for_msite(const unordered_map &chrom_order, size_t pos = step_size; site_in.seekg(pos, ios_base::beg); move_to_start_of_line(site_in); - size_t prev_pos = 0; // keep track of previous position in file + size_t prev_pos = 0; // keep track of previous position in file // binary search inside sorted file on disk // iterate until step size is 0 or positions are identical @@ -286,8 +277,8 @@ find_offset_for_msite(const unordered_map &chrom_order, // track (mid) position in file to make sure it keeps moving prev_pos = pos; - string mid_chr; // chromosome name at mid point - size_t mid_idx{}; // position at mid point + string mid_chr; // chromosome name at mid point + size_t mid_idx{}; // position at mid point if (!(site_in >> mid_chr >> mid_idx)) throw runtime_error("failed navigating inside file"); @@ -301,10 +292,12 @@ find_offset_for_msite(const unordered_map &chrom_order, using std::to_string; if (mid_chr_idx < low_chr_idx || mid_chr_idx > high_chr_idx) throw runtime_error("chromosomes unsorted inside file: " - "low=" + to_string(low_chr_idx) + ",mid=" + to_string(mid_chr_idx) + + "low=" + + to_string(low_chr_idx) + + ",mid=" + to_string(mid_chr_idx) + ",high=" + to_string(high_chr_idx)); - step_size /= 2; // cut the range in half + step_size /= 2; // cut the range in half if (chr_idx < mid_chr_idx || (chr_idx == mid_chr_idx && idx <= mid_idx)) { // move to the left @@ -328,10 +321,12 @@ is_msite_line(const string &line) { std::istringstream iss(line); string chrom; - if (!(iss >> chrom)) return false; + if (!(iss >> chrom)) + return false; int64_t pos = 0; - if (!(iss >> pos || pos < 0)) return false; + if (!(iss >> pos || pos < 0)) + return false; string strand; if (!(iss >> strand) || (strand.size() != 1) || @@ -341,13 +336,16 @@ is_msite_line(const string &line) { string context; // ADS: below, allowing any context // std::regex pattern("^C[pHWX][GH]$"); - if (!(iss >> context)) return false; + if (!(iss >> context)) + return false; double level = 0.0; - if (!(iss >> level) || level < 0.0 || level > 1.0) return false; + if (!(iss >> level) || level < 0.0 || level > 1.0) + return false; int64_t n_reads = 0; - if (!(iss >> n_reads || n_reads < 0)) return false; + if (!(iss >> n_reads || n_reads < 0)) + return false; string temp; if (iss >> temp) @@ -359,7 +357,8 @@ is_msite_line(const string &line) { bool is_msite_file(const string &file) { bamxx::bgzf_file in(file, "r"); - if (!in) throw runtime_error("cannot open file: " + file); + if (!in) + throw runtime_error("cannot open file: " + file); string line; while (getline(in, line) && is_counts_header_line(line)) diff --git a/src/common/MSite.hpp b/src/common/MSite.hpp index 355aad72..a74fc36a 100644 --- a/src/common/MSite.hpp +++ b/src/common/MSite.hpp @@ -18,12 +18,20 @@ #ifndef MSITE_HPP #define MSITE_HPP -#include +#include + #include #include +#include +#include #include struct MSite { + + // This defaults to true, and when true MSite lines cannot be parsed if they + // have additional columns in the file. + static bool no_extra_fields; + std::string chrom{}; size_t pos{}; char strand{}; @@ -32,34 +40,37 @@ struct MSite { size_t n_reads{}; MSite() = default; - MSite(const std::string &chrom, - const size_t pos, - const char strand, - const std::string &context, - const double meth, - const size_t n_reads) : - chrom{chrom}, pos{pos}, strand{strand}, - context{context}, meth{meth}, n_reads{n_reads} {} + MSite(const std::string &chrom, const size_t pos, const char strand, + const std::string &context, const double meth, const size_t n_reads) : + chrom{chrom}, pos{pos}, strand{strand}, context{context}, meth{meth}, + n_reads{n_reads} {} explicit MSite(const std::string &line); explicit MSite(const char *line, const int n); - bool initialize(const char *c, const char *c_end); + bool + initialize(const char *c, const char *c_end); - bool operator<(const MSite &other) const { + bool + operator<(const MSite &other) const { int r = chrom.compare(other.chrom); - return (r < 0 || - (r == 0 && - (pos < other.pos || - (pos == other.pos && strand < other.strand)))); + return (r < 0 || (r == 0 && (pos < other.pos || + (pos == other.pos && strand < other.strand)))); } - size_t n_meth() const {return std::round(meth*n_reads);} - size_t n_unmeth() const {return n_reads - n_meth();} + size_t + n_meth() const { + return std::round(meth * n_reads); + } + size_t + n_unmeth() const { + return n_reads - n_meth(); + } ////////////////////////////////////////////////////////////// /// FUNCTIONS BELOW ARE FOR MANIPULATING SYMMETRIC CPG SITES ////////////////////////////////////////////////////////////// - void add(const MSite &other) { + void + add(const MSite &other) { // ADS: possible that this function has specific behavior that // should be placed in the 'sym' source, since we might not want // this line below to operate generally. @@ -67,14 +78,15 @@ struct MSite { context += 'x'; // ADS: order matters below as n_reads update invalidates n_meth() // function until meth has been updated - const size_t total_c_reads = n_meth() + other.n_meth(); + const double total_c_reads = (meth * n_reads + other.meth * other.n_reads); n_reads += other.n_reads; - meth = static_cast(total_c_reads)/std::max(1ul, n_reads); + meth = total_c_reads / std::max(1ul, n_reads); } // ADS: function below has redundant check for is_cpg, which is // expensive and might be ok to remove - bool is_mate_of(const MSite &first) { + bool + is_mate_of(const MSite &first) { return (first.pos + 1 == pos && first.is_cpg() && is_cpg() && first.strand == '+' && strand == '-'); } @@ -85,39 +97,48 @@ struct MSite { ///// CpG within. Also included is a function that tests if a site ///// has a mutation. //////////////////////////////////////////////////////////////////////// - bool is_cpg() const { + bool + is_cpg() const { return context.length() >= 3 && - (context[0] == 'C' && context[1] == 'p' && context[2] == 'G'); + (context[0] == 'C' && context[1] == 'p' && context[2] == 'G'); } - bool is_chh() const { + bool + is_chh() const { return context.length() >= 3 && - (context[0] == 'C' && context[1] == 'H' && context[2] == 'H'); + (context[0] == 'C' && context[1] == 'H' && context[2] == 'H'); } - bool is_ccg() const { + bool + is_ccg() const { return context.length() >= 3 && - (context[0] == 'C' && context[1] == 'C' && context[2] == 'G'); + (context[0] == 'C' && context[1] == 'C' && context[2] == 'G'); } - bool is_cxg() const { + bool + is_cxg() const { return context.length() >= 3 && - (context[0] == 'C' && context[1] == 'X' && context[2] == 'G'); + (context[0] == 'C' && context[1] == 'X' && context[2] == 'G'); } - bool is_mutated() const { + bool + is_mutated() const { return context.length() == 4 && context[3] == 'x'; } - void set_mutated() { + void + set_mutated() { if (!is_mutated()) context += 'x'; } - void set_unmutated() { + void + set_unmutated() { if (is_mutated()) context.resize(context.length() - 1); } - std::string tostring() const; + std::string + tostring() const; }; -template T & +template +T & operator>>(T &in, MSite &s) { std::string line; if (getline(in, line)) @@ -125,9 +146,10 @@ operator>>(T &in, MSite &s) { return in; } -template T & +template +T & operator<<(T &out, const MSite &s) { - out << s.tostring(); // seems to be an issue returning this directly + out << s.tostring(); // seems to be an issue returning this directly return out; } @@ -138,15 +160,13 @@ distance(const MSite &a, const MSite &b); // file (for the specified stream) that does not precede the specified // position given by the (chrom, start_pos) pair. void -find_offset_for_msite(const std::string &chrom, - const size_t start_pos, +find_offset_for_msite(const std::string &chrom, const size_t start_pos, std::ifstream &site_in); void -find_offset_for_msite(const std::unordered_map &chrom_order, - const std::string &chr, - const size_t idx, - std::ifstream &site_in); +find_offset_for_msite( + const std::unordered_map &chrom_order, + const std::string &chr, const size_t idx, std::ifstream &site_in); // ADS: using the functions below for now for static polymorphism // because I don't want to have insertion and extraction operators @@ -154,9 +174,6 @@ find_offset_for_msite(const std::unordered_map &chrom_ord // if we are going to use them, we need to be aware of which functions // we are calling, and not just do it unconsciously. -#include -#include - inline bamxx::bgzf_file & write_site(bamxx::bgzf_file &f, const MSite &s) { // ADS: to slow?? From 9d2b93a624ff9096444bc434698b30b2aaf08ac7 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 7 Jun 2025 20:10:06 -0700 Subject: [PATCH 09/13] src/analysis/nanopore.cpp: added functionality to deal separately with methyl and hydroxy, both of which are assumed present in the BAM file. Also performance improvements, stats output to file, and a debug option to use only reads from a specified strand (which will probably be removed soon) --- src/analysis/nanopore.cpp | 420 +++++++++++++++++++++++--------------- 1 file changed, 260 insertions(+), 160 deletions(-) diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index 288cfba6..d760da5f 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -1,19 +1,18 @@ -/* nanopore: counts for nanopore data +/* nanocount: methylation counts for nanopore data * - * Copyright (C) 2011-2025 University of Southern California and - * Andrew D. Smith + * Copyright (C) 2025 Andrew D. Smith * * Author: Andrew D. Smith * - * This program is free software: you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. */ #include @@ -65,11 +64,8 @@ struct quick_buf : public std::ostringstream, } }; -// ADS: we should never have to worry about coverage over > 32767 in -// any downstream analysis, so using "int16_t" here would allow to -// detect wrap around and report it as some kind of weird thing, maybe -// zeroing it and flagging the output. As it is now, if we really need -// >65535-fold coverage, we can make the change here. +// ADS: here the uint16_t allows for up to 256 reads, each contributing up to +// 256 "counts" in the probability encoding. typedef uint16_t count_type; static inline bool @@ -113,25 +109,48 @@ is_c_at_g(const std::string &s, size_t i) { requirement of the program into a more reasonable range, so keeping all the information seems reasonable. */ struct CountSet { - string + std::string tostring() const { + // clang-format off std::ostringstream oss; - oss << prob_pos << '\t' << n_reads_pos << '\t' << prob_neg << '\t' - << n_reads_neg << '\n'; + oss << hydroxy_pos << '\t' << hydroxy_neg << '\t' + << methyl_pos << '\t' << methyl_neg << '\t' + << n_reads_pos << '\t' << n_reads_neg << '\n'; + // clang-format on return oss.str(); } void - add_count_pos(const std::uint8_t x) { - prob_pos += x; + add_count_pos(const std::uint8_t h, const std::uint8_t m) { + hydroxy_pos += h; + methyl_pos += m; ++n_reads_pos; } void - add_count_neg(const std::uint8_t x) { - prob_neg += x; + add_count_neg(const std::uint8_t h, const std::uint8_t m) { + hydroxy_neg += h; + methyl_neg += m; ++n_reads_neg; } - count_type prob_pos{0}; - count_type prob_neg{0}; + double + get_hydroxy(const bool is_c) const { + return is_c ? hydroxy_pos : hydroxy_neg; + } + double + get_methyl(const bool is_c) const { + return is_c ? methyl_pos : methyl_neg; + } + double + get_mods(const bool is_c) const { + return get_hydroxy(is_c) + get_methyl(is_c); + } + double + get_n_reads(const bool is_c) const { + return is_c ? n_reads_pos : n_reads_neg; + } + count_type hydroxy_pos{0}; + count_type hydroxy_neg{0}; + count_type methyl_pos{0}; + count_type methyl_neg{0}; count_type n_reads_pos{0}; count_type n_reads_neg{0}; }; @@ -191,116 +210,169 @@ write_output(const bam_header &hdr, bgzf_file &out, const std::int32_t tid, continue; const bool is_c = is_cytosine(base); - const double n_reads = - is_c ? counts[i].n_reads_pos : counts[i].n_reads_neg; - double n_meth = is_c ? counts[i].prob_pos : counts[i].prob_neg; - n_meth = n_meth > 0.0 ? (n_meth / 256.0) : 0.0; - if (require_covered && n_reads == 0) + const double n_reads = counts[i].get_n_reads(is_c); + if (require_covered && n_reads < 1.0) continue; + const double denom = std::max(n_reads, 1.0); + const double hydroxy = counts[i].get_hydroxy(is_c) / 256.0 / denom; + const double methyl = counts[i].get_methyl(is_c) / 256.0 / denom; + const double mods = counts[i].get_mods(is_c) / 256.0 / denom; - // const bool mut = false; buf.clear(); // ADS: here is where we make an MSite, but not using MSite + // clang-format off buf << sam_hdr_tid2name(hdr, tid) << '\t' << i << '\t' << (is_c ? '+' : '-') << '\t' << tag_values[the_tag] << '\t' - << (n_reads > 0.0 ? n_meth / n_reads : 0.0) << '\t' << n_reads + << mods << '\t' + << n_reads << '\t' + << hydroxy << '\t' + << methyl << '\n'; + // clang-format on if (!out.write(buf.c_str(), buf.tellp())) - throw dnmt_error("error writing output"); + throw std::runtime_error("error writing output"); } } } -static inline bool -confirm_mm_vals(const bamxx::bam_rec &aln, std::vector &vals) { +template +static std::tuple +get_hydroxy_sites(T mod_pos_beg, T mod_pos_end) { const char hydroxy_tag[] = "C+h?"; const auto hydroxy_tag_size = 4; - const char methyl_tag[] = "C+m?"; + auto hydroxy_beg = strstr(mod_pos_beg, hydroxy_tag); + if (hydroxy_beg == mod_pos_end) + return std::tuple{}; + auto hydroxy_end = std::find(hydroxy_beg, mod_pos_end, ';'); + if (hydroxy_end == mod_pos_end) + return std::tuple{}; + hydroxy_beg += hydroxy_tag_size; + return std::tuple(hydroxy_beg, hydroxy_end); +} + +template +static std::tuple +get_methyl_sites(T mod_pos_beg, T mod_pos_end) { + const char methyl_tag[] = "C+h?"; const auto methyl_tag_size = 4; + auto methyl_beg = strstr(mod_pos_beg, methyl_tag); + if (methyl_beg == mod_pos_end) + return std::tuple{}; + auto methyl_end = std::find(methyl_beg, mod_pos_end, ';'); + if (methyl_end == mod_pos_end) + return std::tuple{}; + methyl_beg += methyl_tag_size; + return std::tuple(methyl_beg, methyl_end); +} +static std::tuple +get_modification_positions(const bamxx::bam_rec &aln) { const auto mm_aux = bam_aux_get(aln.b, "MM"); if (mm_aux == nullptr) - return false; - - auto mm_beg = bam_aux2Z(mm_aux); - auto mm_end = mm_beg + std::strlen(mm_beg); - - auto hydroxy_beg = strstr(mm_beg, hydroxy_tag); - auto hydroxy_end = std::find(hydroxy_beg, mm_end, ';'); + return {}; + auto mod_pos_beg = bam_aux2Z(mm_aux); + auto mod_pos_end = mod_pos_beg + std::strlen(mod_pos_beg); + return {mod_pos_beg, mod_pos_end}; +} - auto methyl_beg = strstr(mm_beg, methyl_tag); - // auto methyl_end = std::find(methyl_beg, mm_end, ';'); +struct mod_prob_buffer { + static constexpr auto init_capacity{128 * 1024}; + std::vector hydroxy_probs; + std::vector methyl_probs; + mod_prob_buffer() { + methyl_probs.reserve(init_capacity); + hydroxy_probs.reserve(init_capacity); + } + bool + set_probs(const bamxx::bam_rec &aln) { + const auto get_next_mod_pos = [](auto &b, const auto e) -> std::int32_t { + const auto isdig = [](const auto x) { + return std::isdigit(static_cast(x)); + }; + b = std::find_if(b, e, isdig); + if (b == e) + return -1; + auto r = atoi(b); + b = std::find_if_not(b, e, isdig); + return r; + }; - hydroxy_beg += hydroxy_tag_size; - methyl_beg += methyl_tag_size; + auto [mod_pos_beg, mod_pos_end] = get_modification_positions(aln); + auto [hydroxy_beg, hydroxy_end] = + get_hydroxy_sites(mod_pos_beg, mod_pos_end); + auto [methyl_beg, methyl_end] = get_methyl_sites(mod_pos_beg, mod_pos_end); - if (!std::equal(hydroxy_beg, hydroxy_end, methyl_beg)) - return false; + // check if any are false + if (mod_pos_beg == nullptr || hydroxy_beg == nullptr || + methyl_beg == nullptr) + return false; - const auto isdig = [](const auto x) { - return std::isdigit(static_cast(x)); - }; - - const auto get_next = [&](auto &b, auto &e) -> std::int32_t { - b = std::find_if(b, e, isdig); - if (b == e) - return -1; - auto r = atoi(b); - b = std::find_if_not(b, e, isdig); - return r; - }; - - const auto ml_aux = bam_aux_get(aln.b, "ML"); - if (ml_aux == nullptr) - return false; - const auto ml_aux_len = bam_auxB_len(ml_aux); + // assume that hydroxy and methyl both point to CpG sites + if (!std::equal(hydroxy_beg, hydroxy_end, methyl_beg, methyl_end)) + return false; - // number of commas is number of hydroxy substrates = CpG sites - const auto n_cpgs = std::count(hydroxy_beg, hydroxy_end, ','); + const auto mod_prob = bam_aux_get(aln.b, "ML"); + if (mod_prob == nullptr) + return false; - const auto n = get_l_qseq(aln); - const auto seq = bam_get_seq(aln); - vals = std::vector(n, 0); - std::int32_t delta = get_next(hydroxy_beg, hydroxy_end); - - auto ml_idx = n_cpgs; // start methyl after hydroxy - - if (bam_is_rev(aln)) { - for (auto i = 1; i <= n; ++i) { - const auto nuc = seq_nt16_str[bam_seqi(seq, n - i)]; - if (nuc == 'G') { - if (seq_nt16_str[bam_seqi(seq, n - i - 1)] == 'C') { - if (delta != 0) - return false; - vals[i - 1] = bam_auxB2i(ml_aux, ml_idx++); + // number of commas is number of hydroxy substrates = CpG sites + const auto n_cpgs = std::count(hydroxy_beg, hydroxy_end, ','); + + const auto qlen = get_l_qseq(aln); + const auto seq = bam_get_seq(aln); + + methyl_probs.clear(); + methyl_probs.resize(qlen, 0); + + hydroxy_probs.clear(); + hydroxy_probs.resize(qlen, 0); + + std::int32_t delta = get_next_mod_pos(hydroxy_beg, hydroxy_end); + + auto hydroxy_prob_idx = 0; // start of modifications + auto methyl_prob_idx = n_cpgs; // start methyl after hydroxy + + if (bam_is_rev(aln)) { + for (auto i = 0; i < qlen; ++i) { + const auto nuc = seq_nt16_str[bam_seqi(seq, qlen - i - 1)]; + if (nuc == 'G') { + if (seq_nt16_str[bam_seqi(seq, qlen - i - 2)] == 'C') { + // assume that when delta hits 0 we have a CpG site + if (delta != 0) + return false; + methyl_probs[i] = bam_auxB2i(mod_prob, methyl_prob_idx++); + hydroxy_probs[i] = bam_auxB2i(mod_prob, hydroxy_prob_idx++); + } + --delta; + if (delta < 0) + delta = get_next_mod_pos(hydroxy_beg, hydroxy_end); } - --delta; - if (delta < 0) - delta = get_next(hydroxy_beg, hydroxy_end); } } - } - else { - for (auto i = 0; i + 1 < n; ++i) { - const auto nuc = seq_nt16_str[bam_seqi(seq, i)]; - if (nuc == 'C') { - if (seq_nt16_str[bam_seqi(seq, i + 1)] == 'G') { - if (delta != 0) - return false; - vals[i] = bam_auxB2i(ml_aux, ml_idx++); + else { + for (auto i = 0; i + 1 < qlen; ++i) { + const auto nuc = seq_nt16_str[bam_seqi(seq, i)]; + if (nuc == 'C') { + if (seq_nt16_str[bam_seqi(seq, i + 1)] == 'G') { + // assume that when delta hits 0 we have a CpG site + if (delta != 0) + return false; + methyl_probs[i] = bam_auxB2i(mod_prob, methyl_prob_idx++); + hydroxy_probs[i] = bam_auxB2i(mod_prob, hydroxy_prob_idx++); + } + --delta; + if (delta < 0) + delta = get_next_mod_pos(hydroxy_beg, hydroxy_end); } - --delta; - if (delta < 0) - delta = get_next(hydroxy_beg, hydroxy_end); } } + return true; } +}; - return true; -} - -std::uint8_t enc[] = { +// clang-format off +static const std::uint8_t enc[] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 16 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 32 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 48 @@ -318,6 +390,17 @@ std::uint8_t enc[] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 240 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 // 256 }; +// clang-format on + +static constexpr auto n_dinucs = 16u; +// clang-format off +static const auto dinucs = std::vector{ + "AA", "AC", "AG", "AT", + "CA", "CC", "CG", "CT", + "GA", "GC", "GG", "GT", + "TA", "TC", "TG", "TT", +}; +// clang-format on struct match_counter { std::array c{}; @@ -336,16 +419,12 @@ struct match_counter { std::string tostring() const { - static constexpr auto n_dinucs = 16u; - const auto dinucs = std::vector{ - "AA", "AC", "AG", "AT", "CA", "CC", "CG", "CT", - "GA", "GC", "GG", "GT", "TA", "TC", "TG", "TT", - }; std::ostringstream oss; for (auto i = 0u; i < n_dinucs; ++i) { + const auto beg = std::cbegin(c) + (i * n_dinucs); oss << dinucs[i]; - for (auto j = 0u; j < n_dinucs; ++j) - oss << '\t' << c[i * n_dinucs + j]; + for (auto itr = beg; itr != beg + n_dinucs; ++itr) + oss << '\t' << *itr; oss << '\n'; } return oss.str(); @@ -355,21 +434,15 @@ struct match_counter { tostring_frac() const { static constexpr auto width = 8; static constexpr auto prec = 3; - static constexpr auto n_dinucs = 16u; - const auto dinucs = std::vector{ - "AA", "AC", "AG", "AT", "CA", "CC", "CG", "CT", - "GA", "GC", "GG", "GT", "TA", "TC", "TG", "TT", - }; std::ostringstream oss; + oss.precision(prec); + oss.setf(std::ios::fixed, std::ios::floatfield); for (auto i = 0u; i < n_dinucs; ++i) { oss << dinucs[i]; - const auto tot = - std::accumulate(std::cbegin(c) + (i * n_dinucs), - std::cbegin(c) + ((i + 1) * n_dinucs), 0.0); - for (auto j = 0u; j < n_dinucs; ++j) { - oss << std::setw(width) << std::fixed << std::setprecision(prec) - << c[i * n_dinucs + j] / tot; - } + const auto beg = std::cbegin(c) + (i * n_dinucs); + const auto tot = std::max(std::accumulate(beg, beg + n_dinucs, 0.0), 1.0); + for (auto itr = beg; itr != beg + n_dinucs; ++itr) + oss << std::setw(width) << *itr / tot; oss << '\n'; } return oss.str(); @@ -378,7 +451,8 @@ struct match_counter { static void count_states_pos(const bam_rec &aln, vector &counts, - const std::string &chrom, match_counter &mc) { + mod_prob_buffer &mod_buf, const std::string &chrom, + match_counter &mc) { /* Move through cigar, reference and read positions without inflating cigar or read sequence */ const auto seq = bam_get_seq(aln); @@ -389,14 +463,14 @@ count_states_pos(const bam_rec &aln, vector &counts, auto r_itr = std::cbegin(chrom) + rpos; const auto r_itr_lim = std::cend(chrom); - std::vector probs; - if (!confirm_mm_vals(aln, probs)) + if (!mod_buf.set_probs(aln)) return; auto qpos = 0; // to match type with b->core.l_qseq auto q_lim = get_l_qseq(aln) - 1; // if here, this is >= 0 - auto prob_itr = std::cbegin(probs); + auto hydroxy_prob_itr = std::cbegin(mod_buf.hydroxy_probs); + auto methyl_prob_itr = std::cbegin(mod_buf.methyl_probs); for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { const char op = bam_cigar_op(*c_itr); @@ -408,12 +482,15 @@ count_states_pos(const bam_rec &aln, vector &counts, mc.add_pos(r_nuc, r_itr == r_itr_lim ? 'N' : *r_itr, seq_nt16_str[bam_seqi(seq, qpos)], qpos == q_lim ? 'N' : seq_nt16_str[bam_seqi(seq, qpos + 1)]); - counts[rpos++].add_count_pos(*prob_itr++); + counts[rpos++].add_count_pos(*hydroxy_prob_itr, *methyl_prob_itr); + ++methyl_prob_itr; + ++hydroxy_prob_itr; } } else if (eats_query(op)) { qpos += n; - prob_itr += n; + methyl_prob_itr += n; + hydroxy_prob_itr += n; } else if (eats_ref(op)) { rpos += n; @@ -429,7 +506,8 @@ count_states_pos(const bam_rec &aln, vector &counts, [[maybe_unused]] static void count_states_neg(const bam_rec &aln, vector &counts, - const std::string &chrom, match_counter &mc) { + mod_prob_buffer &mod_buf, const std::string &chrom, + match_counter &mc) { /* Move through cigar, reference and (*backward*) through read positions without inflating cigar or read sequence */ const auto seq = bam_get_seq(aln); @@ -443,11 +521,11 @@ count_states_neg(const bam_rec &aln, vector &counts, size_t q_idx = 0; size_t l_qseq = get_l_qseq(aln); - std::vector probs; // vals; - if (!confirm_mm_vals(aln, probs)) + if (!mod_buf.set_probs(aln)) return; - auto prob_itr = std::cend(probs); + auto hydroxy_prob_itr = std::cend(mod_buf.hydroxy_probs); + auto methyl_prob_itr = std::cend(mod_buf.methyl_probs); if (qpos == 0) return; @@ -463,13 +541,16 @@ count_states_neg(const bam_rec &aln, vector &counts, ++q_idx; mc.add_pos(r_nuc, r_itr == r_lim ? 'N' : *r_itr, q_nuc, q_idx == l_qseq ? 'N' : seq_nt16_str[bam_seqi(seq, q_idx)]); - counts[rpos++].add_count_neg(*--prob_itr); + --methyl_prob_itr; + --hydroxy_prob_itr; + counts[rpos++].add_count_neg(*hydroxy_prob_itr, *methyl_prob_itr); } } else if (eats_query(op)) { qpos -= n; q_idx += n; - prob_itr -= n; + methyl_prob_itr -= n; + hydroxy_prob_itr -= n; } else if (eats_ref(op)) { rpos += n; @@ -492,7 +573,7 @@ get_tid_to_idx(const bam_header &hdr, const string curr_name(hdr.h->target_name[i]); const auto name_itr(name_to_idx.find(curr_name)); if (name_itr == end(name_to_idx)) - throw dnmt_error("failed to find chrom: " + curr_name); + throw std::runtime_error("failed to find chrom: " + curr_name); tid_to_idx[i] = name_itr->second; } return tid_to_idx; @@ -509,7 +590,7 @@ output_skipped_chromosome( // get the index of the next chrom sequence const auto chrom_idx = tid_to_idx.find(tid); if (chrom_idx == std::cend(tid_to_idx)) - throw dnmt_error("chrom not found: " + sam_hdr_tid2name(hdr, tid)); + throw std::runtime_error("chrom not found: " + sam_hdr_tid2name(hdr, tid)); const auto chrom_itr = chroms_beg + chrom_idx->second; @@ -542,12 +623,12 @@ consistent_targets(const bam_header &hdr, } template -static void +static std::tuple process_reads(const bool VERBOSE, const bool show_progress, const bool compress_output, const bool include_header, const size_t n_threads, const string &infile, const string &outfile, const string &chroms_file, - const bool CPG_ONLY) { + const bool CPG_ONLY, const int strand) { // first get the chromosome names and sequences from the FASTA file vector chroms, names; read_fasta_file_short_names(chroms_file, names, chroms); @@ -570,23 +651,23 @@ process_reads(const bool VERBOSE, const bool show_progress, // open the hts SAM/BAM input file and get the header bamxx::bam_in hts(infile); if (!hts) - throw dnmt_error("failed to open input file"); + throw std::runtime_error("failed to open input file"); // load the input file's header bam_header hdr(hts); if (!hdr) - throw dnmt_error("failed to read header"); + throw std::runtime_error("failed to read header"); std::unordered_map tid_to_idx = get_tid_to_idx(hdr, name_to_idx); if (!consistent_targets(hdr, tid_to_idx, names, chrom_sizes)) - throw dnmt_error("inconsistent reference genome information"); + throw std::runtime_error("inconsistent reference genome information"); // open the output file const string output_mode = compress_output ? "w" : "wu"; bgzf_file out(outfile, output_mode); if (!out) - throw dnmt_error("error opening output file: " + outfile); + throw std::runtime_error("error opening output file: " + outfile); // set the threads for the input file decompression if (n_threads > 1) { @@ -609,6 +690,7 @@ process_reads(const bool VERBOSE, const bool show_progress, match_counter mc_pos; match_counter mc_neg; + mod_prob_buffer mod_buf; while (hts.read(hdr, aln)) { const std::int32_t tid = get_tid(aln); @@ -618,10 +700,11 @@ process_reads(const bool VERBOSE, const bool show_progress, if (tid == -1) // ADS: skip reads that have no tid -- they are not mapped continue; if (tid == prev_tid) { - if (bam_is_rev(aln)) - count_states_neg(aln, counts, *chrom_itr, mc_neg); - else - count_states_pos(aln, counts, *chrom_itr, mc_pos); + const bool is_rev = bam_is_rev(aln); + if (is_rev && strand != 1) + count_states_neg(aln, counts, mod_buf, *chrom_itr, mc_neg); + if (!is_rev && strand != 2) + count_states_pos(aln, counts, mod_buf, *chrom_itr, mc_pos); } else { // chrom has changed, so output results and get the next chrom @@ -635,7 +718,7 @@ process_reads(const bool VERBOSE, const bool show_progress, "previous tid: " + std::to_string(prev_tid) + " current tid: " + std::to_string(tid); - throw dnmt_error(message); + throw std::runtime_error(message); } if (!require_covered) @@ -646,8 +729,8 @@ process_reads(const bool VERBOSE, const bool show_progress, // get the next chrom to process auto chrom_idx(tid_to_idx.find(tid)); if (chrom_idx == end(tid_to_idx)) - throw dnmt_error("chromosome not found: " + - string(sam_hdr_tid2name(hdr, tid))); + throw std::runtime_error("chromosome not found: " + + string(sam_hdr_tid2name(hdr, tid))); if (show_progress) cerr << "processing " << sam_hdr_tid2name(hdr, tid) << endl; @@ -669,12 +752,11 @@ process_reads(const bool VERBOSE, const bool show_progress, for (auto i = prev_tid + 1; i < hdr.h->n_targets; ++i) output_skipped_chromosome(CPG_ONLY, i, tid_to_idx, hdr, chroms_beg, chrom_sizes, counts, out); - std::cout << mc_pos.tostring_frac() << std::endl; - std::cout << mc_neg.tostring_frac() << std::endl; + return {mc_pos, mc_neg}; } int -main_nanopore(int argc, const char **argv) { +main_nanocount(int argc, const char **argv) { try { @@ -684,9 +766,11 @@ main_nanopore(int argc, const char **argv) { bool require_covered = false; bool compress_output = false; bool include_header = false; + int strand = 0; string chroms_file; string outfile; + string stats_file; int n_threads = 1; /****************** COMMAND LINE OPTIONS ********************/ @@ -703,6 +787,11 @@ main_nanopore(int argc, const char **argv) { false, CPG_ONLY); opt_parse.add_opt("require-covered", 'r', "only output covered sites", false, require_covered); + opt_parse.add_opt("strand", '\0', + "use strand (1=positive, 2=negative, 0=default)", false, + strand); + opt_parse.add_opt("stats", 's', "output match/mismatch stats to this file", + false, stats_file); opt_parse.add_opt("header", 'H', "add a header to identify the reference", false, include_header); opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output); @@ -724,7 +813,7 @@ main_nanopore(int argc, const char **argv) { /****************** END COMMAND LINE OPTIONS *****************/ if (n_threads < 0) - throw dnmt_error("thread count cannot be negative"); + throw std::runtime_error("thread count cannot be negative"); std::ostringstream cmd; copy(argv, argv + argc, std::ostream_iterator(cmd, " ")); @@ -743,14 +832,25 @@ main_nanopore(int argc, const char **argv) { << "[CpG only mode: " << (CPG_ONLY ? "yes" : "no") << "]" << endl << "[command line: \"" << cmd.str() << "\"]" << endl; - if (require_covered) - process_reads(VERBOSE, show_progress, compress_output, - include_header, n_threads, mapped_reads_file, outfile, - chroms_file, CPG_ONLY); - else - process_reads(VERBOSE, show_progress, compress_output, include_header, - n_threads, mapped_reads_file, outfile, chroms_file, - CPG_ONLY); + const auto [mc_pos, mc_neg] = [&] { + if (require_covered) + return process_reads(VERBOSE, show_progress, compress_output, + include_header, n_threads, mapped_reads_file, + outfile, chroms_file, CPG_ONLY, strand); + return process_reads(VERBOSE, show_progress, compress_output, + include_header, n_threads, mapped_reads_file, + outfile, chroms_file, CPG_ONLY, strand); + }(); + + if (!stats_file.empty()) { + std::ofstream stats_out(stats_file); + if (!stats_out) + std::cerr << "Error opening stats file" << std::endl; + else { + stats_out << mc_pos.tostring_frac() << std::endl; + stats_out << mc_neg.tostring_frac() << std::endl; + } + } } catch (const std::exception &e) { cerr << e.what() << endl; From aebe4cc96943bd3bc96c8bbf57e543a67554ac8c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 8 Jun 2025 21:35:24 -0700 Subject: [PATCH 10/13] src/analysis/nanopore.cpp: added a check for the basecaller and improved file writing speed --- src/analysis/nanopore.cpp | 689 +++++++++++++++++++++----------------- 1 file changed, 379 insertions(+), 310 deletions(-) diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index d760da5f..0d826377 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "OptionParser.hpp" @@ -34,46 +35,56 @@ /* HTSlib */ #include -using std::cerr; -using std::endl; -using std::string; -using std::unordered_map; -using std::vector; - using bamxx::bam_header; using bamxx::bam_rec; using bamxx::bgzf_file; -struct quick_buf : public std::ostringstream, - public std::basic_stringbuf { - // ADS: By user ecatmur on SO; very fast. Seems to work... - quick_buf() { - // ...but this seems to depend on data layout - static_cast &>(*this).rdbuf(this); - } - void - clear() { - // reset buffer pointers (member functions) - setp(pbase(), pbase()); - } - char const * - c_str() { - /* between c_str and insertion make sure to clear() */ - *pptr() = '\0'; - return pbase(); +[[nodiscard]] static std::string +get_basecall_model(const bam_header &hdr) { + kstring_t ks{}; + + ks = {0, 0, nullptr}; + const auto rg_ret = sam_hdr_find_line_pos(hdr.h, "RG", 0, &ks); + if (rg_ret == -2) + throw std::runtime_error("failure of sam_hdr_find_line_pos"); + if (rg_ret == -1) + return {}; + ks_free(&ks); + + ks = {0, 0, nullptr}; + const auto ds_ret = sam_hdr_find_tag_pos(hdr.h, "RG", 0, "DS", &ks); + if (ds_ret == -2) + throw std::runtime_error("failure of sam_hdr_find_tag_pos"); + if (ds_ret == -1) + return {}; + std::istringstream buffer(ks.s); + ks_free(&ks); + + const std::vector parts( + (std::istream_iterator(buffer)), {}); + + std::string basecall_model; + for (const auto &key_val : parts) { + const auto equal_pos = key_val.find('='); + if (equal_pos == std::string::npos) + throw std::runtime_error("malformed DS key=val pair: " + key_val); + if (key_val.substr(0, equal_pos) == "basecall_model") + return key_val.substr(equal_pos + 1); } -}; -// ADS: here the uint16_t allows for up to 256 reads, each contributing up to -// 256 "counts" in the probability encoding. -typedef uint16_t count_type; + return {}; +} + +// ADS: here the std::uint16_t allows for up to 256 reads, each contributing up +// to 256 "counts" in the probability encoding. +typedef std::uint16_t count_type; -static inline bool +[[nodiscard]] static inline bool eats_ref(const std::uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 2; } -static inline bool +[[nodiscard]] static inline bool eats_query(const std::uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 1; } @@ -84,23 +95,22 @@ eats_query(const std::uint32_t c) { then one needs this. Also, Qiang should be consulted on this because he spent much time thinking about it in the context of plants. */ -static bool -is_chh(const std::string &s, size_t i) { - return (i < (s.length() - 2)) && is_cytosine(s[i]) && !is_guanine(s[i + 1]) && +[[nodiscard]] static bool +is_chh(const std::string &s, std::size_t i) { + return i + 2 < std::size(s) && is_cytosine(s[i]) && !is_guanine(s[i + 1]) && !is_guanine(s[i + 2]); } -static bool -is_ddg(const std::string &s, size_t i) { - return (i < (s.length() - 2)) && !is_cytosine(s[i]) && - !is_cytosine(s[i + 1]) && is_guanine(s[i + 2]); +[[nodiscard]] static bool +is_ddg(const std::string &s, std::size_t i) { + return i + 2 < std::size(s) && !is_cytosine(s[i]) && !is_cytosine(s[i + 1]) && + is_guanine(s[i + 2]); } -static bool -is_c_at_g(const std::string &s, size_t i) { - return (i < (s.length() - 2)) && is_cytosine(s[i]) && - !is_cytosine(s[i + 1]) && !is_guanine(s[i + 1]) && - is_guanine(s[i + 2]); +[[nodiscard]] static bool +is_c_at_g(const std::string &s, std::size_t i) { + return i + 2 < std::size(s) && is_cytosine(s[i]) && !is_cytosine(s[i + 1]) && + !is_guanine(s[i + 1]) && is_guanine(s[i + 2]); } /* Right now the CountSet objects below are much larger than they need @@ -159,10 +169,10 @@ struct CountSet { * the order of checking conditions doesn't matter. There is also a * bit of a hack in that the unsigned "pos" could wrap, but this still * works as long as the chromosome size is not the maximum size of a - * size_t. + * std::size_t. */ -static std::uint32_t -get_tag_from_genome(const string &s, const size_t pos) { +[[nodiscard]] static std::uint32_t +get_tag_from_genome(const std::string &s, const std::size_t pos) { if (is_cytosine(s[pos])) { if (is_cpg(s, pos)) return 0; @@ -194,50 +204,8 @@ static const char *tag_values[] = { "N", // 4 }; -template -static void -write_output(const bam_header &hdr, bgzf_file &out, const std::int32_t tid, - const string &chrom, const vector &counts, - bool CPG_ONLY) { - - quick_buf buf; // keep underlying buffer space? - for (size_t i = 0; i < std::size(chrom); ++i) { - const char base = chrom[i]; - if (is_cytosine(base) || is_guanine(base)) { - - const std::uint32_t the_tag = get_tag_from_genome(chrom, i); - if (CPG_ONLY && the_tag != 0) - continue; - - const bool is_c = is_cytosine(base); - - const double n_reads = counts[i].get_n_reads(is_c); - if (require_covered && n_reads < 1.0) - continue; - const double denom = std::max(n_reads, 1.0); - const double hydroxy = counts[i].get_hydroxy(is_c) / 256.0 / denom; - const double methyl = counts[i].get_methyl(is_c) / 256.0 / denom; - const double mods = counts[i].get_mods(is_c) / 256.0 / denom; - - buf.clear(); - // ADS: here is where we make an MSite, but not using MSite - // clang-format off - buf << sam_hdr_tid2name(hdr, tid) << '\t' << i << '\t' - << (is_c ? '+' : '-') << '\t' << tag_values[the_tag] << '\t' - << mods << '\t' - << n_reads << '\t' - << hydroxy << '\t' - << methyl - << '\n'; - // clang-format on - if (!out.write(buf.c_str(), buf.tellp())) - throw std::runtime_error("error writing output"); - } - } -} - template -static std::tuple +[[nodiscard]] static std::tuple get_hydroxy_sites(T mod_pos_beg, T mod_pos_end) { const char hydroxy_tag[] = "C+h?"; const auto hydroxy_tag_size = 4; @@ -252,7 +220,7 @@ get_hydroxy_sites(T mod_pos_beg, T mod_pos_end) { } template -static std::tuple +[[nodiscard]] static std::tuple get_methyl_sites(T mod_pos_beg, T mod_pos_end) { const char methyl_tag[] = "C+h?"; const auto methyl_tag_size = 4; @@ -266,7 +234,7 @@ get_methyl_sites(T mod_pos_beg, T mod_pos_end) { return std::tuple(methyl_beg, methyl_end); } -static std::tuple +[[nodiscard]] static std::tuple get_modification_positions(const bamxx::bam_rec &aln) { const auto mm_aux = bam_aux_get(aln.b, "MM"); if (mm_aux == nullptr) @@ -278,12 +246,15 @@ get_modification_positions(const bamxx::bam_rec &aln) { struct mod_prob_buffer { static constexpr auto init_capacity{128 * 1024}; + std::vector hydroxy_probs; std::vector methyl_probs; + mod_prob_buffer() { methyl_probs.reserve(init_capacity); hydroxy_probs.reserve(init_capacity); } + bool set_probs(const bamxx::bam_rec &aln) { const auto get_next_mod_pos = [](auto &b, const auto e) -> std::int32_t { @@ -417,7 +388,7 @@ struct match_counter { c[(r1e * 64) + (r2e * 16) + (q1e * 4) + (q2e * 1)]++; } - std::string + [[nodiscard]] std::string tostring() const { std::ostringstream oss; for (auto i = 0u; i < n_dinucs; ++i) { @@ -430,7 +401,7 @@ struct match_counter { return oss.str(); } - std::string + [[nodiscard]] std::string tostring_frac() const { static constexpr auto width = 8; static constexpr auto prec = 3; @@ -450,7 +421,7 @@ struct match_counter { }; static void -count_states_pos(const bam_rec &aln, vector &counts, +count_states_pos(const bam_rec &aln, std::vector &counts, mod_prob_buffer &mod_buf, const std::string &chrom, match_counter &mc) { /* Move through cigar, reference and read positions without @@ -458,10 +429,10 @@ count_states_pos(const bam_rec &aln, vector &counts, const auto seq = bam_get_seq(aln); const auto beg_cig = bam_get_cigar(aln); const auto end_cig = beg_cig + get_n_cigar(aln); + const auto end_ref = std::cend(chrom); auto rpos = get_pos(aln); - auto r_itr = std::cbegin(chrom) + rpos; - const auto r_itr_lim = std::cend(chrom); + auto ref_itr = std::cbegin(chrom) + rpos; if (!mod_buf.set_probs(aln)) return; @@ -478,8 +449,8 @@ count_states_pos(const bam_rec &aln, vector &counts, if (eats_ref(op) && eats_query(op)) { const decltype(qpos) end_qpos = qpos + n; for (; qpos < end_qpos; ++qpos) { - const auto r_nuc = *r_itr++; - mc.add_pos(r_nuc, r_itr == r_itr_lim ? 'N' : *r_itr, + const auto r_nuc = *ref_itr++; + mc.add_pos(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, seq_nt16_str[bam_seqi(seq, qpos)], qpos == q_lim ? 'N' : seq_nt16_str[bam_seqi(seq, qpos + 1)]); counts[rpos++].add_count_pos(*hydroxy_prob_itr, *methyl_prob_itr); @@ -494,7 +465,7 @@ count_states_pos(const bam_rec &aln, vector &counts, } else if (eats_ref(op)) { rpos += n; - r_itr += n; + ref_itr += n; } } // ADS: somehow previous code included a correction for rpos going @@ -504,8 +475,8 @@ count_states_pos(const bam_rec &aln, vector &counts, assert(qpos == get_l_qseq(aln)); } -[[maybe_unused]] static void -count_states_neg(const bam_rec &aln, vector &counts, +static void +count_states_neg(const bam_rec &aln, std::vector &counts, mod_prob_buffer &mod_buf, const std::string &chrom, match_counter &mc) { /* Move through cigar, reference and (*backward*) through read @@ -513,13 +484,13 @@ count_states_neg(const bam_rec &aln, vector &counts, const auto seq = bam_get_seq(aln); const auto beg_cig = bam_get_cigar(aln); const auto end_cig = beg_cig + get_n_cigar(aln); - size_t rpos = get_pos(aln); - auto r_itr = std::cbegin(chrom) + rpos; - const auto r_lim = std::cend(chrom); + const auto end_ref = std::cend(chrom); + auto rpos = get_pos(aln); + auto ref_itr = std::cbegin(chrom) + rpos; - size_t qpos = get_l_qseq(aln); // to match type with b->core.l_qseq - size_t q_idx = 0; - size_t l_qseq = get_l_qseq(aln); + std::size_t qpos = get_l_qseq(aln); // to match type with b->core.l_qseq + std::size_t q_idx = 0; + std::size_t l_qseq = get_l_qseq(aln); if (!mod_buf.set_probs(aln)) return; @@ -534,12 +505,12 @@ count_states_neg(const bam_rec &aln, vector &counts, const std::uint32_t n = bam_cigar_oplen(*c_itr); if (eats_ref(op) && eats_query(op)) { assert(qpos >= n); - const size_t end_qpos = qpos - n; // to match type with qpos + const std::size_t end_qpos = qpos - n; // to match type with qpos for (; qpos > end_qpos; --qpos) { - const auto r_nuc = *r_itr++; + const auto r_nuc = *ref_itr++; const auto q_nuc = seq_nt16_str[bam_seqi(seq, q_idx)]; ++q_idx; - mc.add_pos(r_nuc, r_itr == r_lim ? 'N' : *r_itr, q_nuc, + mc.add_pos(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, q_nuc, q_idx == l_qseq ? 'N' : seq_nt16_str[bam_seqi(seq, q_idx)]); --methyl_prob_itr; --hydroxy_prob_itr; @@ -554,7 +525,7 @@ count_states_neg(const bam_rec &aln, vector &counts, } else if (eats_ref(op)) { rpos += n; - r_itr += n; + ref_itr += n; } } @@ -563,14 +534,14 @@ count_states_neg(const bam_rec &aln, vector &counts, assert(qpos == 0); } -static std::unordered_map +[[nodiscard]] static std::unordered_map get_tid_to_idx(const bam_header &hdr, - const std::unordered_map name_to_idx) { - std::unordered_map tid_to_idx; + const std::unordered_map name_to_idx) { + std::unordered_map tid_to_idx; for (std::int32_t i = 0; i < hdr.h->n_targets; ++i) { // "curr_name" gives a "tid_to_name" mapping allowing to jump // through "name_to_idx" and get "tid_to_idx" - const string curr_name(hdr.h->target_name[i]); + const std::string curr_name(hdr.h->target_name[i]); const auto name_itr(name_to_idx.find(curr_name)); if (name_itr == end(name_to_idx)) throw std::runtime_error("failed to find chrom: " + curr_name); @@ -579,39 +550,19 @@ get_tid_to_idx(const bam_header &hdr, return tid_to_idx; } -template -static void -output_skipped_chromosome( - const bool CPG_ONLY, const std::int32_t tid, - const std::unordered_map &tid_to_idx, - const bam_header &hdr, const vector::const_iterator chroms_beg, - const vector &chrom_sizes, vector &counts, bgzf_file &out) { - - // get the index of the next chrom sequence - const auto chrom_idx = tid_to_idx.find(tid); - if (chrom_idx == std::cend(tid_to_idx)) - throw std::runtime_error("chrom not found: " + sam_hdr_tid2name(hdr, tid)); - - const auto chrom_itr = chroms_beg + chrom_idx->second; - - // reset the counts - counts.clear(); - counts.resize(chrom_sizes[chrom_idx->second]); - - write_output(hdr, out, tid, *chrom_itr, counts, CPG_ONLY); -} - -static bool -consistent_targets(const bam_header &hdr, - const std::unordered_map &tid_to_idx, - const vector &names, const vector &sizes) { - const size_t n_targets = hdr.h->n_targets; +[[nodiscard]] static bool +consistent_targets( + const bam_header &hdr, + const std::unordered_map &tid_to_idx, + const std::vector &names, + const std::vector &sizes) { + const std::size_t n_targets = hdr.h->n_targets; if (n_targets != names.size()) return false; - for (size_t tid = 0; tid < n_targets; ++tid) { - const string tid_name_sam = sam_hdr_tid2name(hdr, tid); - const size_t tid_size_sam = sam_hdr_tid2len(hdr, tid); + for (std::size_t tid = 0; tid < n_targets; ++tid) { + const std::string tid_name_sam = sam_hdr_tid2name(hdr, tid); + const std::size_t tid_size_sam = sam_hdr_tid2len(hdr, tid); const auto idx_itr = tid_to_idx.find(tid); if (idx_itr == std::cend(tid_to_idx)) return false; @@ -622,197 +573,328 @@ consistent_targets(const bam_header &hdr, return true; } -template -static std::tuple -process_reads(const bool VERBOSE, const bool show_progress, - const bool compress_output, const bool include_header, - const size_t n_threads, const string &infile, - const string &outfile, const string &chroms_file, - const bool CPG_ONLY, const int strand) { - // first get the chromosome names and sequences from the FASTA file - vector chroms, names; - read_fasta_file_short_names(chroms_file, names, chroms); - for (auto &i : chroms) - transform(std::cbegin(i), std::cend(i), std::begin(i), - [](const char c) { return std::toupper(c); }); - if (VERBOSE) - cerr << "[n chroms in reference: " << chroms.size() << "]" << endl; - const auto chroms_beg = std::cbegin(chroms); - - std::unordered_map name_to_idx; - vector chrom_sizes(chroms.size(), 0); - for (size_t i = 0; i < chroms.size(); ++i) { - name_to_idx[names[i]] = i; - chrom_sizes[i] = chroms[i].size(); +struct read_processor { + static constexpr auto default_expected_basecall_model = + "dna_r10.4.1_e8.2_400bps_hac@v4.3.0"; + + bool verbose{}; + bool show_progress{}; + bool compress_output{}; + bool include_header{}; + bool require_covered{}; + bool cpg_only{}; + bool force{}; + std::uint32_t n_threads{1}; + int strand{0}; + std::string expected_basecall_model{}; + + [[nodiscard]] std::string + tostring() const { + std::ostringstream oss; + oss << std::boolalpha; + oss << "[verbose: " << verbose << "]\n" + << "[show_progress: " << show_progress << "]\n" + << "[compress_output: " << compress_output << "]\n" + << "[include_header: " << include_header << "]\n" + << "[require_covered: " << require_covered << "]\n" + << "[cpg_only: " << cpg_only << "]\n" + << "[force: " << force << "]\n" + << "[n_threads: " << n_threads << "]\n" + << "[strand: " << strand << "]\n" + << "[expected_basecall_model: " << expected_basecall_model_str() + << "]\n"; + return oss.str(); + } + + read_processor() : expected_basecall_model{default_expected_basecall_model} {} + + [[nodiscard]] std::string + expected_basecall_model_str() const { + return expected_basecall_model.empty() ? "NA" : expected_basecall_model; } - bamxx::bam_tpool tp(n_threads); // Must be destroyed after hts - - // open the hts SAM/BAM input file and get the header - bamxx::bam_in hts(infile); - if (!hts) - throw std::runtime_error("failed to open input file"); - // load the input file's header - bam_header hdr(hts); - if (!hdr) - throw std::runtime_error("failed to read header"); - - std::unordered_map tid_to_idx = - get_tid_to_idx(hdr, name_to_idx); - - if (!consistent_targets(hdr, tid_to_idx, names, chrom_sizes)) - throw std::runtime_error("inconsistent reference genome information"); - - // open the output file - const string output_mode = compress_output ? "w" : "wu"; - bgzf_file out(outfile, output_mode); - if (!out) - throw std::runtime_error("error opening output file: " + outfile); - - // set the threads for the input file decompression - if (n_threads > 1) { - tp.set_io(hts); - tp.set_io(out); + void + output_skipped_chromosome( + const std::int32_t tid, + const std::unordered_map &tid_to_idx, + const bam_header &hdr, + const std::vector::const_iterator chroms_beg, + const std::vector &chrom_sizes, std::vector &counts, + bgzf_file &out) const { + + // get the index of the next chrom sequence + const auto chrom_idx = tid_to_idx.find(tid); + if (chrom_idx == std::cend(tid_to_idx)) + throw std::runtime_error("chrom not found: " + + sam_hdr_tid2name(hdr, tid)); + + const auto chrom_itr = chroms_beg + chrom_idx->second; + + // reset the counts + counts.clear(); + counts.resize(chrom_sizes[chrom_idx->second]); + + // ADS: 'false' below for require_covered b/c these sites can't be covered. + write_output(hdr, out, tid, *chrom_itr, counts); + } + + void + write_output(const bam_header &hdr, bgzf_file &out, const std::int32_t tid, + const std::string &chrom, + const std::vector &counts) const { + static constexpr auto out_fmt = "%ld\t%c\t%s\t%.6g\t%d\t%.6g\t%.6g\n"; + static constexpr auto buf_size = 1024; + char buffer[buf_size]; + + // Put chrom name in buffer and then skip that part for each site because + // it doesn't change. + const auto chrom_name = sam_hdr_tid2name(hdr.h, tid); + if (chrom_name == nullptr) + throw std::runtime_error("failed to identify chrom for tid: " + + std::to_string(tid)); + const auto chrom_name_offset = std::sprintf(buffer, "%s\t", chrom_name); + if (chrom_name_offset < 0) + throw std::runtime_error("failed to write to output buffer"); + auto buffer_after_chrom = buffer + chrom_name_offset; + + for (std::size_t i = 0; i < std::size(chrom); ++i) { + const char base = chrom[i]; + if (is_cytosine(base) || is_guanine(base)) { + + const std::uint32_t the_tag = get_tag_from_genome(chrom, i); + if (cpg_only && the_tag != 0) + continue; + + const bool is_c = is_cytosine(base); + + const std::uint32_t n_reads = counts[i].get_n_reads(is_c); + if (require_covered && n_reads == 0) + continue; + const double denom = std::max(n_reads, 1u); + const double hydroxy = counts[i].get_hydroxy(is_c) / 256.0 / denom; + const double methyl = counts[i].get_methyl(is_c) / 256.0 / denom; + const double mods = counts[i].get_mods(is_c) / 256.0 / denom; + + // clang-format off + int r = std::sprintf(buffer_after_chrom, out_fmt, + i, + is_c ? '+' : '-', + tag_values[the_tag], + mods, + n_reads, + hydroxy, + methyl); + // clang-format on + + if (r < 0) + throw std::runtime_error("failed to write to output buffer"); + out.write(buffer); + } + } } - if (include_header) - write_counts_header_from_bam_header(hdr, out); + [[nodiscard]] std::tuple + operator()(const std::string &infile, const std::string &outfile, + const std::string &chroms_file) const { + // first get the chromosome names and sequences from the FASTA file + std::vector chroms, names; + read_fasta_file_short_names(chroms_file, names, chroms); + for (auto &i : chroms) + transform(std::cbegin(i), std::cend(i), std::begin(i), + [](const char c) { return std::toupper(c); }); + if (verbose) + std::cerr << "[n chroms in reference: " << chroms.size() << "]" + << std::endl; + const auto chroms_beg = std::cbegin(chroms); + + std::unordered_map name_to_idx; + std::vector chrom_sizes(chroms.size(), 0); + for (std::size_t i = 0; i < chroms.size(); ++i) { + name_to_idx[names[i]] = i; + chrom_sizes[i] = chroms[i].size(); + } + + bamxx::bam_tpool tp(n_threads); // Must be destroyed after hts + + // open the hts SAM/BAM input file and get the header + bamxx::bam_in hts(infile); + if (!hts) + throw std::runtime_error("failed to open input file"); + // load the input file's header + bam_header hdr(hts); + if (!hdr) + throw std::runtime_error("failed to read header"); + + std::unordered_map tid_to_idx = + get_tid_to_idx(hdr, name_to_idx); + + if (!consistent_targets(hdr, tid_to_idx, names, chrom_sizes)) + throw std::runtime_error("inconsistent reference genome information"); + + // open the output file + const std::string output_mode = compress_output ? "w" : "wu"; + bgzf_file out(outfile, output_mode); + if (!out) + throw std::runtime_error("error opening output file: " + outfile); + + // set the threads for the input file decompression + if (n_threads > 1) { + tp.set_io(hts); + tp.set_io(out); + } + + // validate the basecall model + const auto basecall_model = get_basecall_model(hdr); + if (verbose) + std::cerr << "[observed basecall model: " + << (basecall_model.empty() ? "NA" : basecall_model) << "]" + << std::endl; + if (!expected_basecall_model.empty() && + basecall_model != expected_basecall_model) { + std::cerr << "failed to match basecall model:" << "\n" + << "observed=" + << (basecall_model.empty() ? "NA" : basecall_model) << "\n" + << "expected=" << expected_basecall_model_str() << std::endl; + return {}; + } - // now iterate over the reads, switching chromosomes and writing - // output as needed - bam_rec aln; - std::int32_t prev_tid = -1; + if (include_header) + write_counts_header_from_bam_header(hdr, out); - // this is where all the counts are accumulated - vector counts; + // this is where all the counts are accumulated + std::vector counts; - vector::const_iterator chrom_itr{}; + // now iterate over the reads, switching chromosomes and writing + // output as needed + bam_rec aln; + std::int32_t prev_tid = -1; - match_counter mc_pos; - match_counter mc_neg; - mod_prob_buffer mod_buf; + std::vector::const_iterator chrom_itr{}; - while (hts.read(hdr, aln)) { - const std::int32_t tid = get_tid(aln); - if (get_l_qseq(aln) == 0) - continue; + match_counter mc_pos; + match_counter mc_neg; + mod_prob_buffer mod_buf; - if (tid == -1) // ADS: skip reads that have no tid -- they are not mapped - continue; - if (tid == prev_tid) { - const bool is_rev = bam_is_rev(aln); - if (is_rev && strand != 1) - count_states_neg(aln, counts, mod_buf, *chrom_itr, mc_neg); - if (!is_rev && strand != 2) - count_states_pos(aln, counts, mod_buf, *chrom_itr, mc_pos); - } - else { // chrom has changed, so output results and get the next chrom - - // write output if there is any; counts is empty only for first chrom - if (!counts.empty()) - write_output(hdr, out, prev_tid, *chrom_itr, counts, - CPG_ONLY); - // make sure reads are sorted chrom tid number in header - if (tid < prev_tid) { - const std::string message = "SAM file is not sorted " - "previous tid: " + - std::to_string(prev_tid) + - " current tid: " + std::to_string(tid); - throw std::runtime_error(message); + while (hts.read(hdr, aln)) { + const std::int32_t tid = get_tid(aln); + if (get_l_qseq(aln) == 0) + continue; + + if (tid == -1) // ADS: skip reads that have no tid -- they are not mapped + continue; + if (tid == prev_tid) { + const bool is_rev = bam_is_rev(aln); + if (is_rev && strand != 1) + count_states_neg(aln, counts, mod_buf, *chrom_itr, mc_neg); + if (!is_rev && strand != 2) + count_states_pos(aln, counts, mod_buf, *chrom_itr, mc_pos); } + else { // chrom has changed, so output results and get the next chrom + + // write output if there is any; counts is empty only for first chrom + if (!counts.empty()) + write_output(hdr, out, prev_tid, *chrom_itr, counts); + // make sure reads are sorted chrom tid number in header + if (tid < prev_tid) { + const std::string message = "SAM file is not sorted " + "previous tid: " + + std::to_string(prev_tid) + + " current tid: " + std::to_string(tid); + throw std::runtime_error(message); + } - if (!require_covered) - for (auto i = prev_tid + 1; i < tid; ++i) - output_skipped_chromosome(CPG_ONLY, i, tid_to_idx, hdr, chroms_beg, - chrom_sizes, counts, out); - - // get the next chrom to process - auto chrom_idx(tid_to_idx.find(tid)); - if (chrom_idx == end(tid_to_idx)) - throw std::runtime_error("chromosome not found: " + - string(sam_hdr_tid2name(hdr, tid))); - if (show_progress) - cerr << "processing " << sam_hdr_tid2name(hdr, tid) << endl; - - prev_tid = tid; - chrom_itr = chroms_beg + chrom_idx->second; - - // reset the counts - counts.clear(); - counts.resize(chrom_sizes[chrom_idx->second]); + if (!require_covered) + for (auto i = prev_tid + 1; i < tid; ++i) + output_skipped_chromosome(i, tid_to_idx, hdr, chroms_beg, + chrom_sizes, counts, out); + + // get the next chrom to process + auto chrom_idx(tid_to_idx.find(tid)); + if (chrom_idx == end(tid_to_idx)) + throw std::runtime_error("chromosome not found: " + + std::string(sam_hdr_tid2name(hdr, tid))); + if (show_progress) + std::cerr << "processing " << sam_hdr_tid2name(hdr, tid) << std::endl; + + prev_tid = tid; + chrom_itr = chroms_beg + chrom_idx->second; + + // reset the counts + counts.clear(); + counts.resize(chrom_sizes[chrom_idx->second]); + } } + if (!counts.empty()) + write_output(hdr, out, prev_tid, *chrom_itr, counts); + + // ADS: if some chroms might not be covered by reads, we have to + // iterate over what remains + if (!require_covered) + for (auto i = prev_tid + 1; i < hdr.h->n_targets; ++i) + output_skipped_chromosome(i, tid_to_idx, hdr, chroms_beg, chrom_sizes, + counts, out); + return {mc_pos, mc_neg}; } - if (!counts.empty()) - write_output(hdr, out, prev_tid, *chrom_itr, counts, - CPG_ONLY); - - // ADS: if some chroms might not be covered by reads, we have to - // iterate over what remains - if (!require_covered) - for (auto i = prev_tid + 1; i < hdr.h->n_targets; ++i) - output_skipped_chromosome(CPG_ONLY, i, tid_to_idx, hdr, chroms_beg, - chrom_sizes, counts, out); - return {mc_pos, mc_neg}; -} +}; int main_nanocount(int argc, const char **argv) { try { - bool VERBOSE = false; - bool show_progress = false; - bool CPG_ONLY = false; - bool require_covered = false; - bool compress_output = false; - bool include_header = false; - int strand = 0; + read_processor rp; - string chroms_file; - string outfile; - string stats_file; - int n_threads = 1; + std::string chroms_file; + std::string outfile; + std::string stats_file; /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(strip_path(argv[0]), "get methylation levels from mapped nanopore reads", "-c "); opt_parse.add_opt("threads", 't', "threads to use (few needed)", false, - n_threads); + rp.n_threads); opt_parse.add_opt("output", 'o', "output file name (default: stdout)", false, outfile); opt_parse.add_opt("chrom", 'c', "reference genome file (FASTA format)", true, chroms_file); opt_parse.add_opt("cpg-only", 'n', "print only CpG context cytosines", - false, CPG_ONLY); + false, rp.cpg_only); opt_parse.add_opt("require-covered", 'r', "only output covered sites", - false, require_covered); + false, rp.require_covered); opt_parse.add_opt("strand", '\0', "use strand (1=positive, 2=negative, 0=default)", false, - strand); + rp.strand); opt_parse.add_opt("stats", 's', "output match/mismatch stats to this file", false, stats_file); + opt_parse.add_opt("force", '\0', "skip consistency checks", false, + rp.force); opt_parse.add_opt("header", 'H', "add a header to identify the reference", - false, include_header); - opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output); - opt_parse.add_opt("progress", '\0', "show progress", false, show_progress); - opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); - vector leftover_args; + false, rp.include_header); + opt_parse.add_opt("zip", 'z', "output gzip format", false, + rp.compress_output); + opt_parse.add_opt("progress", '\0', "show progress", false, + rp.show_progress); + opt_parse.add_opt("verbose", 'v', "print more run info", false, rp.verbose); + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (opt_parse.about_requested() || opt_parse.help_requested() || leftover_args.empty()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + std::cerr << opt_parse.help_message() << std::endl + << opt_parse.about_message() << std::endl; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + std::cerr << opt_parse.option_missing_message() << std::endl; return EXIT_SUCCESS; } - const string mapped_reads_file = leftover_args.front(); + const std::string mapped_reads_file = leftover_args.front(); /****************** END COMMAND LINE OPTIONS *****************/ - if (n_threads < 0) + if (rp.force) + rp.expected_basecall_model.clear(); + + if (rp.n_threads <= 0) throw std::runtime_error("thread count cannot be negative"); std::ostringstream cmd; @@ -822,38 +904,25 @@ main_nanocount(int argc, const char **argv) { if (outfile.empty()) outfile = "-"; - if (VERBOSE) - cerr << "[input BAM/SAM file: " << mapped_reads_file << "]" << endl - << "[output file: " << outfile << "]" << endl - << "[output format: " << (compress_output ? "bgzf" : "text") << "]" - << endl - << "[genome file: " << chroms_file << "]" << endl - << "[threads requested: " << n_threads << "]" << endl - << "[CpG only mode: " << (CPG_ONLY ? "yes" : "no") << "]" << endl - << "[command line: \"" << cmd.str() << "\"]" << endl; - - const auto [mc_pos, mc_neg] = [&] { - if (require_covered) - return process_reads(VERBOSE, show_progress, compress_output, - include_header, n_threads, mapped_reads_file, - outfile, chroms_file, CPG_ONLY, strand); - return process_reads(VERBOSE, show_progress, compress_output, - include_header, n_threads, mapped_reads_file, - outfile, chroms_file, CPG_ONLY, strand); - }(); + if (rp.verbose) + std::cerr << "[input BAM/SAM file: " << mapped_reads_file << "]\n" + << "[output file: " << outfile << "]\n" + << "[genome file: " << chroms_file << "]\n" + << rp.tostring(); + const auto [mc_pos, mc_neg] = rp(mapped_reads_file, outfile, chroms_file); if (!stats_file.empty()) { std::ofstream stats_out(stats_file); - if (!stats_out) + if (!stats_out) { std::cerr << "Error opening stats file" << std::endl; - else { - stats_out << mc_pos.tostring_frac() << std::endl; - stats_out << mc_neg.tostring_frac() << std::endl; + return EXIT_FAILURE; } + stats_out << mc_pos.tostring_frac() << std::endl + << mc_neg.tostring_frac() << std::endl; } } catch (const std::exception &e) { - cerr << e.what() << endl; + std::cerr << e.what() << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; From 3cbf2038bfa0a5f31b2c572ef29e9d50ca46527e Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 8 Jun 2025 22:15:08 -0700 Subject: [PATCH 11/13] src/analysis/nanopore.cpp: added writing for stats in json format --- src/analysis/nanopore.cpp | 106 +++++++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 19 deletions(-) diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index 0d826377..c53cdfd0 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -1,4 +1,5 @@ -/* nanocount: methylation counts for nanopore data +/* nanocount: methylation counts for nanopore data; see docs for 'counts' + * command for details. * * Copyright (C) 2025 Andrew D. Smith * @@ -374,7 +375,9 @@ static const auto dinucs = std::vector{ // clang-format on struct match_counter { - std::array c{}; + typedef std::array container; + container pos{}; + container neg{}; void add_pos(const std::uint8_t r1, const std::uint8_t r2, const std::uint8_t q1, @@ -385,22 +388,81 @@ struct match_counter { const auto q2e = enc[q2]; if ((r1e | r2e | q1e | q2e) & 4u) return; - c[(r1e * 64) + (r2e * 16) + (q1e * 4) + (q2e * 1)]++; + pos[(r1e * 64) + (r2e * 16) + (q1e * 4) + (q2e * 1)]++; + } + + void + add_neg(const std::uint8_t r1, const std::uint8_t r2, const std::uint8_t q1, + const std::uint8_t q2) { + const auto r1e = enc[r1]; + const auto r2e = enc[r2]; + const auto q1e = enc[q1]; + const auto q2e = enc[q2]; + if ((r1e | r2e | q1e | q2e) & 4u) + return; + neg[(r1e * 64) + (r2e * 16) + (q1e * 4) + (q2e * 1)]++; } [[nodiscard]] std::string tostring() const { std::ostringstream oss; + oss << "target_query_fwd" << '\n'; for (auto i = 0u; i < n_dinucs; ++i) { - const auto beg = std::cbegin(c) + (i * n_dinucs); + const auto beg = std::cbegin(pos) + (i * n_dinucs); oss << dinucs[i]; for (auto itr = beg; itr != beg + n_dinucs; ++itr) oss << '\t' << *itr; oss << '\n'; } + oss << "target_query_rev" << '\n'; + for (auto i = 0u; i < n_dinucs; ++i) { + const auto beg = std::cbegin(neg) + (i * n_dinucs); + oss << dinucs[i]; + for (auto itr = beg; itr != beg + n_dinucs; ++itr) + oss << '\t' << *itr; + oss << '\n'; + } + return oss.str(); + } + + [[nodiscard]] std::string + json_row(const container::const_iterator beg) const { + std::ostringstream oss; + oss << "{"; + auto j = 0; + for (auto itr = beg; itr != beg + n_dinucs; ++itr) { + if (j > 0) + oss << ","; + oss << R"(")" << dinucs[j++] << R"(":")" << *itr << R"(")"; + } + oss << "}"; return oss.str(); } + [[nodiscard]] std::string + json_table(const container &a) const { + std::ostringstream oss; + const auto beg = std::cbegin(a); + oss << "{"; + for (auto i = 0u; i < n_dinucs; ++i) { + if (i > 0) + oss << ","; + oss << R"(")" << dinucs[i] << R"(":)" << json_row(beg + (i * n_dinucs)); + } + oss << "}"; + return oss.str(); + } + + [[nodiscard]] std::string + json() const { + // clang-format off + return (std::ostringstream() << "{" + << R"("map_fwd":)" << json_table(pos) << "," + << R"("map_rev":)" << json_table(neg) + << "}").str(); + // clang-format on + } + [[nodiscard]] std::string tostring_frac() const { static constexpr auto width = 8; @@ -408,9 +470,19 @@ struct match_counter { std::ostringstream oss; oss.precision(prec); oss.setf(std::ios::fixed, std::ios::floatfield); + oss << "map_fwd\n"; + for (auto i = 0u; i < n_dinucs; ++i) { + oss << dinucs[i]; + const auto beg = std::cbegin(pos) + (i * n_dinucs); + const auto tot = std::max(std::accumulate(beg, beg + n_dinucs, 0.0), 1.0); + for (auto itr = beg; itr != beg + n_dinucs; ++itr) + oss << std::setw(width) << *itr / tot; + oss << '\n'; + } + oss << "map_ref\n"; for (auto i = 0u; i < n_dinucs; ++i) { oss << dinucs[i]; - const auto beg = std::cbegin(c) + (i * n_dinucs); + const auto beg = std::cbegin(neg) + (i * n_dinucs); const auto tot = std::max(std::accumulate(beg, beg + n_dinucs, 0.0), 1.0); for (auto itr = beg; itr != beg + n_dinucs; ++itr) oss << std::setw(width) << *itr / tot; @@ -510,7 +582,7 @@ count_states_neg(const bam_rec &aln, std::vector &counts, const auto r_nuc = *ref_itr++; const auto q_nuc = seq_nt16_str[bam_seqi(seq, q_idx)]; ++q_idx; - mc.add_pos(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, q_nuc, + mc.add_neg(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, q_nuc, q_idx == l_qseq ? 'N' : seq_nt16_str[bam_seqi(seq, q_idx)]); --methyl_prob_itr; --hydroxy_prob_itr; @@ -693,7 +765,7 @@ struct read_processor { } } - [[nodiscard]] std::tuple + [[nodiscard]] match_counter operator()(const std::string &infile, const std::string &outfile, const std::string &chroms_file) const { // first get the chromosome names and sequences from the FASTA file @@ -771,8 +843,7 @@ struct read_processor { std::vector::const_iterator chrom_itr{}; - match_counter mc_pos; - match_counter mc_neg; + match_counter mc; mod_prob_buffer mod_buf; while (hts.read(hdr, aln)) { @@ -785,9 +856,9 @@ struct read_processor { if (tid == prev_tid) { const bool is_rev = bam_is_rev(aln); if (is_rev && strand != 1) - count_states_neg(aln, counts, mod_buf, *chrom_itr, mc_neg); + count_states_neg(aln, counts, mod_buf, *chrom_itr, mc); if (!is_rev && strand != 2) - count_states_pos(aln, counts, mod_buf, *chrom_itr, mc_pos); + count_states_pos(aln, counts, mod_buf, *chrom_itr, mc); } else { // chrom has changed, so output results and get the next chrom @@ -833,7 +904,7 @@ struct read_processor { for (auto i = prev_tid + 1; i < hdr.h->n_targets; ++i) output_skipped_chromosome(i, tid_to_idx, hdr, chroms_beg, chrom_sizes, counts, out); - return {mc_pos, mc_neg}; + return mc; } }; @@ -910,15 +981,12 @@ main_nanocount(int argc, const char **argv) { << "[genome file: " << chroms_file << "]\n" << rp.tostring(); - const auto [mc_pos, mc_neg] = rp(mapped_reads_file, outfile, chroms_file); + const auto mc = rp(mapped_reads_file, outfile, chroms_file); if (!stats_file.empty()) { std::ofstream stats_out(stats_file); - if (!stats_out) { - std::cerr << "Error opening stats file" << std::endl; - return EXIT_FAILURE; - } - stats_out << mc_pos.tostring_frac() << std::endl - << mc_neg.tostring_frac() << std::endl; + if (!stats_out) + throw std::runtime_error("Error opening stats file: " + stats_file); + stats_out << mc.json() << std::endl; } } catch (const std::exception &e) { From a3574962a84ad86474aa32cc7b002ab78e2ee97d Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 8 Jun 2025 22:16:30 -0700 Subject: [PATCH 12/13] src/dnmtools.cpp: changed the command for counts for nanopore reads to nanocounts --- src/dnmtools.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dnmtools.cpp b/src/dnmtools.cpp index cd77fb89..ff5881d7 100644 --- a/src/dnmtools.cpp +++ b/src/dnmtools.cpp @@ -169,7 +169,7 @@ main(int argc, const char **argv) { {"uniq", "remove duplicate reads from sorted mapped reads", main_uniq}, {"bsrate", "compute the BS conversion rate from BS-seq reads mapped to a genome", main_bsrate}, {"counts", "get methylation levels from mapped WGBS reads", main_counts}, - {"nano", "get methylation levels from mapped nanopore reads", main_nanocount}, + {"counts-nano", "get methylation levels from mapped nanopore reads", main_nanocount}, {"sym", "get CpG sites and make methylation levels symmetric", main_symmetric_cpgs}, {"levels", "compute methylation summary statistics from a counts file", main_levels}}}}, From e5023bd635eced0cf97721173f1577fcb2b2e582 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 8 Jun 2025 22:29:11 -0700 Subject: [PATCH 13/13] data/md5sum.txt: updating checksums for last digits of values in the sym and roi test files which have changed after incorporating a different rounding in sym that works for nanopore --- data/md5sum.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/md5sum.txt b/data/md5sum.txt index 91c77bbf..d796535c 100644 --- a/data/md5sum.txt +++ b/data/md5sum.txt @@ -4,7 +4,6 @@ ae05a28de5643a512386e767b3aa963a tests/araTha1_simulated.hypermr 75777c209bf820ab700801d87a0a3615 tests/reads.bsrate e73facd597c3b903cbfe29afa9f58371 tests/reads.counts 56575da7d3af9b696258512142903d1e tests/reads.counts.select -34bf6331c3e6c59fd8709fe112960ec4 tests/reads.counts.sym 0f72560aa101e85679783a1ecaf80615 tests/reads.epiread 9dbd476424d48a8d0f043dfc00af0d23 tests/reads.fmt.srt.sam 4085cc74b003a918b4a4743fca7922a4 tests/reads.hmr @@ -12,7 +11,6 @@ e73facd597c3b903cbfe29afa9f58371 tests/reads.counts a389e4ee1687895517b0b0d26e9338ee tests/reads.mstats d8856f9731af76b8a4ab3cc7d667cdb2 tests/reads.ustats bcbf01be810cbf4051292813eb6b9225 tests/tRex1.idx -c0951c0682abfb4fb6383c94b96c84cf tests/tRex1_promoters.roi.bed ec6a686617cad31e9f7a37a3d378e6ed tests/two_epialleles.states 93e38b20d162062a5d147c4290095a13 tests/mlml.out d947fe3d61ef7b1564558a69608f0e64 tests/methylome.pmd @@ -22,3 +20,5 @@ d6c983396860541a29b730e4acd0ef88 tests/reads.sam 1251050c362589319ca65151708ff998 tests/reads.fmt.srt.uniq.sam 534ed1d8320e4b250c6d675bba23fac9 tests/reads.xcounts 838e61c5a3155db8075eeeec53d25efa tests/reads.unxcounts +001b9d966f62fa439b24cf2198cc3de5 tests/reads.counts.sym +2b8a0406015458be51b8b1c9e58b3602 tests/tRex1_promoters.roi.bed