diff --git a/Makefile.am b/Makefile.am index c79401c4..9e832e16 100644 --- a/Makefile.am +++ b/Makefile.am @@ -205,6 +205,7 @@ dnmtools_SOURCES += src/analysis/methstates.cpp dnmtools_SOURCES += src/analysis/bsrate.cpp dnmtools_SOURCES += src/analysis/methentropy.cpp dnmtools_SOURCES += src/analysis/methcounts.cpp +dnmtools_SOURCES += src/analysis/nanopore.cpp dnmtools_SOURCES += src/analysis/roimethstat.cpp dnmtools_SOURCES += src/analysis/cpgbins.cpp dnmtools_SOURCES += src/analysis/multimethstat.cpp diff --git a/data/md5sum.txt b/data/md5sum.txt index 91c77bbf..d796535c 100644 --- a/data/md5sum.txt +++ b/data/md5sum.txt @@ -4,7 +4,6 @@ ae05a28de5643a512386e767b3aa963a tests/araTha1_simulated.hypermr 75777c209bf820ab700801d87a0a3615 tests/reads.bsrate e73facd597c3b903cbfe29afa9f58371 tests/reads.counts 56575da7d3af9b696258512142903d1e tests/reads.counts.select -34bf6331c3e6c59fd8709fe112960ec4 tests/reads.counts.sym 0f72560aa101e85679783a1ecaf80615 tests/reads.epiread 9dbd476424d48a8d0f043dfc00af0d23 tests/reads.fmt.srt.sam 4085cc74b003a918b4a4743fca7922a4 tests/reads.hmr @@ -12,7 +11,6 @@ e73facd597c3b903cbfe29afa9f58371 tests/reads.counts a389e4ee1687895517b0b0d26e9338ee tests/reads.mstats d8856f9731af76b8a4ab3cc7d667cdb2 tests/reads.ustats bcbf01be810cbf4051292813eb6b9225 tests/tRex1.idx -c0951c0682abfb4fb6383c94b96c84cf tests/tRex1_promoters.roi.bed ec6a686617cad31e9f7a37a3d378e6ed tests/two_epialleles.states 93e38b20d162062a5d147c4290095a13 tests/mlml.out d947fe3d61ef7b1564558a69608f0e64 tests/methylome.pmd @@ -22,3 +20,5 @@ d6c983396860541a29b730e4acd0ef88 tests/reads.sam 1251050c362589319ca65151708ff998 tests/reads.fmt.srt.uniq.sam 534ed1d8320e4b250c6d675bba23fac9 tests/reads.xcounts 838e61c5a3155db8075eeeec53d25efa tests/reads.unxcounts +001b9d966f62fa439b24cf2198cc3de5 tests/reads.counts.sym +2b8a0406015458be51b8b1c9e58b3602 tests/tRex1_promoters.roi.bed diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp new file mode 100644 index 00000000..c53cdfd0 --- /dev/null +++ b/src/analysis/nanopore.cpp @@ -0,0 +1,997 @@ +/* nanocount: methylation counts for nanopore data; see docs for 'counts' + * command for details. + * + * Copyright (C) 2025 Andrew D. Smith + * + * Author: Andrew D. Smith + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include // for [u]int[0-9]+_t +#include +#include +#include +#include +#include +#include +#include +#include + +#include "OptionParser.hpp" +#include "bam_record_utils.hpp" +#include "bsutils.hpp" +#include "counts_header.hpp" +#include "dnmt_error.hpp" + +/* HTSlib */ +#include + +using bamxx::bam_header; +using bamxx::bam_rec; +using bamxx::bgzf_file; + +[[nodiscard]] static std::string +get_basecall_model(const bam_header &hdr) { + kstring_t ks{}; + + ks = {0, 0, nullptr}; + const auto rg_ret = sam_hdr_find_line_pos(hdr.h, "RG", 0, &ks); + if (rg_ret == -2) + throw std::runtime_error("failure of sam_hdr_find_line_pos"); + if (rg_ret == -1) + return {}; + ks_free(&ks); + + ks = {0, 0, nullptr}; + const auto ds_ret = sam_hdr_find_tag_pos(hdr.h, "RG", 0, "DS", &ks); + if (ds_ret == -2) + throw std::runtime_error("failure of sam_hdr_find_tag_pos"); + if (ds_ret == -1) + return {}; + std::istringstream buffer(ks.s); + ks_free(&ks); + + const std::vector parts( + (std::istream_iterator(buffer)), {}); + + std::string basecall_model; + for (const auto &key_val : parts) { + const auto equal_pos = key_val.find('='); + if (equal_pos == std::string::npos) + throw std::runtime_error("malformed DS key=val pair: " + key_val); + if (key_val.substr(0, equal_pos) == "basecall_model") + return key_val.substr(equal_pos + 1); + } + + return {}; +} + +// ADS: here the std::uint16_t allows for up to 256 reads, each contributing up +// to 256 "counts" in the probability encoding. +typedef std::uint16_t count_type; + +[[nodiscard]] static inline bool +eats_ref(const std::uint32_t c) { + return bam_cigar_type(bam_cigar_op(c)) & 2; +} + +[[nodiscard]] static inline bool +eats_query(const std::uint32_t c) { + return bam_cigar_type(bam_cigar_op(c)) & 1; +} + +/* The three functions below here should probably be moved into + bsutils.hpp. I am not sure if the DDG function is needed, but it + seems like if one considers strand, and the CHH is not symmetric, + then one needs this. Also, Qiang should be consulted on this + because he spent much time thinking about it in the context of + plants. */ +[[nodiscard]] static bool +is_chh(const std::string &s, std::size_t i) { + return i + 2 < std::size(s) && is_cytosine(s[i]) && !is_guanine(s[i + 1]) && + !is_guanine(s[i + 2]); +} + +[[nodiscard]] static bool +is_ddg(const std::string &s, std::size_t i) { + return i + 2 < std::size(s) && !is_cytosine(s[i]) && !is_cytosine(s[i + 1]) && + is_guanine(s[i + 2]); +} + +[[nodiscard]] static bool +is_c_at_g(const std::string &s, std::size_t i) { + return i + 2 < std::size(s) && is_cytosine(s[i]) && !is_cytosine(s[i + 1]) && + !is_guanine(s[i + 1]) && is_guanine(s[i + 2]); +} + +/* Right now the CountSet objects below are much larger than they need + to be, for the things we are computing. However, it's not clear + that the minimum information would really put the memory + requirement of the program into a more reasonable range, so keeping + all the information seems reasonable. */ +struct CountSet { + std::string + tostring() const { + // clang-format off + std::ostringstream oss; + oss << hydroxy_pos << '\t' << hydroxy_neg << '\t' + << methyl_pos << '\t' << methyl_neg << '\t' + << n_reads_pos << '\t' << n_reads_neg << '\n'; + // clang-format on + return oss.str(); + } + void + add_count_pos(const std::uint8_t h, const std::uint8_t m) { + hydroxy_pos += h; + methyl_pos += m; + ++n_reads_pos; + } + void + add_count_neg(const std::uint8_t h, const std::uint8_t m) { + hydroxy_neg += h; + methyl_neg += m; + ++n_reads_neg; + } + double + get_hydroxy(const bool is_c) const { + return is_c ? hydroxy_pos : hydroxy_neg; + } + double + get_methyl(const bool is_c) const { + return is_c ? methyl_pos : methyl_neg; + } + double + get_mods(const bool is_c) const { + return get_hydroxy(is_c) + get_methyl(is_c); + } + double + get_n_reads(const bool is_c) const { + return is_c ? n_reads_pos : n_reads_neg; + } + count_type hydroxy_pos{0}; + count_type hydroxy_neg{0}; + count_type methyl_pos{0}; + count_type methyl_neg{0}; + count_type n_reads_pos{0}; + count_type n_reads_neg{0}; +}; + +/* The "tag" returned by this function should be exclusive, so that + * the order of checking conditions doesn't matter. There is also a + * bit of a hack in that the unsigned "pos" could wrap, but this still + * works as long as the chromosome size is not the maximum size of a + * std::size_t. + */ +[[nodiscard]] static std::uint32_t +get_tag_from_genome(const std::string &s, const std::size_t pos) { + if (is_cytosine(s[pos])) { + if (is_cpg(s, pos)) + return 0; + else if (is_chh(s, pos)) + return 1; + else if (is_c_at_g(s, pos)) + return 2; + else + return 3; + } + if (is_guanine(s[pos])) { + if (is_cpg(s, pos - 1)) + return 0; + else if (is_ddg(s, pos - 2)) + return 1; + else if (is_c_at_g(s, pos - 2)) + return 2; + else + return 3; + } + return 4; // shouldn't be used for anything +} + +static const char *tag_values[] = { + "CpG", // 0 + "CHH", // 1 + "CXG", // 2 + "CCG", // 3 + "N", // 4 +}; + +template +[[nodiscard]] static std::tuple +get_hydroxy_sites(T mod_pos_beg, T mod_pos_end) { + const char hydroxy_tag[] = "C+h?"; + const auto hydroxy_tag_size = 4; + auto hydroxy_beg = strstr(mod_pos_beg, hydroxy_tag); + if (hydroxy_beg == mod_pos_end) + return std::tuple{}; + auto hydroxy_end = std::find(hydroxy_beg, mod_pos_end, ';'); + if (hydroxy_end == mod_pos_end) + return std::tuple{}; + hydroxy_beg += hydroxy_tag_size; + return std::tuple(hydroxy_beg, hydroxy_end); +} + +template +[[nodiscard]] static std::tuple +get_methyl_sites(T mod_pos_beg, T mod_pos_end) { + const char methyl_tag[] = "C+h?"; + const auto methyl_tag_size = 4; + auto methyl_beg = strstr(mod_pos_beg, methyl_tag); + if (methyl_beg == mod_pos_end) + return std::tuple{}; + auto methyl_end = std::find(methyl_beg, mod_pos_end, ';'); + if (methyl_end == mod_pos_end) + return std::tuple{}; + methyl_beg += methyl_tag_size; + return std::tuple(methyl_beg, methyl_end); +} + +[[nodiscard]] static std::tuple +get_modification_positions(const bamxx::bam_rec &aln) { + const auto mm_aux = bam_aux_get(aln.b, "MM"); + if (mm_aux == nullptr) + return {}; + auto mod_pos_beg = bam_aux2Z(mm_aux); + auto mod_pos_end = mod_pos_beg + std::strlen(mod_pos_beg); + return {mod_pos_beg, mod_pos_end}; +} + +struct mod_prob_buffer { + static constexpr auto init_capacity{128 * 1024}; + + std::vector hydroxy_probs; + std::vector methyl_probs; + + mod_prob_buffer() { + methyl_probs.reserve(init_capacity); + hydroxy_probs.reserve(init_capacity); + } + + bool + set_probs(const bamxx::bam_rec &aln) { + const auto get_next_mod_pos = [](auto &b, const auto e) -> std::int32_t { + const auto isdig = [](const auto x) { + return std::isdigit(static_cast(x)); + }; + b = std::find_if(b, e, isdig); + if (b == e) + return -1; + auto r = atoi(b); + b = std::find_if_not(b, e, isdig); + return r; + }; + + auto [mod_pos_beg, mod_pos_end] = get_modification_positions(aln); + auto [hydroxy_beg, hydroxy_end] = + get_hydroxy_sites(mod_pos_beg, mod_pos_end); + auto [methyl_beg, methyl_end] = get_methyl_sites(mod_pos_beg, mod_pos_end); + + // check if any are false + if (mod_pos_beg == nullptr || hydroxy_beg == nullptr || + methyl_beg == nullptr) + return false; + + // assume that hydroxy and methyl both point to CpG sites + if (!std::equal(hydroxy_beg, hydroxy_end, methyl_beg, methyl_end)) + return false; + + const auto mod_prob = bam_aux_get(aln.b, "ML"); + if (mod_prob == nullptr) + return false; + + // number of commas is number of hydroxy substrates = CpG sites + const auto n_cpgs = std::count(hydroxy_beg, hydroxy_end, ','); + + const auto qlen = get_l_qseq(aln); + const auto seq = bam_get_seq(aln); + + methyl_probs.clear(); + methyl_probs.resize(qlen, 0); + + hydroxy_probs.clear(); + hydroxy_probs.resize(qlen, 0); + + std::int32_t delta = get_next_mod_pos(hydroxy_beg, hydroxy_end); + + auto hydroxy_prob_idx = 0; // start of modifications + auto methyl_prob_idx = n_cpgs; // start methyl after hydroxy + + if (bam_is_rev(aln)) { + for (auto i = 0; i < qlen; ++i) { + const auto nuc = seq_nt16_str[bam_seqi(seq, qlen - i - 1)]; + if (nuc == 'G') { + if (seq_nt16_str[bam_seqi(seq, qlen - i - 2)] == 'C') { + // assume that when delta hits 0 we have a CpG site + if (delta != 0) + return false; + methyl_probs[i] = bam_auxB2i(mod_prob, methyl_prob_idx++); + hydroxy_probs[i] = bam_auxB2i(mod_prob, hydroxy_prob_idx++); + } + --delta; + if (delta < 0) + delta = get_next_mod_pos(hydroxy_beg, hydroxy_end); + } + } + } + else { + for (auto i = 0; i + 1 < qlen; ++i) { + const auto nuc = seq_nt16_str[bam_seqi(seq, i)]; + if (nuc == 'C') { + if (seq_nt16_str[bam_seqi(seq, i + 1)] == 'G') { + // assume that when delta hits 0 we have a CpG site + if (delta != 0) + return false; + methyl_probs[i] = bam_auxB2i(mod_prob, methyl_prob_idx++); + hydroxy_probs[i] = bam_auxB2i(mod_prob, hydroxy_prob_idx++); + } + --delta; + if (delta < 0) + delta = get_next_mod_pos(hydroxy_beg, hydroxy_end); + } + } + } + return true; + } +}; + +// clang-format off +static const std::uint8_t enc[] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 16 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 32 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 48 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 64 + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 80 + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 96 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 112 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 128 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 144 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 160 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 176 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 192 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 208 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 224 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 240 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 // 256 +}; +// clang-format on + +static constexpr auto n_dinucs = 16u; +// clang-format off +static const auto dinucs = std::vector{ + "AA", "AC", "AG", "AT", + "CA", "CC", "CG", "CT", + "GA", "GC", "GG", "GT", + "TA", "TC", "TG", "TT", +}; +// clang-format on + +struct match_counter { + typedef std::array container; + container pos{}; + container neg{}; + + void + add_pos(const std::uint8_t r1, const std::uint8_t r2, const std::uint8_t q1, + const std::uint8_t q2) { + const auto r1e = enc[r1]; + const auto r2e = enc[r2]; + const auto q1e = enc[q1]; + const auto q2e = enc[q2]; + if ((r1e | r2e | q1e | q2e) & 4u) + return; + pos[(r1e * 64) + (r2e * 16) + (q1e * 4) + (q2e * 1)]++; + } + + void + add_neg(const std::uint8_t r1, const std::uint8_t r2, const std::uint8_t q1, + const std::uint8_t q2) { + const auto r1e = enc[r1]; + const auto r2e = enc[r2]; + const auto q1e = enc[q1]; + const auto q2e = enc[q2]; + if ((r1e | r2e | q1e | q2e) & 4u) + return; + neg[(r1e * 64) + (r2e * 16) + (q1e * 4) + (q2e * 1)]++; + } + + [[nodiscard]] std::string + tostring() const { + std::ostringstream oss; + oss << "target_query_fwd" << '\n'; + for (auto i = 0u; i < n_dinucs; ++i) { + const auto beg = std::cbegin(pos) + (i * n_dinucs); + oss << dinucs[i]; + for (auto itr = beg; itr != beg + n_dinucs; ++itr) + oss << '\t' << *itr; + oss << '\n'; + } + oss << "target_query_rev" << '\n'; + for (auto i = 0u; i < n_dinucs; ++i) { + const auto beg = std::cbegin(neg) + (i * n_dinucs); + oss << dinucs[i]; + for (auto itr = beg; itr != beg + n_dinucs; ++itr) + oss << '\t' << *itr; + oss << '\n'; + } + return oss.str(); + } + + [[nodiscard]] std::string + json_row(const container::const_iterator beg) const { + std::ostringstream oss; + oss << "{"; + auto j = 0; + for (auto itr = beg; itr != beg + n_dinucs; ++itr) { + if (j > 0) + oss << ","; + oss << R"(")" << dinucs[j++] << R"(":")" << *itr << R"(")"; + } + oss << "}"; + return oss.str(); + } + + [[nodiscard]] std::string + json_table(const container &a) const { + std::ostringstream oss; + const auto beg = std::cbegin(a); + oss << "{"; + for (auto i = 0u; i < n_dinucs; ++i) { + if (i > 0) + oss << ","; + oss << R"(")" << dinucs[i] << R"(":)" << json_row(beg + (i * n_dinucs)); + } + oss << "}"; + return oss.str(); + } + + [[nodiscard]] std::string + json() const { + // clang-format off + return (std::ostringstream() << "{" + << R"("map_fwd":)" << json_table(pos) << "," + << R"("map_rev":)" << json_table(neg) + << "}").str(); + // clang-format on + } + + [[nodiscard]] std::string + tostring_frac() const { + static constexpr auto width = 8; + static constexpr auto prec = 3; + std::ostringstream oss; + oss.precision(prec); + oss.setf(std::ios::fixed, std::ios::floatfield); + oss << "map_fwd\n"; + for (auto i = 0u; i < n_dinucs; ++i) { + oss << dinucs[i]; + const auto beg = std::cbegin(pos) + (i * n_dinucs); + const auto tot = std::max(std::accumulate(beg, beg + n_dinucs, 0.0), 1.0); + for (auto itr = beg; itr != beg + n_dinucs; ++itr) + oss << std::setw(width) << *itr / tot; + oss << '\n'; + } + oss << "map_ref\n"; + for (auto i = 0u; i < n_dinucs; ++i) { + oss << dinucs[i]; + const auto beg = std::cbegin(neg) + (i * n_dinucs); + const auto tot = std::max(std::accumulate(beg, beg + n_dinucs, 0.0), 1.0); + for (auto itr = beg; itr != beg + n_dinucs; ++itr) + oss << std::setw(width) << *itr / tot; + oss << '\n'; + } + return oss.str(); + } +}; + +static void +count_states_pos(const bam_rec &aln, std::vector &counts, + mod_prob_buffer &mod_buf, const std::string &chrom, + match_counter &mc) { + /* Move through cigar, reference and read positions without + inflating cigar or read sequence */ + const auto seq = bam_get_seq(aln); + const auto beg_cig = bam_get_cigar(aln); + const auto end_cig = beg_cig + get_n_cigar(aln); + const auto end_ref = std::cend(chrom); + + auto rpos = get_pos(aln); + auto ref_itr = std::cbegin(chrom) + rpos; + + if (!mod_buf.set_probs(aln)) + return; + + auto qpos = 0; // to match type with b->core.l_qseq + auto q_lim = get_l_qseq(aln) - 1; // if here, this is >= 0 + + auto hydroxy_prob_itr = std::cbegin(mod_buf.hydroxy_probs); + auto methyl_prob_itr = std::cbegin(mod_buf.methyl_probs); + + for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { + const char op = bam_cigar_op(*c_itr); + const std::uint32_t n = bam_cigar_oplen(*c_itr); + if (eats_ref(op) && eats_query(op)) { + const decltype(qpos) end_qpos = qpos + n; + for (; qpos < end_qpos; ++qpos) { + const auto r_nuc = *ref_itr++; + mc.add_pos(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, + seq_nt16_str[bam_seqi(seq, qpos)], + qpos == q_lim ? 'N' : seq_nt16_str[bam_seqi(seq, qpos + 1)]); + counts[rpos++].add_count_pos(*hydroxy_prob_itr, *methyl_prob_itr); + ++methyl_prob_itr; + ++hydroxy_prob_itr; + } + } + else if (eats_query(op)) { + qpos += n; + methyl_prob_itr += n; + hydroxy_prob_itr += n; + } + else if (eats_ref(op)) { + rpos += n; + ref_itr += n; + } + } + // ADS: somehow previous code included a correction for rpos going + // past the end of the chromosome; this should result at least in a + // soft-clip by any mapper. I'm not checking it here as even if it + // happens I don't want to terminate. + assert(qpos == get_l_qseq(aln)); +} + +static void +count_states_neg(const bam_rec &aln, std::vector &counts, + mod_prob_buffer &mod_buf, const std::string &chrom, + match_counter &mc) { + /* Move through cigar, reference and (*backward*) through read + positions without inflating cigar or read sequence */ + const auto seq = bam_get_seq(aln); + const auto beg_cig = bam_get_cigar(aln); + const auto end_cig = beg_cig + get_n_cigar(aln); + const auto end_ref = std::cend(chrom); + auto rpos = get_pos(aln); + auto ref_itr = std::cbegin(chrom) + rpos; + + std::size_t qpos = get_l_qseq(aln); // to match type with b->core.l_qseq + std::size_t q_idx = 0; + std::size_t l_qseq = get_l_qseq(aln); + + if (!mod_buf.set_probs(aln)) + return; + + auto hydroxy_prob_itr = std::cend(mod_buf.hydroxy_probs); + auto methyl_prob_itr = std::cend(mod_buf.methyl_probs); + + if (qpos == 0) + return; + for (auto c_itr = beg_cig; c_itr != end_cig; ++c_itr) { + const char op = bam_cigar_op(*c_itr); + const std::uint32_t n = bam_cigar_oplen(*c_itr); + if (eats_ref(op) && eats_query(op)) { + assert(qpos >= n); + const std::size_t end_qpos = qpos - n; // to match type with qpos + for (; qpos > end_qpos; --qpos) { + const auto r_nuc = *ref_itr++; + const auto q_nuc = seq_nt16_str[bam_seqi(seq, q_idx)]; + ++q_idx; + mc.add_neg(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, q_nuc, + q_idx == l_qseq ? 'N' : seq_nt16_str[bam_seqi(seq, q_idx)]); + --methyl_prob_itr; + --hydroxy_prob_itr; + counts[rpos++].add_count_neg(*hydroxy_prob_itr, *methyl_prob_itr); + } + } + else if (eats_query(op)) { + qpos -= n; + q_idx += n; + methyl_prob_itr -= n; + hydroxy_prob_itr -= n; + } + else if (eats_ref(op)) { + rpos += n; + ref_itr += n; + } + } + + /* qpos is unsigned; would wrap around if < 0 */ + // ADS: Same as count_states_pos; see comment there + assert(qpos == 0); +} + +[[nodiscard]] static std::unordered_map +get_tid_to_idx(const bam_header &hdr, + const std::unordered_map name_to_idx) { + std::unordered_map tid_to_idx; + for (std::int32_t i = 0; i < hdr.h->n_targets; ++i) { + // "curr_name" gives a "tid_to_name" mapping allowing to jump + // through "name_to_idx" and get "tid_to_idx" + const std::string curr_name(hdr.h->target_name[i]); + const auto name_itr(name_to_idx.find(curr_name)); + if (name_itr == end(name_to_idx)) + throw std::runtime_error("failed to find chrom: " + curr_name); + tid_to_idx[i] = name_itr->second; + } + return tid_to_idx; +} + +[[nodiscard]] static bool +consistent_targets( + const bam_header &hdr, + const std::unordered_map &tid_to_idx, + const std::vector &names, + const std::vector &sizes) { + const std::size_t n_targets = hdr.h->n_targets; + if (n_targets != names.size()) + return false; + + for (std::size_t tid = 0; tid < n_targets; ++tid) { + const std::string tid_name_sam = sam_hdr_tid2name(hdr, tid); + const std::size_t tid_size_sam = sam_hdr_tid2len(hdr, tid); + const auto idx_itr = tid_to_idx.find(tid); + if (idx_itr == std::cend(tid_to_idx)) + return false; + const auto idx = idx_itr->second; + if (tid_name_sam != names[idx] || tid_size_sam != sizes[idx]) + return false; + } + return true; +} + +struct read_processor { + static constexpr auto default_expected_basecall_model = + "dna_r10.4.1_e8.2_400bps_hac@v4.3.0"; + + bool verbose{}; + bool show_progress{}; + bool compress_output{}; + bool include_header{}; + bool require_covered{}; + bool cpg_only{}; + bool force{}; + std::uint32_t n_threads{1}; + int strand{0}; + std::string expected_basecall_model{}; + + [[nodiscard]] std::string + tostring() const { + std::ostringstream oss; + oss << std::boolalpha; + oss << "[verbose: " << verbose << "]\n" + << "[show_progress: " << show_progress << "]\n" + << "[compress_output: " << compress_output << "]\n" + << "[include_header: " << include_header << "]\n" + << "[require_covered: " << require_covered << "]\n" + << "[cpg_only: " << cpg_only << "]\n" + << "[force: " << force << "]\n" + << "[n_threads: " << n_threads << "]\n" + << "[strand: " << strand << "]\n" + << "[expected_basecall_model: " << expected_basecall_model_str() + << "]\n"; + return oss.str(); + } + + read_processor() : expected_basecall_model{default_expected_basecall_model} {} + + [[nodiscard]] std::string + expected_basecall_model_str() const { + return expected_basecall_model.empty() ? "NA" : expected_basecall_model; + } + + void + output_skipped_chromosome( + const std::int32_t tid, + const std::unordered_map &tid_to_idx, + const bam_header &hdr, + const std::vector::const_iterator chroms_beg, + const std::vector &chrom_sizes, std::vector &counts, + bgzf_file &out) const { + + // get the index of the next chrom sequence + const auto chrom_idx = tid_to_idx.find(tid); + if (chrom_idx == std::cend(tid_to_idx)) + throw std::runtime_error("chrom not found: " + + sam_hdr_tid2name(hdr, tid)); + + const auto chrom_itr = chroms_beg + chrom_idx->second; + + // reset the counts + counts.clear(); + counts.resize(chrom_sizes[chrom_idx->second]); + + // ADS: 'false' below for require_covered b/c these sites can't be covered. + write_output(hdr, out, tid, *chrom_itr, counts); + } + + void + write_output(const bam_header &hdr, bgzf_file &out, const std::int32_t tid, + const std::string &chrom, + const std::vector &counts) const { + static constexpr auto out_fmt = "%ld\t%c\t%s\t%.6g\t%d\t%.6g\t%.6g\n"; + static constexpr auto buf_size = 1024; + char buffer[buf_size]; + + // Put chrom name in buffer and then skip that part for each site because + // it doesn't change. + const auto chrom_name = sam_hdr_tid2name(hdr.h, tid); + if (chrom_name == nullptr) + throw std::runtime_error("failed to identify chrom for tid: " + + std::to_string(tid)); + const auto chrom_name_offset = std::sprintf(buffer, "%s\t", chrom_name); + if (chrom_name_offset < 0) + throw std::runtime_error("failed to write to output buffer"); + auto buffer_after_chrom = buffer + chrom_name_offset; + + for (std::size_t i = 0; i < std::size(chrom); ++i) { + const char base = chrom[i]; + if (is_cytosine(base) || is_guanine(base)) { + + const std::uint32_t the_tag = get_tag_from_genome(chrom, i); + if (cpg_only && the_tag != 0) + continue; + + const bool is_c = is_cytosine(base); + + const std::uint32_t n_reads = counts[i].get_n_reads(is_c); + if (require_covered && n_reads == 0) + continue; + const double denom = std::max(n_reads, 1u); + const double hydroxy = counts[i].get_hydroxy(is_c) / 256.0 / denom; + const double methyl = counts[i].get_methyl(is_c) / 256.0 / denom; + const double mods = counts[i].get_mods(is_c) / 256.0 / denom; + + // clang-format off + int r = std::sprintf(buffer_after_chrom, out_fmt, + i, + is_c ? '+' : '-', + tag_values[the_tag], + mods, + n_reads, + hydroxy, + methyl); + // clang-format on + + if (r < 0) + throw std::runtime_error("failed to write to output buffer"); + out.write(buffer); + } + } + } + + [[nodiscard]] match_counter + operator()(const std::string &infile, const std::string &outfile, + const std::string &chroms_file) const { + // first get the chromosome names and sequences from the FASTA file + std::vector chroms, names; + read_fasta_file_short_names(chroms_file, names, chroms); + for (auto &i : chroms) + transform(std::cbegin(i), std::cend(i), std::begin(i), + [](const char c) { return std::toupper(c); }); + if (verbose) + std::cerr << "[n chroms in reference: " << chroms.size() << "]" + << std::endl; + const auto chroms_beg = std::cbegin(chroms); + + std::unordered_map name_to_idx; + std::vector chrom_sizes(chroms.size(), 0); + for (std::size_t i = 0; i < chroms.size(); ++i) { + name_to_idx[names[i]] = i; + chrom_sizes[i] = chroms[i].size(); + } + + bamxx::bam_tpool tp(n_threads); // Must be destroyed after hts + + // open the hts SAM/BAM input file and get the header + bamxx::bam_in hts(infile); + if (!hts) + throw std::runtime_error("failed to open input file"); + // load the input file's header + bam_header hdr(hts); + if (!hdr) + throw std::runtime_error("failed to read header"); + + std::unordered_map tid_to_idx = + get_tid_to_idx(hdr, name_to_idx); + + if (!consistent_targets(hdr, tid_to_idx, names, chrom_sizes)) + throw std::runtime_error("inconsistent reference genome information"); + + // open the output file + const std::string output_mode = compress_output ? "w" : "wu"; + bgzf_file out(outfile, output_mode); + if (!out) + throw std::runtime_error("error opening output file: " + outfile); + + // set the threads for the input file decompression + if (n_threads > 1) { + tp.set_io(hts); + tp.set_io(out); + } + + // validate the basecall model + const auto basecall_model = get_basecall_model(hdr); + if (verbose) + std::cerr << "[observed basecall model: " + << (basecall_model.empty() ? "NA" : basecall_model) << "]" + << std::endl; + if (!expected_basecall_model.empty() && + basecall_model != expected_basecall_model) { + std::cerr << "failed to match basecall model:" << "\n" + << "observed=" + << (basecall_model.empty() ? "NA" : basecall_model) << "\n" + << "expected=" << expected_basecall_model_str() << std::endl; + return {}; + } + + if (include_header) + write_counts_header_from_bam_header(hdr, out); + + // this is where all the counts are accumulated + std::vector counts; + + // now iterate over the reads, switching chromosomes and writing + // output as needed + bam_rec aln; + std::int32_t prev_tid = -1; + + std::vector::const_iterator chrom_itr{}; + + match_counter mc; + mod_prob_buffer mod_buf; + + while (hts.read(hdr, aln)) { + const std::int32_t tid = get_tid(aln); + if (get_l_qseq(aln) == 0) + continue; + + if (tid == -1) // ADS: skip reads that have no tid -- they are not mapped + continue; + if (tid == prev_tid) { + const bool is_rev = bam_is_rev(aln); + if (is_rev && strand != 1) + count_states_neg(aln, counts, mod_buf, *chrom_itr, mc); + if (!is_rev && strand != 2) + count_states_pos(aln, counts, mod_buf, *chrom_itr, mc); + } + else { // chrom has changed, so output results and get the next chrom + + // write output if there is any; counts is empty only for first chrom + if (!counts.empty()) + write_output(hdr, out, prev_tid, *chrom_itr, counts); + // make sure reads are sorted chrom tid number in header + if (tid < prev_tid) { + const std::string message = "SAM file is not sorted " + "previous tid: " + + std::to_string(prev_tid) + + " current tid: " + std::to_string(tid); + throw std::runtime_error(message); + } + + if (!require_covered) + for (auto i = prev_tid + 1; i < tid; ++i) + output_skipped_chromosome(i, tid_to_idx, hdr, chroms_beg, + chrom_sizes, counts, out); + + // get the next chrom to process + auto chrom_idx(tid_to_idx.find(tid)); + if (chrom_idx == end(tid_to_idx)) + throw std::runtime_error("chromosome not found: " + + std::string(sam_hdr_tid2name(hdr, tid))); + if (show_progress) + std::cerr << "processing " << sam_hdr_tid2name(hdr, tid) << std::endl; + + prev_tid = tid; + chrom_itr = chroms_beg + chrom_idx->second; + + // reset the counts + counts.clear(); + counts.resize(chrom_sizes[chrom_idx->second]); + } + } + if (!counts.empty()) + write_output(hdr, out, prev_tid, *chrom_itr, counts); + + // ADS: if some chroms might not be covered by reads, we have to + // iterate over what remains + if (!require_covered) + for (auto i = prev_tid + 1; i < hdr.h->n_targets; ++i) + output_skipped_chromosome(i, tid_to_idx, hdr, chroms_beg, chrom_sizes, + counts, out); + return mc; + } +}; + +int +main_nanocount(int argc, const char **argv) { + + try { + + read_processor rp; + + std::string chroms_file; + std::string outfile; + std::string stats_file; + + /****************** COMMAND LINE OPTIONS ********************/ + OptionParser opt_parse(strip_path(argv[0]), + "get methylation levels from mapped nanopore reads", + "-c "); + opt_parse.add_opt("threads", 't', "threads to use (few needed)", false, + rp.n_threads); + opt_parse.add_opt("output", 'o', "output file name (default: stdout)", + false, outfile); + opt_parse.add_opt("chrom", 'c', "reference genome file (FASTA format)", + true, chroms_file); + opt_parse.add_opt("cpg-only", 'n', "print only CpG context cytosines", + false, rp.cpg_only); + opt_parse.add_opt("require-covered", 'r', "only output covered sites", + false, rp.require_covered); + opt_parse.add_opt("strand", '\0', + "use strand (1=positive, 2=negative, 0=default)", false, + rp.strand); + opt_parse.add_opt("stats", 's', "output match/mismatch stats to this file", + false, stats_file); + opt_parse.add_opt("force", '\0', "skip consistency checks", false, + rp.force); + opt_parse.add_opt("header", 'H', "add a header to identify the reference", + false, rp.include_header); + opt_parse.add_opt("zip", 'z', "output gzip format", false, + rp.compress_output); + opt_parse.add_opt("progress", '\0', "show progress", false, + rp.show_progress); + opt_parse.add_opt("verbose", 'v', "print more run info", false, rp.verbose); + std::vector leftover_args; + opt_parse.parse(argc, argv, leftover_args); + if (opt_parse.about_requested() || opt_parse.help_requested() || + leftover_args.empty()) { + std::cerr << opt_parse.help_message() << std::endl + << opt_parse.about_message() << std::endl; + return EXIT_SUCCESS; + } + if (opt_parse.option_missing()) { + std::cerr << opt_parse.option_missing_message() << std::endl; + return EXIT_SUCCESS; + } + const std::string mapped_reads_file = leftover_args.front(); + /****************** END COMMAND LINE OPTIONS *****************/ + + if (rp.force) + rp.expected_basecall_model.clear(); + + if (rp.n_threads <= 0) + throw std::runtime_error("thread count cannot be negative"); + + std::ostringstream cmd; + copy(argv, argv + argc, std::ostream_iterator(cmd, " ")); + + // file types from HTSlib use "-" for the filename to go to stdout + if (outfile.empty()) + outfile = "-"; + + if (rp.verbose) + std::cerr << "[input BAM/SAM file: " << mapped_reads_file << "]\n" + << "[output file: " << outfile << "]\n" + << "[genome file: " << chroms_file << "]\n" + << rp.tostring(); + + const auto mc = rp(mapped_reads_file, outfile, chroms_file); + if (!stats_file.empty()) { + std::ofstream stats_out(stats_file); + if (!stats_out) + throw std::runtime_error("Error opening stats file: " + stats_file); + stats_out << mc.json() << std::endl; + } + } + catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} diff --git a/src/common/MSite.cpp b/src/common/MSite.cpp index 27bd1a22..a9275929 100644 --- a/src/common/MSite.cpp +++ b/src/common/MSite.cpp @@ -15,26 +15,27 @@ #include "MSite.hpp" -#include -#include +#include #include +#include +#include #include #include -#include -#include +#include -#include "smithlab_utils.hpp" #include "counts_header.hpp" +#include "smithlab_utils.hpp" -using std::string; -using std::runtime_error; -using std::regex_match; -using std::from_chars; -using std::find_if; using std::cbegin; using std::cend; using std::end; +using std::find_if; +using std::from_chars; +using std::regex_match; +using std::runtime_error; +using std::string; +bool MSite::no_extra_fields = true; bool MSite::initialize(const char *c, const char *c_end) { @@ -45,7 +46,8 @@ MSite::initialize(const char *c, const char *c_end) { auto field_s = c; auto field_e = find_if(field_s + 1, c_end, is_sep); - if (field_e == c_end) failed = true; + if (field_e == c_end) + failed = true; { const uint32_t d = std::distance(field_s, field_e); @@ -95,7 +97,7 @@ MSite::initialize(const char *c, const char *c_end) { { const auto [ptr, ec] = from_chars(field_s, c_end, n_reads); - failed = failed || (ptr != c_end); + failed = failed || (no_extra_fields && ptr != c_end); } // ADS: the value below would work for a flag // pos = std::numeric_limits::max(); @@ -103,36 +105,28 @@ MSite::initialize(const char *c, const char *c_end) { return !failed; } - MSite::MSite(const string &line) { if (!initialize(line.data(), line.data() + size(line))) throw runtime_error("bad count line: " + line); } - MSite::MSite(const char *line, const int n) { if (!initialize(line, line + n)) throw runtime_error("bad count line: " + string(line)); } - string MSite::tostring() const { std::ostringstream oss; - oss << chrom << '\t' - << pos << '\t' - << strand << '\t' - << context << '\t' - << meth << '\t' - << n_reads; + oss << chrom << '\t' << pos << '\t' << strand << '\t' << context << '\t' + << meth << '\t' << n_reads; return oss.str(); } size_t distance(const MSite &a, const MSite &b) { - return a.chrom == b.chrom ? - std::max(a.pos, b.pos) - std::min(a.pos, b.pos) : - std::numeric_limits::max(); + return a.chrom == b.chrom ? std::max(a.pos, b.pos) - std::min(a.pos, b.pos) + : std::numeric_limits::max(); } using std::ifstream; @@ -152,10 +146,8 @@ move_to_start_of_line(ifstream &in) { in.clear(); } - void -find_offset_for_msite(const std::string &chr, - const size_t idx, +find_offset_for_msite(const std::string &chr, const size_t idx, std::ifstream &site_in) { site_in.seekg(0, ios_base::beg); @@ -166,7 +158,7 @@ find_offset_for_msite(const std::string &chr, if (end_pos - begin_pos < 2) throw runtime_error("empty counts file"); - size_t step_size = (end_pos - begin_pos)/2; + size_t step_size = (end_pos - begin_pos) / 2; site_in.seekg(0, ios_base::beg); string low_chr; @@ -185,7 +177,7 @@ find_offset_for_msite(const std::string &chr, size_t pos = step_size; site_in.seekg(pos, ios_base::beg); move_to_start_of_line(site_in); - size_t prev_pos = 0; // keep track of previous position in file + size_t prev_pos = 0; // keep track of previous position in file // binary search inside sorted file on disk // iterate until step size is 0 or positions are identical @@ -194,8 +186,8 @@ find_offset_for_msite(const std::string &chr, // track (mid) position in file to make sure it keeps moving prev_pos = pos; - string mid_chr; // chromosome name at mid point - size_t mid_idx = 0; // position at mid point + string mid_chr; // chromosome name at mid point + size_t mid_idx = 0; // position at mid point if (!(site_in >> mid_chr >> mid_idx)) throw runtime_error("failed navigating inside file"); @@ -203,10 +195,10 @@ find_offset_for_msite(const std::string &chr, // might catch an unsorted file if (mid_chr < low_chr || mid_chr > high_chr) throw runtime_error("chromosomes unsorted inside file: " - "low=" + low_chr + ",mid=" + mid_chr + - ",high=" + high_chr); + "low=" + + low_chr + ",mid=" + mid_chr + ",high=" + high_chr); - step_size /= 2; // cut the range in half + step_size /= 2; // cut the range in half if (chr < mid_chr || (chr == mid_chr && idx <= mid_idx)) { // move to the left @@ -229,8 +221,7 @@ find_offset_for_msite(const std::string &chr, using std::unordered_map; void find_offset_for_msite(const unordered_map &chrom_order, - const std::string &chr, - const size_t idx, + const std::string &chr, const size_t idx, std::ifstream &site_in) { site_in.seekg(0, ios_base::beg); @@ -241,7 +232,7 @@ find_offset_for_msite(const unordered_map &chrom_order, if (end_pos - begin_pos < 2) throw runtime_error("empty counts file"); - size_t step_size = (end_pos - begin_pos)/2; + size_t step_size = (end_pos - begin_pos) / 2; const auto chr_idx_itr = chrom_order.find(chr); if (chr_idx_itr == end(chrom_order)) { @@ -277,7 +268,7 @@ find_offset_for_msite(const unordered_map &chrom_order, size_t pos = step_size; site_in.seekg(pos, ios_base::beg); move_to_start_of_line(site_in); - size_t prev_pos = 0; // keep track of previous position in file + size_t prev_pos = 0; // keep track of previous position in file // binary search inside sorted file on disk // iterate until step size is 0 or positions are identical @@ -286,8 +277,8 @@ find_offset_for_msite(const unordered_map &chrom_order, // track (mid) position in file to make sure it keeps moving prev_pos = pos; - string mid_chr; // chromosome name at mid point - size_t mid_idx{}; // position at mid point + string mid_chr; // chromosome name at mid point + size_t mid_idx{}; // position at mid point if (!(site_in >> mid_chr >> mid_idx)) throw runtime_error("failed navigating inside file"); @@ -301,10 +292,12 @@ find_offset_for_msite(const unordered_map &chrom_order, using std::to_string; if (mid_chr_idx < low_chr_idx || mid_chr_idx > high_chr_idx) throw runtime_error("chromosomes unsorted inside file: " - "low=" + to_string(low_chr_idx) + ",mid=" + to_string(mid_chr_idx) + + "low=" + + to_string(low_chr_idx) + + ",mid=" + to_string(mid_chr_idx) + ",high=" + to_string(high_chr_idx)); - step_size /= 2; // cut the range in half + step_size /= 2; // cut the range in half if (chr_idx < mid_chr_idx || (chr_idx == mid_chr_idx && idx <= mid_idx)) { // move to the left @@ -328,10 +321,12 @@ is_msite_line(const string &line) { std::istringstream iss(line); string chrom; - if (!(iss >> chrom)) return false; + if (!(iss >> chrom)) + return false; int64_t pos = 0; - if (!(iss >> pos || pos < 0)) return false; + if (!(iss >> pos || pos < 0)) + return false; string strand; if (!(iss >> strand) || (strand.size() != 1) || @@ -341,13 +336,16 @@ is_msite_line(const string &line) { string context; // ADS: below, allowing any context // std::regex pattern("^C[pHWX][GH]$"); - if (!(iss >> context)) return false; + if (!(iss >> context)) + return false; double level = 0.0; - if (!(iss >> level) || level < 0.0 || level > 1.0) return false; + if (!(iss >> level) || level < 0.0 || level > 1.0) + return false; int64_t n_reads = 0; - if (!(iss >> n_reads || n_reads < 0)) return false; + if (!(iss >> n_reads || n_reads < 0)) + return false; string temp; if (iss >> temp) @@ -359,7 +357,8 @@ is_msite_line(const string &line) { bool is_msite_file(const string &file) { bamxx::bgzf_file in(file, "r"); - if (!in) throw runtime_error("cannot open file: " + file); + if (!in) + throw runtime_error("cannot open file: " + file); string line; while (getline(in, line) && is_counts_header_line(line)) diff --git a/src/common/MSite.hpp b/src/common/MSite.hpp index 355aad72..a74fc36a 100644 --- a/src/common/MSite.hpp +++ b/src/common/MSite.hpp @@ -18,12 +18,20 @@ #ifndef MSITE_HPP #define MSITE_HPP -#include +#include + #include #include +#include +#include #include struct MSite { + + // This defaults to true, and when true MSite lines cannot be parsed if they + // have additional columns in the file. + static bool no_extra_fields; + std::string chrom{}; size_t pos{}; char strand{}; @@ -32,34 +40,37 @@ struct MSite { size_t n_reads{}; MSite() = default; - MSite(const std::string &chrom, - const size_t pos, - const char strand, - const std::string &context, - const double meth, - const size_t n_reads) : - chrom{chrom}, pos{pos}, strand{strand}, - context{context}, meth{meth}, n_reads{n_reads} {} + MSite(const std::string &chrom, const size_t pos, const char strand, + const std::string &context, const double meth, const size_t n_reads) : + chrom{chrom}, pos{pos}, strand{strand}, context{context}, meth{meth}, + n_reads{n_reads} {} explicit MSite(const std::string &line); explicit MSite(const char *line, const int n); - bool initialize(const char *c, const char *c_end); + bool + initialize(const char *c, const char *c_end); - bool operator<(const MSite &other) const { + bool + operator<(const MSite &other) const { int r = chrom.compare(other.chrom); - return (r < 0 || - (r == 0 && - (pos < other.pos || - (pos == other.pos && strand < other.strand)))); + return (r < 0 || (r == 0 && (pos < other.pos || + (pos == other.pos && strand < other.strand)))); } - size_t n_meth() const {return std::round(meth*n_reads);} - size_t n_unmeth() const {return n_reads - n_meth();} + size_t + n_meth() const { + return std::round(meth * n_reads); + } + size_t + n_unmeth() const { + return n_reads - n_meth(); + } ////////////////////////////////////////////////////////////// /// FUNCTIONS BELOW ARE FOR MANIPULATING SYMMETRIC CPG SITES ////////////////////////////////////////////////////////////// - void add(const MSite &other) { + void + add(const MSite &other) { // ADS: possible that this function has specific behavior that // should be placed in the 'sym' source, since we might not want // this line below to operate generally. @@ -67,14 +78,15 @@ struct MSite { context += 'x'; // ADS: order matters below as n_reads update invalidates n_meth() // function until meth has been updated - const size_t total_c_reads = n_meth() + other.n_meth(); + const double total_c_reads = (meth * n_reads + other.meth * other.n_reads); n_reads += other.n_reads; - meth = static_cast(total_c_reads)/std::max(1ul, n_reads); + meth = total_c_reads / std::max(1ul, n_reads); } // ADS: function below has redundant check for is_cpg, which is // expensive and might be ok to remove - bool is_mate_of(const MSite &first) { + bool + is_mate_of(const MSite &first) { return (first.pos + 1 == pos && first.is_cpg() && is_cpg() && first.strand == '+' && strand == '-'); } @@ -85,39 +97,48 @@ struct MSite { ///// CpG within. Also included is a function that tests if a site ///// has a mutation. //////////////////////////////////////////////////////////////////////// - bool is_cpg() const { + bool + is_cpg() const { return context.length() >= 3 && - (context[0] == 'C' && context[1] == 'p' && context[2] == 'G'); + (context[0] == 'C' && context[1] == 'p' && context[2] == 'G'); } - bool is_chh() const { + bool + is_chh() const { return context.length() >= 3 && - (context[0] == 'C' && context[1] == 'H' && context[2] == 'H'); + (context[0] == 'C' && context[1] == 'H' && context[2] == 'H'); } - bool is_ccg() const { + bool + is_ccg() const { return context.length() >= 3 && - (context[0] == 'C' && context[1] == 'C' && context[2] == 'G'); + (context[0] == 'C' && context[1] == 'C' && context[2] == 'G'); } - bool is_cxg() const { + bool + is_cxg() const { return context.length() >= 3 && - (context[0] == 'C' && context[1] == 'X' && context[2] == 'G'); + (context[0] == 'C' && context[1] == 'X' && context[2] == 'G'); } - bool is_mutated() const { + bool + is_mutated() const { return context.length() == 4 && context[3] == 'x'; } - void set_mutated() { + void + set_mutated() { if (!is_mutated()) context += 'x'; } - void set_unmutated() { + void + set_unmutated() { if (is_mutated()) context.resize(context.length() - 1); } - std::string tostring() const; + std::string + tostring() const; }; -template T & +template +T & operator>>(T &in, MSite &s) { std::string line; if (getline(in, line)) @@ -125,9 +146,10 @@ operator>>(T &in, MSite &s) { return in; } -template T & +template +T & operator<<(T &out, const MSite &s) { - out << s.tostring(); // seems to be an issue returning this directly + out << s.tostring(); // seems to be an issue returning this directly return out; } @@ -138,15 +160,13 @@ distance(const MSite &a, const MSite &b); // file (for the specified stream) that does not precede the specified // position given by the (chrom, start_pos) pair. void -find_offset_for_msite(const std::string &chrom, - const size_t start_pos, +find_offset_for_msite(const std::string &chrom, const size_t start_pos, std::ifstream &site_in); void -find_offset_for_msite(const std::unordered_map &chrom_order, - const std::string &chr, - const size_t idx, - std::ifstream &site_in); +find_offset_for_msite( + const std::unordered_map &chrom_order, + const std::string &chr, const size_t idx, std::ifstream &site_in); // ADS: using the functions below for now for static polymorphism // because I don't want to have insertion and extraction operators @@ -154,9 +174,6 @@ find_offset_for_msite(const std::unordered_map &chrom_ord // if we are going to use them, we need to be aware of which functions // we are calling, and not just do it unconsciously. -#include -#include - inline bamxx::bgzf_file & write_site(bamxx::bgzf_file &f, const MSite &s) { // ADS: to slow?? diff --git a/src/common/bam_record_utils.cpp b/src/common/bam_record_utils.cpp index ec92f811..69f041cd 100644 --- a/src/common/bam_record_utils.cpp +++ b/src/common/bam_record_utils.cpp @@ -980,7 +980,8 @@ to_string(const bam_header &hdr, const bam_rec &aln) { if (ret < 0) { throw runtime_error("Can't format record: " + to_string(hdr, aln)); } + const std::string s = string(ks.s); if (ks.s != nullptr) free(ks.s); - return string(ks.s); + return s; } diff --git a/src/dnmtools.cpp b/src/dnmtools.cpp index 5ba90743..ff5881d7 100644 --- a/src/dnmtools.cpp +++ b/src/dnmtools.cpp @@ -24,8 +24,8 @@ #include using std::begin; -using std::end; using std::cout; +using std::end; using std::endl; using std::pair; using std::string; @@ -39,7 +39,8 @@ struct dnmtools_command { string description; std::function fun; - auto operator()(const int argc, const char **argv) const -> int { + auto + operator()(const int argc, const char **argv) const -> int { return fun(argc - 1, argv + 1); } }; @@ -64,6 +65,8 @@ simreads(int argc, const char **argv); int main_counts(int argc, const char **argv); int +main_nanocount(int argc, const char **argv); +int main_allelicmeth(int argc, const char **argv); int main_amrfinder(int argc, const char **argv); @@ -145,7 +148,8 @@ print_help( << "Commands:" << endl; for (auto &&g : command_groups) { cout << " " << g.first << ":" << endl; - for (auto &&c : g.second) cout << c << endl; + for (auto &&c : g.second) + cout << c << endl; cout << endl; } } @@ -154,7 +158,7 @@ int main(int argc, const char **argv) { try { vector>> command_groups = { -// clang-format off + // clang-format off {{"mapping", {{{"abismal", "map FASTQ reads to a FASTA reference genome or an index", abismal}, {"abismalidx", "convert a FASTA reference genome to an abismal index", abismalidx}, @@ -165,6 +169,7 @@ main(int argc, const char **argv) { {"uniq", "remove duplicate reads from sorted mapped reads", main_uniq}, {"bsrate", "compute the BS conversion rate from BS-seq reads mapped to a genome", main_bsrate}, {"counts", "get methylation levels from mapped WGBS reads", main_counts}, + {"counts-nano", "get methylation levels from mapped nanopore reads", main_nanocount}, {"sym", "get CpG sites and make methylation levels symmetric", main_symmetric_cpgs}, {"levels", "compute methylation summary statistics from a counts file", main_levels}}}}, @@ -208,7 +213,7 @@ main(int argc, const char **argv) { {"unxcounts", "reverse the xcounts process yielding a counts file", main_unxcounts}, {"selectsites", "sites inside a set of genomic intervals", main_selectsites}, {"kmersites", "make track file for sites matching kmer", kmersites}}}}}}; -// clang-format on + // clang-format on if (argc < 2) { print_help(command_groups); @@ -221,11 +226,11 @@ main(int argc, const char **argv) { for (auto &g : command_groups) { const auto the_cmd = find_if(begin(g.second), end(g.second), has_tag); - if (the_cmd != end(g.second)) return (*the_cmd)(argc, argv); + if (the_cmd != end(g.second)) + return (*the_cmd)(argc, argv); } std::cerr << "ERROR: invalid command " << argv[1] << std::endl; - } catch (const std::exception &e) { std::cerr << "ERROR:\t" << e.what() << endl; diff --git a/src/utils/symmetric-cpgs.cpp b/src/utils/symmetric-cpgs.cpp index b1d0cd9f..5b7b9620 100644 --- a/src/utils/symmetric-cpgs.cpp +++ b/src/utils/symmetric-cpgs.cpp @@ -1,8 +1,7 @@ /* symmetric-cpgs: extract the CpG sites from a methcounts output * file and produce a new one with the CpGs treated unstranded. * - * Copyright (C) 2014 University of Southern California and - * Andrew D. Smith + * Copyright (C) 2014-2025 Andrew D. Smith * * Authors: Andrew D. Smith * @@ -56,7 +55,8 @@ ensure_positive_cpg(MSite &s) { s.strand = '+'; } -template static std::tuple +template +static std::tuple get_first_site(T &in, T &out) { bool prev_is_cpg = false; MSite prev_site; @@ -68,14 +68,16 @@ get_first_site(T &in, T &out) { } else { prev_site.initialize(line.data(), line.data() + size(line)); - if (prev_site.is_cpg()) prev_is_cpg = true; + if (prev_site.is_cpg()) + prev_is_cpg = true; within_header = false; } } return {prev_site, prev_is_cpg}; } -template static bool +template +static bool process_sites(const bool verbose, T &in, T &out) { // get the first site while dealing with the header @@ -90,7 +92,8 @@ process_sites(const bool verbose, T &in, T &out) { while (read_site(in, curr_site)) { const bool same_chrom = prev_site.chrom == curr_site.chrom; if (same_chrom) { - if (curr_site.pos <= prev_site.pos) return false; + if (curr_site.pos <= prev_site.pos) + return false; if (prev_is_cpg) { if (curr_site.is_mate_of(prev_site)) curr_site.add(prev_site); @@ -103,7 +106,8 @@ process_sites(const bool verbose, T &in, T &out) { else { if (verbose) cerr << "processing: " << curr_site.chrom << endl; - if (chroms_seen.find(curr_site.chrom) != cend(chroms_seen)) return false; + if (chroms_seen.find(curr_site.chrom) != cend(chroms_seen)) + return false; chroms_seen.insert(curr_site.chrom); if (prev_is_cpg) { @@ -129,6 +133,7 @@ main_symmetric_cpgs(int argc, const char **argv) { string outfile{"-"}; bool verbose = false; bool compress_output = false; + bool allow_extra_fields = false; int32_t n_threads = 1; const string description = @@ -141,6 +146,8 @@ main_symmetric_cpgs(int argc, const char **argv) { outfile); opt_parse.add_opt("threads", 't', "number of threads", false, n_threads); opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output); + opt_parse.add_opt("relaxed", '\0', "allow extra fields in input", false, + allow_extra_fields); opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); @@ -164,20 +171,26 @@ main_symmetric_cpgs(int argc, const char **argv) { const string filename(leftover_args.front()); /****************** END COMMAND LINE OPTIONS *****************/ - if (n_threads <= 0) throw runtime_error("threads must be positive"); + MSite::no_extra_fields = (allow_extra_fields == false); + + if (n_threads <= 0) + throw runtime_error("threads must be positive"); bamxx::bam_tpool tp(n_threads); // const bool show_progress = VERBOSE && isatty(fileno(stderr)); bgzf_file in(filename, "r"); - if (!in) throw runtime_error("could not open file: " + filename); + if (!in) + throw runtime_error("could not open file: " + filename); // open the output file const string output_mode = compress_output ? "w" : "wu"; bamxx::bgzf_file out(outfile, output_mode); - if (!out) throw runtime_error("error opening output file: " + outfile); + if (!out) + throw runtime_error("error opening output file: " + outfile); if (n_threads > 1) { - if (in.is_bgzf()) tp.set_io(in); + if (in.is_bgzf()) + tp.set_io(in); tp.set_io(out); } @@ -187,7 +200,8 @@ main_symmetric_cpgs(int argc, const char **argv) { cerr << "sites are not sorted in: " << filename << endl; namespace fs = std::filesystem; const fs::path outpath{outfile}; - if (fs::exists(outpath)) fs::remove(outpath); + if (fs::exists(outpath)) + fs::remove(outpath); return EXIT_FAILURE; } }