From f24f3014fee1a5c57751f6a03a28ed4a0f93f1e6 Mon Sep 17 00:00:00 2001 From: almac2022 Date: Thu, 28 May 2026 22:03:28 -0700 Subject: [PATCH 1/3] audit_configs.R: detect fresh->link parameters_fresh column drift Add section 3b comparing fresh's canonical parameters_fresh.csv column set against each link bundle's copy. Flags columns fresh added that link is missing (link's copy may no longer load through frs_habitat); treats observation_* as expected link-only extensions. Report-only, values ignored. Closes the parameters_fresh half of the fresh/link config drift gap (rules.yaml half tracked in #129). Co-Authored-By: Claude Opus 4.7 --- data-raw/audit_configs.R | 47 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/data-raw/audit_configs.R b/data-raw/audit_configs.R index 853eff05..125c7529 100644 --- a/data-raw/audit_configs.R +++ b/data-raw/audit_configs.R @@ -141,6 +141,53 @@ for (b in bundles) { } } +# --------------------------------------------------------------------------- +# 3b. parameters_fresh column drift (fresh canonical vs link config) +# --------------------------------------------------------------------------- +# fresh owns the access/cluster parameter SCHEMA; link hand-authors per-bundle +# copies seeded from it plus link-only `observation_*` extensions. Values +# legitimately diverge (link tunes them) — only the COLUMN SET matters here. +# A column fresh added that link is missing means link's copy may no longer +# load cleanly through frs_habitat(); flag it. See link#129 for the +# directionality (link->fresh for rules.yaml; fresh->link col-schema for this). +cat("\n--- 3b. parameters_fresh column drift (fresh canonical vs link) ---\n") +pf_fresh_path <- system.file("extdata", "parameters_fresh.csv", package = "fresh") +if (!nzchar(pf_fresh_path)) { + cat(" (fresh's bundled parameters_fresh.csv not found — is fresh installed?)\n") +} else { + cols_fresh <- names(utils::read.csv(pf_fresh_path, stringsAsFactors = FALSE, + check.names = FALSE, nrows = 1)) + cat(sprintf(" fresh canonical (%d cols): %s\n", + length(cols_fresh), paste(cols_fresh, collapse = ", "))) + for (b in bundles) { + pf_csv <- sprintf("inst/extdata/configs/%s/parameters_fresh.csv", b) + cols_link <- names(utils::read.csv(pf_csv, stringsAsFactors = FALSE, + check.names = FALSE, nrows = 1)) + + in_fresh_not_link <- setdiff(cols_fresh, cols_link) + extra_link <- setdiff(cols_link, cols_fresh) + link_extensions <- extra_link[grepl("^observation_", extra_link)] + unexpected_link <- setdiff(extra_link, link_extensions) + + cat(sprintf("\n bundle: %s\n", b)) + if (length(link_extensions) > 0) { + cat(sprintf(" link-only extensions (expected): %s\n", + paste(link_extensions, collapse = ", "))) + } + if (length(in_fresh_not_link) > 0) { + cat(sprintf(" !! fresh ∖ link (link missing engine param): %s\n", + paste(in_fresh_not_link, collapse = ", "))) + } + if (length(unexpected_link) > 0) { + cat(sprintf(" !! link ∖ fresh (unexpected non-observation col): %s\n", + paste(unexpected_link, collapse = ", "))) + } + if (length(in_fresh_not_link) == 0 && length(unexpected_link) == 0) { + cat(" column set aligned (link = fresh + observation_* extensions)\n") + } + } +} + # --------------------------------------------------------------------------- # 4. Undeclared override CSVs (files on disk but not in cfg$files) # --------------------------------------------------------------------------- From 106dffee4998bb9fc3fa080458aaeda8a93f2e20 Mon Sep 17 00:00:00 2001 From: almac2022 Date: Thu, 28 May 2026 22:35:28 -0700 Subject: [PATCH 2/3] audit_configs.R: aggregate findings into end-of-run rollup + non-zero exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Route every `!!`-level finding (sections 1-5) through a flag() accumulator, then print a single rollup at the end and quit(status=1) when any fired (guarded by !interactive()). Previously findings printed inline across six sections with no summary and the script always exited 0, so any flag could scroll past unnoticed and the trifecta could not gate on a clean audit. Uniform across all checks — no single check is special-cased louder. Co-Authored-By: Claude Opus 4.7 --- data-raw/audit_configs.R | 50 ++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/data-raw/audit_configs.R b/data-raw/audit_configs.R index 125c7529..5cc8869a 100644 --- a/data-raw/audit_configs.R +++ b/data-raw/audit_configs.R @@ -16,6 +16,14 @@ devtools::load_all(quiet = TRUE) bundles <- c("bcfishpass", "default") +# Findings accumulator — every `!!`-level finding routes through flag() so the +# end-of-run rollup (and non-zero exit) catches anything that scrolled past. +FINDINGS <- character(0) +flag <- function(section, msg) { + FINDINGS[[length(FINDINGS) + 1L]] <<- sprintf("[%s] %s", section, msg) + cat(sprintf(" !! %s\n", msg)) +} + cat("\n==== CONFIG AUDIT (link", as.character(packageVersion("link")), ") ====\n") # --------------------------------------------------------------------------- @@ -72,6 +80,11 @@ cat(sprintf(" %d drifted entries across %d bundles\n", nrow(drifted), length(bundles))) if (nrow(drifted) > 0) { print(drifted |> dplyr::select(bundle, file, byte_drift, shape_drift), n = Inf) + for (i in seq_len(nrow(drifted))) { + kinds <- c(if (drifted$byte_drift[i]) "byte", if (drifted$shape_drift[i]) "shape") + flag(sprintf("1 %s", drifted$bundle[i]), + sprintf("%s drift: %s", paste(kinds, collapse = "+"), drifted$file[i])) + } } # --------------------------------------------------------------------------- @@ -89,7 +102,8 @@ for (b in bundles) { cat(sprintf(" %-12s identical(structure)=%s identical(bytes)=%s\n", b, identical_yaml, identical_text)) if (!identical_yaml) { - cat(" !! regen differs from committed — committed rules.yaml is stale\n") + flag(sprintf("2 %s", b), + "rules.yaml regen differs from committed — committed copy is stale") } else if (!identical_text) { cat(" (text differs but yaml structure matches — likely formatting only)\n") } @@ -136,7 +150,8 @@ for (b in bundles) { ) for (name in names(flags)) { if (length(flags[[name]]) > 0) { - cat(sprintf(" !! %s: %s\n", name, paste(flags[[name]], collapse = ", "))) + flag(sprintf("3 %s", b), + sprintf("%s: %s", name, paste(flags[[name]], collapse = ", "))) } } } @@ -175,12 +190,14 @@ if (!nzchar(pf_fresh_path)) { paste(link_extensions, collapse = ", "))) } if (length(in_fresh_not_link) > 0) { - cat(sprintf(" !! fresh ∖ link (link missing engine param): %s\n", - paste(in_fresh_not_link, collapse = ", "))) + flag(sprintf("3b %s", b), + sprintf("fresh ∖ link (link missing engine param): %s", + paste(in_fresh_not_link, collapse = ", "))) } if (length(unexpected_link) > 0) { - cat(sprintf(" !! link ∖ fresh (unexpected non-observation col): %s\n", - paste(unexpected_link, collapse = ", "))) + flag(sprintf("3b %s", b), + sprintf("link ∖ fresh (unexpected non-observation col): %s", + paste(unexpected_link, collapse = ", "))) } if (length(in_fresh_not_link) == 0 && length(unexpected_link) == 0) { cat(" column set aligned (link = fresh + observation_* extensions)\n") @@ -212,8 +229,8 @@ for (b in bundles) { cat(" on-disk-but-not-declared: (none)\n") } if (length(missing) > 0) { - cat(sprintf(" !! declared-but-missing: %s\n", - paste(missing, collapse = ", "))) + flag(sprintf("4 %s", b), + sprintf("declared-but-missing: %s", paste(missing, collapse = ", "))) } else { cat(" declared-but-missing: (none)\n") } @@ -226,11 +243,15 @@ cat("\n--- 5. lnk_load_overrides() smoke per bundle ---\n") for (b in bundles) { cfg <- tryCatch(lnk_config(b), error = function(e) NULL) if (is.null(cfg)) { - cat(sprintf(" %s: lnk_config FAILED\n", b)); next + cat(sprintf(" %s: lnk_config FAILED\n", b)) + FINDINGS[[length(FINDINGS) + 1L]] <- sprintf("[5 %s] lnk_config FAILED", b) + next } loaded <- tryCatch(lnk_load_overrides(cfg), error = function(e) e) if (inherits(loaded, "error")) { cat(sprintf(" %s: lnk_load_overrides FAILED — %s\n", b, conditionMessage(loaded))) + FINDINGS[[length(FINDINGS) + 1L]] <- + sprintf("[5 %s] lnk_load_overrides FAILED — %s", b, conditionMessage(loaded)) } else { cat(sprintf(" %s: %d entries loaded — %s\n", b, length(loaded), paste(names(loaded), collapse = ", "))) @@ -251,4 +272,15 @@ cat(" Note: these were the pre-bundle predecessors. Per CLAUDE.md they map to\n cat(" the default bundle's dimensions.csv. If they have drifted from the\n") cat(" default bundle, they're stale and should be either removed or pinned.\n") +# --------------------------------------------------------------------------- +# Rollup — aggregate every `!!` finding so nothing scrolls past, exit non-zero +# so the trifecta can gate on a clean audit. +# --------------------------------------------------------------------------- cat("\n==== AUDIT COMPLETE ====\n") +if (length(FINDINGS) == 0L) { + cat("\nNo findings — config layers aligned.\n") +} else { + cat(sprintf("\n!! %d finding(s):\n", length(FINDINGS))) + for (f in FINDINGS) cat(sprintf(" %s\n", f)) + if (!interactive()) quit(status = 1L) +} From 04f6494ef86ebe69f8034c3d7c817e8f0912f404 Mon Sep 17 00:00:00 2001 From: almac2022 Date: Thu, 28 May 2026 23:55:55 -0700 Subject: [PATCH 3/3] audit_configs.R: eliminate the 30 false-positive findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All 30 findings the audit emitted were audit-side defects, not config drift (lnk_config_verify reports 0 byte/shape drift in both bundles). Four fixes: - §1: drop the homegrown shape/byte checksum reimplementation and call the canonical lnk_config_verify() instead. The local recipe diverged from .lnk_shape_fingerprint (list(cols,nrow,types) serialize=TRUE vs header-row hash serialize=FALSE), producing a 24/24 spurious shape-drift wall. - §2: regenerate rules.yaml with edge_types="explicit" to match how the committed copy is actually built (build_rules.R + regen_provenance.R). "categories" produced an all-species phantom diff. - §3: split species-axis mismatches into defects (dim∖pf, yaml∖dim — flagged) vs expected asymmetries (pf∖dim, pf∖wsg, dim∖yaml — printed only). The expected directions are by-design supersets, not drift. - §4: resolve declared paths against bundle_dir and compare full bundle-relative paths instead of basenames. Basename-matching only the overrides/ dir mis-flagged root-level parameters_fresh.csv as missing. Audit now reports "No findings — config layers aligned." and exits 0. Co-Authored-By: Claude Opus 4.7 --- data-raw/audit_configs.R | 117 +++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 66 deletions(-) diff --git a/data-raw/audit_configs.R b/data-raw/audit_configs.R index 5cc8869a..9e2a706d 100644 --- a/data-raw/audit_configs.R +++ b/data-raw/audit_configs.R @@ -31,49 +31,15 @@ cat("\n==== CONFIG AUDIT (link", as.character(packageVersion("link")), ") ====\n # --------------------------------------------------------------------------- cat("\n--- 1. Provenance drift (config.yaml `provenance:` vs current files) ---\n") -shape_checksum <- function(path) { - if (!file.exists(path)) return(NA_character_) - ext <- tools::file_ext(path) - obj <- if (ext == "csv") { - df <- utils::read.csv(path, stringsAsFactors = FALSE, check.names = FALSE) - list(cols = names(df), nrow = nrow(df), col_types = vapply(df, class, character(1))) - } else if (ext == "yaml") { - y <- yaml::read_yaml(path) - list(top_keys = names(y)) - } else { - list(size = file.size(path)) - } - paste0("sha256:", digest::digest(obj, algo = "sha256", serialize = TRUE)) -} - -byte_checksum <- function(path) { - if (!file.exists(path)) return(NA_character_) - paste0("sha256:", digest::digest(file = path, algo = "sha256")) -} - -drift_report <- list() -for (b in bundles) { - cfg_path <- sprintf("inst/extdata/configs/%s/config.yaml", b) - cfg <- yaml::read_yaml(cfg_path) - bundle_dir <- dirname(cfg_path) - rows <- list() - for (key in names(cfg$provenance)) { - p <- file.path(bundle_dir, key) - rec <- cfg$provenance[[key]] - cur_byte <- byte_checksum(p) - cur_shape <- shape_checksum(p) - rows[[length(rows) + 1]] <- tibble( - bundle = b, file = key, - byte_drift = !is.na(rec$checksum) && !is.na(cur_byte) && - rec$checksum != cur_byte, - shape_drift = !is.na(rec$shape_checksum) && !is.na(cur_shape) && - rec$shape_checksum != cur_shape, - cur_byte = cur_byte, rec_byte = rec$checksum, - cur_shape = cur_shape, rec_shape = rec$shape_checksum - ) - } - drift_report[[b]] <- bind_rows(rows) -} +# Use the canonical verifier (R/lnk_config_verify.R) — same byte + +# .lnk_shape_fingerprint recipe that wrote the provenance baselines (and that +# regen_provenance.R refreshes). Reinventing the recipe here is what caused the +# earlier 24/24 false-positive shape-drift wall. +drift_report <- lapply(bundles, function(b) { + v <- lnk_config_verify(lnk_config(b)) + v$bundle <- b + v +}) all_drift <- bind_rows(drift_report) drifted <- all_drift |> dplyr::filter(byte_drift | shape_drift) cat(sprintf(" %d drifted entries across %d bundles\n", @@ -94,8 +60,11 @@ cat("\n--- 2. rules.yaml regen vs committed ---\n") for (b in bundles) { dim_csv <- sprintf("inst/extdata/configs/%s/dimensions.csv", b) rules_committed <- sprintf("inst/extdata/configs/%s/rules.yaml", b) + # edge_types = "explicit" to match how the committed rules.yaml is actually + # built (data-raw/build_rules.R + regen_provenance.R). Regenerating with + # "categories" here is what produced the earlier spurious all-species diff. tf <- tempfile(fileext = ".yaml") - lnk_rules_build(dim_csv, tf, edge_types = "categories") + lnk_rules_build(dim_csv, tf, edge_types = "explicit") identical_yaml <- identical(yaml::read_yaml(tf), yaml::read_yaml(rules_committed)) identical_text <- identical(readLines(tf), readLines(rules_committed)) @@ -134,24 +103,35 @@ for (b in bundles) { cat(sprintf(" wsg_species_presence cols:%s\n", paste(sort(wsg_sp), collapse = " "))) cat(sprintf(" rules.yaml top-level: %s\n", paste(sort(yaml_sp), collapse = " "))) - # Mismatches - in_dim_not_pf <- setdiff(dim_sp, pf_sp) - in_pf_not_dim <- setdiff(pf_sp, dim_sp) - in_pf_not_wsg <- setdiff(pf_sp, wsg_sp) - in_dim_not_yaml <- setdiff(dim_sp, yaml_sp) - in_yaml_not_dim <- setdiff(yaml_sp, dim_sp) - - flags <- list( - "dim ∖ pf" = in_dim_not_pf, - "pf ∖ dim" = in_pf_not_dim, - "pf ∖ wsg-cols"= in_pf_not_wsg, - "dim ∖ yaml" = in_dim_not_yaml, - "yaml ∖ dim" = in_yaml_not_dim + # Defect directions — a non-empty set here breaks the pipeline, so flag: + # dim ∖ pf : species has habitat rules but no access/cluster params + # (classify/cluster fails for it) + # yaml ∖ dim : rules.yaml entry with no dimensions source — can't happen + # via lnk_rules_build(), so it means a stale/hand-edited yaml + defects <- list( + "dim ∖ pf" = setdiff(dim_sp, pf_sp), + "yaml ∖ dim" = setdiff(yaml_sp, dim_sp) + ) + # Expected supersets / intentional exclusions — print for visibility but do + # NOT flag (these are by design, not defects): + # pf ∖ dim : params carry species we don't habitat-model (e.g. CT/DV/RB) + # pf ∖ wsg : params species absent from a (subset) presence table + # dim ∖ yaml : dimensions species intentionally dropped at generation + expected <- list( + "pf ∖ dim" = setdiff(pf_sp, dim_sp), + "pf ∖ wsg-cols"= setdiff(pf_sp, wsg_sp), + "dim ∖ yaml" = setdiff(dim_sp, yaml_sp) ) - for (name in names(flags)) { - if (length(flags[[name]]) > 0) { + for (name in names(expected)) { + if (length(expected[[name]]) > 0) { + cat(sprintf(" (expected) %s: %s\n", name, + paste(expected[[name]], collapse = ", "))) + } + } + for (name in names(defects)) { + if (length(defects[[name]]) > 0) { flag(sprintf("3 %s", b), - sprintf("%s: %s", name, paste(flags[[name]], collapse = ", "))) + sprintf("%s: %s", name, paste(defects[[name]], collapse = ", "))) } } } @@ -211,15 +191,20 @@ if (!nzchar(pf_fresh_path)) { cat("\n--- 4. Override files on disk vs declared in config.yaml ---\n") for (b in bundles) { cfg_path <- sprintf("inst/extdata/configs/%s/config.yaml", b) + bundle_dir <- dirname(cfg_path) cfg <- yaml::read_yaml(cfg_path) + # Declared paths are bundle-relative (e.g. "parameters_fresh.csv" at root, + # "overrides/foo.csv"). Resolve each against bundle_dir — basename-matching + # only the overrides/ dir is what mis-flagged root-level parameters_fresh.csv. declared_paths <- vapply(cfg$files, \(x) x$path, character(1)) - declared_files <- basename(declared_paths) - - override_dir <- sprintf("inst/extdata/configs/%s/overrides", b) - on_disk <- list.files(override_dir, pattern = "\\.csv$") + missing <- declared_paths[!file.exists(file.path(bundle_dir, declared_paths))] - undeclared <- setdiff(on_disk, declared_files) - missing <- setdiff(declared_files, on_disk) + # Undeclared: override-dir csvs not in the declared set (compare full + # bundle-relative path so root vs overrides/ never collide on basename). + override_rel <- file.path("overrides", + list.files(file.path(bundle_dir, "overrides"), + pattern = "\\.csv$")) + undeclared <- setdiff(override_rel, declared_paths) cat(sprintf("\n bundle: %s\n", b)) if (length(undeclared) > 0) {