From 6199db028aa3fd6af5031fece529c9ad881a82e4 Mon Sep 17 00:00:00 2001 From: cpunt Date: Wed, 27 May 2026 16:05:10 +0000 Subject: [PATCH 1/6] updater: add MCU update service and staging path --- .gitignore | 1 + main.go | 22 +- services/updater/applier_host.go | 19 + services/updater/applier_tinygo.go | 56 ++ services/updater/boot_id.go | 132 ++++ services/updater/boot_id_host.go | 15 + services/updater/boot_id_test.go | 8 + services/updater/boot_id_tinygo.go | 24 + services/updater/facts.go | 49 ++ services/updater/prestage_host.go | 12 + services/updater/prestage_tinygo.go | 101 +++ services/updater/receiver.go | 190 +++++ services/updater/rpc.go | 92 +++ services/updater/sink_host.go | 44 ++ services/updater/sink_tinygo.go | 116 +++ services/updater/types.go | 151 ++++ services/updater/updater.go | 475 ++++++++++++ services/updater/updater_test.go | 943 +++++++++++++++++++++++ services/updater/verifier.go | 113 +++ services/updater/verifier_passthrough.go | 50 ++ 20 files changed, 2612 insertions(+), 1 deletion(-) create mode 100644 services/updater/applier_host.go create mode 100644 services/updater/applier_tinygo.go create mode 100644 services/updater/boot_id.go create mode 100644 services/updater/boot_id_host.go create mode 100644 services/updater/boot_id_test.go create mode 100644 services/updater/boot_id_tinygo.go create mode 100644 services/updater/facts.go create mode 100644 services/updater/prestage_host.go create mode 100644 services/updater/prestage_tinygo.go create mode 100644 services/updater/receiver.go create mode 100644 services/updater/rpc.go create mode 100644 services/updater/sink_host.go create mode 100644 services/updater/sink_tinygo.go create mode 100644 services/updater/types.go create mode 100644 services/updater/updater.go create mode 100644 services/updater/updater_test.go create mode 100644 services/updater/verifier.go create mode 100644 services/updater/verifier_passthrough.go diff --git a/.gitignore b/.gitignore index 71e0d97..cf8ff87 
100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ build/ +zz_fw_update_e2e_identity.go .vscode/settings.json diff --git a/main.go b/main.go index 74b705d..978f3c8 100644 --- a/main.go +++ b/main.go @@ -7,14 +7,25 @@ import ( "devicecode-go/bus" "devicecode-go/services/hal" "devicecode-go/services/reactor" + "devicecode-go/services/updater" "devicecode-go/types" "devicecode-go/utilities" ) // HAL const halTimeout = 5 * time.Second + var halReadiness = bus.T("hal", "state") +// Firmware identity is set by host build tooling before main runs. The e2e +// harness generates a same-package init file because TinyGo's -X support is +// narrower than the standard Go linker's support. +var ( + FirmwareVersion = "0.0.0-dev" + FirmwareBuild = "local" + FirmwareImageID = "img-dev" +) + // ----------------------------------------------------------------------------- // Main // ----------------------------------------------------------------------------- @@ -47,6 +58,15 @@ func main() { } } + // boot_id (master R3 / fabric-update W6): generate AFTER HAL ready + // and BEFORE the reactor opens fabric. RAM-only — never persisted. + bootID := updater.GenerateBootID() + log.Println("[main] boot_id =", bootID) + + reactor.FirmwareVersion = FirmwareVersion + reactor.FirmwareBuild = FirmwareBuild + reactor.FirmwareImageID = FirmwareImageID + // Reactor r := reactor.NewReactor(b, uiConn) r.Run(ctx) @@ -76,4 +96,4 @@ func waitHALReady(ctx context.Context, c *bus.Connection, d time.Duration) bool } // Global logger instance -var log = utilities.Logger{LineStart: true} \ No newline at end of file +var log = utilities.Logger{LineStart: true} diff --git a/services/updater/applier_host.go b/services/updater/applier_host.go new file mode 100644 index 0000000..1633fd1 --- /dev/null +++ b/services/updater/applier_host.go @@ -0,0 +1,19 @@ +//go:build !tinygo || !rp2350 + +package updater + +// ProductionApplier returns the applier the reactor wires by default. 
+// On host builds (tests, dev environments without a flash slot to +// reboot into) this stays the safe-default RefusingApplier — commit +// returns apply_unavailable. Real reboot wiring lives in +// applier_tinygo.go. +func ProductionApplier() Applier { return RefusingApplier() } + +func scheduleArmReboot(a Applier, d StagedDescriptor, results chan<- applyRebootResult) { + if err := a.ArmReboot(d); err != nil { + select { + case results <- applyRebootResult{desc: d, err: err}: + default: + } + } +} diff --git a/services/updater/applier_tinygo.go b/services/updater/applier_tinygo.go new file mode 100644 index 0000000..8df4b86 --- /dev/null +++ b/services/updater/applier_tinygo.go @@ -0,0 +1,56 @@ +//go:build tinygo && rp2350 + +package updater + +import ( + "errors" + "time" +) + +// abupdateApplier reboots into the slot the abupdateSink staged into. +// CanApply requires that newSlotSink has previously initialised the +// shared updater (i.e. updater/main staging wrote a staged image); without +// that, the inactive slot still holds the previous image and rebooting +// would either roll back or fail at the bootloader. +type abupdateApplier struct{} + +// ProductionApplier returns the abupdate-backed applier. CanApply +// validates that a staging cycle ran; ArmReboot calls +// abupdate.RebootIntoSlot which does not return on success. +func ProductionApplier() Applier { return abupdateApplier{} } + +const postCommitReplyFlushDelay = 750 * time.Millisecond + +func (abupdateApplier) CanApply(d StagedDescriptor) error { + _ = d + if !sharedUpdaterInit { + return errFromRC("apply_unavailable_uninited", 0) + } + return nil +} + +func (abupdateApplier) ArmReboot(d StagedDescriptor) error { + _ = d + if !sharedUpdaterInit { + return errors.New("apply_reboot_failed:apply_unavailable_uninited") + } + // Does not return on success. 
+ rc := sharedUpdater.RebootIntoSlot() + return errors.New("apply_reboot_failed:" + errFromRC("reboot_into_slot", rc).Error()) +} + +func scheduleArmReboot(a Applier, d StagedDescriptor, results chan<- applyRebootResult) { + go func() { + // handleCommit has only replied on the local bus. The fabric + // session still needs a scheduler turn to marshal and write the + // wire reply (and the state=rebooting retain) back to CM5 before + // RebootIntoSlot stops the process. + time.Sleep(postCommitReplyFlushDelay) + if err := a.ArmReboot(d); err != nil { + select { + case results <- applyRebootResult{desc: d, err: err}: + default: + } + } + }() +} diff --git a/services/updater/boot_id.go b/services/updater/boot_id.go new file mode 100644 index 0000000..295ed80 --- /dev/null +++ b/services/updater/boot_id.go @@ -0,0 +1,132 @@ +package updater + +import ( + "encoding/hex" + "runtime" + "sync/atomic" + "time" +) + +// boot_id contract per master plan R3 / docs/firmware-alignment-update.md §W6: +// - Opaque 16-character lower-hex marker that must change on every +// successful boot. +// - Generated from 8 bytes of crypto/rand AFTER HAL init succeeds and +// BEFORE fabric opens, so it's available to the first software-fact +// publish on hello_ack. +// - Held in RAM only. Not persisted to flash. Not added to the +// abupdate metadata block (the regression guard test in +// fabric-update tests checks that abupdate metadata never grows a +// boot_id field). +// +// The fallback path on rand failure is documented inline; this branch +// drops to a process-startup counter rather than panicking, with a +// clear log so the failure-mode test suite (master R3) can assert it. + +var ( + cachedBootID atomic.Pointer[string] + fallbackTick uint64 +) + +// GenerateBootID populates the cached value. Call exactly once during +// boot — main.go invokes it between HAL ready and fabric.Run. Subsequent +// calls return the existing value (idempotent so reactor reinit doesn't +// regenerate). 
+func GenerateBootID() string { + if existing := cachedBootID.Load(); existing != nil { + return *existing + } + id := generate() + if cachedBootID.CompareAndSwap(nil, &id) { + return id + } + // Lost the race; return whatever the winner stored. + return *cachedBootID.Load() +} + +// BootID returns the cached value generated at boot. Returns "" if +// GenerateBootID has not yet been called — which would indicate a +// boot-order bug, since the spec says it must run before fabric opens. +func BootID() string { + if existing := cachedBootID.Load(); existing != nil { + return *existing + } + return "" +} + +func generate() string { + var buf [8]byte + if tryCryptoRand(buf[:]) { + return hex.EncodeToString(buf[:]) + } + // Fallback: triggered only when crypto/rand is unavailable or returns + // all-zero. This is best-effort per-boot jitter, not contract-grade entropy. + // The log line below is the failure-mode signal for tests and diagnostics. + tick := atomic.AddUint64(&fallbackTick, 1) + println("[updater] boot_id fallback engaged tick=", itoaU64(tick)) + + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + + // Best-effort fallback when crypto/rand is unavailable. Mixes: + // - wall-clock time at generation time (UnixNano), which varies + // with HAL init duration across cold boots; + // - runtime.MemStats Alloc / Mallocs / HeapInuse / Frees, which + // vary with allocation timing inside HAL bringup; + // - the per-call counter so multiple GenerateBootID calls within + // one process boot don't collide. + // Followed by a 3-stage xor-shift mix that spreads input bits across + // the output bytes (diffusion is partial, not a full avalanche). + // + // NOT contract-grade: the mix is non-cryptographic and depends on runtime + // jitter rather than a hardware entropy source. If crypto/rand is broken on + // the target, use the RP2350 hardware RNG or a persisted boot counter. 
+ mix := tick + mix ^= uint64(time.Now().UnixNano()) + mix ^= ms.Alloc + mix ^= uint64(ms.Mallocs) + mix ^= uint64(ms.HeapInuse) + mix ^= uint64(ms.Frees) << 32 + mix ^= mix >> 11 + mix ^= mix << 17 + mix ^= mix >> 5 + for i := 7; i >= 0; i-- { + buf[i] = byte(mix & 0xff) + mix >>= 8 + } + return hex.EncodeToString(buf[:]) +} + +// tryCryptoRand is split per build: +// - host (!tinygo) — boot_id_host.go reads from crypto/rand +// - tinygo (RP2350 et al.) — boot_id_tinygo.go skips crypto/rand +// entirely and returns false, so the firmware always falls +// through to the fallback mix in generate() above. TinyGo on RP2350 +// panics with "no rng" inside crypto/rand.Read, and pulling in +// defer/recover to catch it grew the binary by ~110 KB. Until +// TinyGo wires the RP2350 hardware-RNG (rosc) into its +// crypto/rand backend or we route a HAL-supplied RNG into +// services/updater, the safe-by-default path on the firmware +// is to never call crypto/rand. + +func allZero(b []byte) bool { + for _, c := range b { + if c != 0 { + return false + } + } + return true +} + +func itoaU64(v uint64) string { + if v == 0 { + return "0" + } + var buf [20]byte + pos := len(buf) + for v > 0 { + pos-- + buf[pos] = byte('0' + v%10) + v /= 10 + } + return string(buf[pos:]) +} diff --git a/services/updater/boot_id_host.go b/services/updater/boot_id_host.go new file mode 100644 index 0000000..11f6c53 --- /dev/null +++ b/services/updater/boot_id_host.go @@ -0,0 +1,15 @@ +//go:build !tinygo + +package updater + +import "crypto/rand" + +// tryCryptoRand on host builds reads 8 bytes from crypto/rand. Tests +// assert randomness across simulated "boots" via this path. 
+func tryCryptoRand(buf []byte) bool { + n, err := rand.Read(buf) + if err != nil || n != len(buf) { + return false + } + return !allZero(buf) +} diff --git a/services/updater/boot_id_test.go b/services/updater/boot_id_test.go new file mode 100644 index 0000000..5b16a74 --- /dev/null +++ b/services/updater/boot_id_test.go @@ -0,0 +1,8 @@ +package updater + +import "sync/atomic" + +func resetBootIDForTest() { + cachedBootID.Store(nil) + atomic.StoreUint64(&fallbackTick, 0) +} diff --git a/services/updater/boot_id_tinygo.go b/services/updater/boot_id_tinygo.go new file mode 100644 index 0000000..310b082 --- /dev/null +++ b/services/updater/boot_id_tinygo.go @@ -0,0 +1,24 @@ +//go:build tinygo + +package updater + +// tryCryptoRand on TinyGo always returns false so generate() falls +// through to the non-cryptographic fallback mix. +// +// Why we don't call crypto/rand.Read here: on RP2350 (and several +// other TinyGo targets), the runtime has no hardware-RNG seam wired +// in, and crypto/rand.Read PANICS with "no rng" rather than +// returning an error. Recovering from that panic in Go is possible +// but pulls TinyGo's panic-handling runtime into the binary, +// inflating code size by ~110 KB. The fallback mix is poor +// entropy but at least it boots — and the +// `[updater] boot_id fallback engaged` log line is the canonical +// signal for the failure-mode hardware test suite. +// +// When TinyGo grows an RP2350 RNG backend or we route a HAL- +// supplied RNG into services/updater, drop this stub and let the +// host-side implementation (boot_id_host.go) be the single source. 
+func tryCryptoRand(buf []byte) bool { + _ = buf + return false +} diff --git a/services/updater/facts.go b/services/updater/facts.go new file mode 100644 index 0000000..d7c9536 --- /dev/null +++ b/services/updater/facts.go @@ -0,0 +1,49 @@ +package updater + +// PublishSoftware emits the retained state/self/software fact with the +// build identity + the per-boot RAM-only boot_id + the persisted +// payload_sha256 (when abupdate has populated it). Callers don't pass +// inputs — the fact pulls everything from the Service's configured +// Identity + boot_id cache + metadata reader. +func (s *Service) PublishSoftware() { + fact := SoftwareFact{ + Version: s.identity.Version, + BuildID: s.identity.Build, + ImageID: s.identity.ImageID, + BootID: s.ensureBootID(), + PayloadSHA256: s.metadata.PayloadSHA256(), + } + s.conn.Publish(s.conn.NewMessage(TopicSoftwareFact, fact, true)) +} + +func strPtrOrNil(v string) *string { + if v == "" { + return nil + } + return &v +} + +// PublishUpdater emits the retained state/self/updater fact with the +// canonical {state, last_error, pending_version} shape. Called on +// every state transition (via transitionTo) and as part of the post- +// hello_ack republish. +func (s *Service) PublishUpdater() { + s.mu.Lock() + fact := UpdaterFact{ + State: s.state, + LastError: strPtrOrNil(s.lastError), + PendingVersion: strPtrOrNil(s.pendingVersion), + PendingImageID: strPtrOrNil(s.pendingImageID), + StagedImageID: strPtrOrNil(s.stagedImageID), + JobID: strPtrOrNil(s.jobID), + } + s.mu.Unlock() + s.conn.Publish(s.conn.NewMessage(TopicUpdaterFact, fact, true)) +} + +// PublishHealth emits the retained state/self/health fact. Reason is +// optional; "" is dropped via the omitempty tag. 
+func (s *Service) PublishHealth(state, reason string) { + fact := HealthFact{State: state, Reason: reason} + s.conn.Publish(s.conn.NewMessage(TopicHealthFact, fact, true)) +} diff --git a/services/updater/prestage_host.go b/services/updater/prestage_host.go new file mode 100644 index 0000000..c78c83a --- /dev/null +++ b/services/updater/prestage_host.go @@ -0,0 +1,12 @@ +//go:build !tinygo || !rp2350 + +package updater + +type streamedStage struct { + Length uint32 + PayloadSHA256 string +} + +func consumeStreamedStage() (streamedStage, bool) { + return streamedStage{}, false +} diff --git a/services/updater/prestage_tinygo.go b/services/updater/prestage_tinygo.go new file mode 100644 index 0000000..bef919d --- /dev/null +++ b/services/updater/prestage_tinygo.go @@ -0,0 +1,101 @@ +//go:build tinygo && rp2350 + +package updater + +import ( + "crypto/sha256" + "encoding/hex" + "errors" + + "pico2-a-b/abupdate" +) + +// streamedStage tracks a raw transfer that fabric has already streamed into +// the inactive A/B slot. It is the TinyGo bring-up path used before imagev1 +// verification can stream directly from the transfer source. +type streamedStage struct { + Length uint32 + PayloadSHA256 string +} + +var ( + streamedStageDesc streamedStage + streamedStageOK bool + streamedStageHash = sha256.New() + streamedStageLen uint32 +) + +// BeginStreamedStage prepares the inactive slot for a raw incoming transfer. +// The caller must subsequently call WriteStreamedStage and CommitStreamedStage +// or AbortStreamedStage. +func BeginStreamedStage(size uint32) error { + // A fresh prepare invalidates any prior stage, and retrying an update in + // the same boot must not inherit abupdate's previous writing/complete + // state. Recreate the updater before resolving the inactive slot. 
+ sharedUpdater = abupdate.Updater{} + sharedUpdaterInit = false + + u, err := ensureUpdaterInited() + if err != nil { + return err + } + if rc := u.BeginUpdate(size); rc != 0 { + return errFromRC("begin_update", rc) + } + streamedStageHash.Reset() + streamedStageLen = 0 + streamedStageDesc = streamedStage{} + streamedStageOK = false + return nil +} + +func WriteStreamedStage(data []byte) error { + if len(data) == 0 { + return errors.New("empty_chunk") + } + u, err := ensureUpdaterInited() + if err != nil { + return err + } + if rc := u.WriteChunk(data); rc != 0 { + return errFromRC("write_chunk", rc) + } + _, _ = streamedStageHash.Write(data) + streamedStageLen += uint32(len(data)) + return nil +} + +func CommitStreamedStage() (uint32, error) { + u, err := ensureUpdaterInited() + if err != nil { + return 0, err + } + if rc := u.FlushFinal(); rc != 0 { + return 0, errFromRC("flush_final", rc) + } + streamedStageDesc = streamedStage{ + Length: streamedStageLen, + PayloadSHA256: hex.EncodeToString(streamedStageHash.Sum(nil)), + } + streamedStageOK = true + return u.BytesWritten(), nil +} + +func AbortStreamedStage() { + streamedStageDesc = streamedStage{} + streamedStageOK = false + streamedStageLen = 0 + streamedStageHash.Reset() +} + +func consumeStreamedStage() (streamedStage, bool) { + if !streamedStageOK { + return streamedStage{}, false + } + out := streamedStageDesc + streamedStageDesc = streamedStage{} + streamedStageOK = false + streamedStageLen = 0 + streamedStageHash.Reset() + return out, true +} diff --git a/services/updater/receiver.go b/services/updater/receiver.go new file mode 100644 index 0000000..51c33e8 --- /dev/null +++ b/services/updater/receiver.go @@ -0,0 +1,190 @@ +package updater + +import ( + "bytes" + + "devicecode-go/bus" +) + +// The SlotSink used during verification is created via newSlotSink, +// which is build-tag-split: host returns a RAM buffer (sink_host.go), +// tinygo+rp2350 returns an abupdate-backed sink that streams into the 
+// inactive A/B slot (sink_tinygo.go). + +// handleStage runs the verifier-gated staging path. Triggered by fabric +// after xfer_commit; the reply gates whether fabric sends xfer_done or +// xfer_abort. +// +// On verifier success: write staged descriptor, publish state=staged +// with the manifest's version as pending_version, return ok=true. +// +// On verifier failure: publish state=failed with the verifier's error +// string in last_error, return ok=false. +func (s *Service) handleStage(msg *bus.Message) { + payload, ok := jsonDecode[StagePayload](msg.Payload) + if !ok { + s.reply(msg, StageReply{OK: false, Err: "bad_payload"}) + return + } + if payload.Target != TargetUpdaterMain { + s.reply(msg, StageReply{OK: false, Err: "unsupported_target"}) + return + } + if payload.DigestAlg != "" && payload.DigestAlg != DigestAlgXXHash32 { + s.reply(msg, StageReply{OK: false, Err: "unsupported_digest_alg"}) + return + } + s.transitionTo(StateReceiving, "", "") + + if len(payload.Artefact) == 0 { + staged, ok := consumeStreamedStage() + if !ok { + s.clearStagedImage() + s.transitionTo(StateFailed, "artefact_missing", "") + s.reply(msg, StageReply{OK: false, Err: "artefact_missing"}) + return + } + stageIdentity, _ := identityFromStageMeta(s.identity, payload.Meta) + desc := StagedDescriptor{ + Version: stageIdentity.Version, + BuildID: stageIdentity.Build, + ImageID: stageIdentity.ImageID, + Length: staged.Length, + Slot: 0, + PayloadSHA256: staged.PayloadSHA256, + } + if err := s.metadataWrite.WriteStagedDescriptor(desc); err != nil { + _ = s.metadataWrite.ClearStagedDescriptor() + s.clearStagedImage() + s.transitionTo(StateFailed, "metadata_write_failed:"+err.Error(), "") + s.reply(msg, StageReply{OK: false, Err: "metadata_write_failed"}) + return + } + s.setStagedImage(desc.ImageID, desc.Version) + s.transitionTo(StateStaged, "", desc.Version) + s.reply(msg, StageReply{OK: true, Stage: "staged"}) + return + } + + sink, err := 
newSlotSink(uint32(len(payload.Artefact))) + if err != nil { + s.clearStagedImage() + s.transitionTo(StateFailed, "sink_init_failed:"+err.Error(), "") + s.reply(msg, StageReply{OK: false, Err: "sink_init_failed"}) + return + } + manifest, err := s.verifier.Verify(bytes.NewReader(payload.Artefact), sink) + if err != nil { + // Verifier rejected the artefact. Clear any prior descriptor so a + // following commit cannot apply stale firmware from an older stage. + _ = s.metadataWrite.ClearStagedDescriptor() + s.clearStagedImage() + s.transitionTo(StateFailed, err.Error(), "") + s.reply(msg, StageReply{OK: false, Err: err.Error()}) + return + } + + // On verifier success the sink holds the verified payload bytes. + // Persist the staged descriptor via the abupdate metadata writer + // (W11) so the next prepare/commit RPC and the next boot's + // software fact see payload_sha256 + descriptor. The fabric-update + // branch ships an in-memory writer; fabric-security replaces it + // with a flash-backed implementation that survives reboots. 
+ if err := sink.Commit(); err != nil { + _ = s.metadataWrite.ClearStagedDescriptor() + s.clearStagedImage() + s.transitionTo(StateFailed, "sink_commit_failed:"+err.Error(), "") + s.reply(msg, StageReply{OK: false, Err: "sink_commit_failed"}) + return + } + desc := StagedDescriptor{ + Version: manifest.Version, + BuildID: manifest.BuildID, + ImageID: manifest.ImageID, + Length: manifest.PayloadLength, + Slot: 0, // slot-pick comes from abupdate when fabric-security wires it + PayloadSHA256: manifest.PayloadSHA256, + } + if err := s.metadataWrite.WriteStagedDescriptor(desc); err != nil { + _ = s.metadataWrite.ClearStagedDescriptor() + s.clearStagedImage() + s.transitionTo(StateFailed, "metadata_write_failed:"+err.Error(), "") + s.reply(msg, StageReply{OK: false, Err: "metadata_write_failed"}) + return + } + + s.setStagedImage(desc.ImageID, manifest.Version) + s.transitionTo(StateStaged, "", manifest.Version) + // Do not republish the software fact here: PayloadSHA256 describes the + // running image, while this descriptor describes the staged image. 
+ s.reply(msg, StageReply{OK: true, Stage: "staged"}) +} + +type stageMetadata struct { + Version string `json:"version,omitempty"` + Build string `json:"build,omitempty"` + BuildID string `json:"build_id,omitempty"` + ImageID string `json:"image_id,omitempty"` + ExpectedImageID string `json:"expected_image_id,omitempty"` +} + +type stageMetadataEnvelope struct { + Metadata stageMetadata `json:"metadata,omitempty"` + Meta stageMetadata `json:"meta,omitempty"` + Request struct { + Metadata stageMetadata `json:"metadata,omitempty"` + Meta stageMetadata `json:"meta,omitempty"` + ExpectedImageID string `json:"expected_image_id,omitempty"` + } `json:"request,omitempty"` +} + +func applyStageMetadata(ident *Identity, md stageMetadata) bool { + applied := false + if md.Version != "" { + ident.Version = md.Version + applied = true + } + if md.BuildID != "" { + ident.Build = md.BuildID + applied = true + } else if md.Build != "" { + ident.Build = md.Build + applied = true + } + if md.ImageID != "" { + ident.ImageID = md.ImageID + applied = true + } else if md.ExpectedImageID != "" { + ident.ImageID = md.ExpectedImageID + applied = true + } + return applied +} + +func identityFromStageMeta(defaults Identity, meta any) (Identity, bool) { + ident := defaults + applied := false + md, ok := jsonDecode[stageMetadata](meta) + if ok { + applied = applyStageMetadata(&ident, md) || applied + } + + env, ok := jsonDecode[stageMetadataEnvelope](meta) + if !ok { + return ident, applied + } + applied = applyStageMetadata(&ident, env.Metadata) || applied + applied = applyStageMetadata(&ident, env.Meta) || applied + if env.Request.ExpectedImageID != "" && env.Request.Metadata.ExpectedImageID == "" { + env.Request.Metadata.ExpectedImageID = env.Request.ExpectedImageID + } + if env.Request.ExpectedImageID != "" && env.Request.Meta.ExpectedImageID == "" { + env.Request.Meta.ExpectedImageID = env.Request.ExpectedImageID + } + applied = applyStageMetadata(&ident, env.Request.Metadata) || applied 
+ applied = applyStageMetadata(&ident, env.Request.Meta) || applied + if !applied { + return ident, false + } + return ident, true +} diff --git a/services/updater/rpc.go b/services/updater/rpc.go new file mode 100644 index 0000000..b721eb3 --- /dev/null +++ b/services/updater/rpc.go @@ -0,0 +1,92 @@ +package updater + +import "devicecode-go/bus" + +// handlePrepare processes cap/self/updater/main/rpc/prepare-update after +// Fabric remaps it to the local bus. Success returns the current contract's +// prepare acknowledgement, including the required transfer target and maximum +// raw chunk size. +func (s *Service) handlePrepare(msg *bus.Message) { + req, ok := jsonDecode[PrepareRequest](msg.Payload) + if !ok { + s.reply(msg, Reply{OK: false, Error: "bad_request"}) + return + } + if req.Target != "" && req.Target != PrepareTargetMCU { + s.reply(msg, Reply{OK: false, Error: ErrTargetMismatch}) + return + } + + s.mu.Lock() + if s.preparing || s.state == StateCommitting || s.state == StateRebooting { + s.mu.Unlock() + s.reply(msg, Reply{OK: false, Error: ErrBusy}) + return + } + s.preparing = true + s.mu.Unlock() + s.setJobContext(req.JobID, req.ExpectedImageID) + s.transitionTo(StatePreparing, "", "") + + // Clear any persisted staged descriptor from a previous successful + // stage. Without this, a flow of (stage A) -> (prepare for B) -> + // (stage B fails) leaves descriptor A persisted and committable — + // which would be a real safety bug since the user-intent on + // prepare(B) is "I want to stage B, throw away A". 
+ if err := s.metadataWrite.ClearStagedDescriptor(); err != nil { + s.markPrepareDone() + s.reply(msg, Reply{OK: false, Error: "metadata_clear_failed:" + err.Error()}) + return + } + + s.transitionTo(StateReady, "", "") + s.markPrepareDone() + s.reply(msg, PrepareReply{ + Ready: true, + Target: TargetUpdaterMain, + MaxChunkSize: DefaultMaxChunkSize, + }) +} + +// handleCommit processes cap/self/updater/main/rpc/commit-update after Fabric +// remaps it to the local bus. It only accepts a valid staged descriptor +// matching the requested/remembered expected image. +func (s *Service) handleCommit(msg *bus.Message) { + req, ok := jsonDecode[CommitRequest](msg.Payload) + if !ok { + s.reply(msg, Reply{OK: false, Error: "bad_request"}) + return + } + + desc, present := s.metadata.StagedDescriptor() + s.mu.Lock() + stagedInState := s.state == StateStaged + pendingImageID := s.pendingImageID + s.mu.Unlock() + + if !present || !stagedInState { + s.reply(msg, Reply{OK: false, Error: ErrNothingStaged}) + return + } + expectedImageID := req.ExpectedImageID + if expectedImageID == "" { + expectedImageID = pendingImageID + } + if expectedImageID != "" && desc.ImageID != expectedImageID { + s.reply(msg, Reply{OK: false, Error: ErrTargetMismatch}) + return + } + + // Validate the apply path before publishing committing/rebooting or + // replying accepted. The default Applier refuses in non-hardware tests. 
+ if err := s.applier.CanApply(desc); err != nil { + s.reply(msg, Reply{OK: false, Error: err.Error()}) + return + } + + s.transitionTo(StateCommitting, "", desc.Version) + s.reply(msg, CommitReply{Accepted: true, RebootRequired: true}) + s.transitionTo(StateRebooting, "", desc.Version) + + scheduleArmReboot(s.applier, desc, s.applyResults) +} diff --git a/services/updater/sink_host.go b/services/updater/sink_host.go new file mode 100644 index 0000000..7c2f78c --- /dev/null +++ b/services/updater/sink_host.go @@ -0,0 +1,44 @@ +//go:build !tinygo || !rp2350 + +package updater + +import ( + "bytes" + "io" +) + +// memorySink buffers verified payload bytes in RAM. Used on host builds +// (tests, dev builds) where there's no flash slot to write into. Any +// build that needs to stage to actual flash uses the tinygo+rp2350 +// sink in sink_tinygo.go. +type memorySink struct { + buf bytes.Buffer + closed bool +} + +func (m *memorySink) Write(p []byte) (int, error) { + if m.closed { + return 0, io.ErrClosedPipe + } + return m.buf.Write(p) +} + +func (m *memorySink) Commit() error { + m.closed = true + return nil +} + +func (m *memorySink) Abort() error { + m.buf.Reset() + m.closed = true + return nil +} + +// newSlotSink returns the host-default sink. totalSize is unused — the +// memory sink grows as bytes arrive. Staging passes it for parity +// with the tinygo factory which must hand the size to abupdate up +// front. +func newSlotSink(totalSize uint32) (SlotSink, error) { + _ = totalSize + return &memorySink{}, nil +} diff --git a/services/updater/sink_tinygo.go b/services/updater/sink_tinygo.go new file mode 100644 index 0000000..9ce85dc --- /dev/null +++ b/services/updater/sink_tinygo.go @@ -0,0 +1,116 @@ +//go:build tinygo && rp2350 + +package updater + +import ( + "errors" + + "pico2-a-b/abupdate" +) + +// sharedUpdater is the package-level abupdate instance. 
It must persist +// across the staging path (which writes the staged image into the +// inactive slot via the abupdateSink) and the applier path (which +// reboots into that slot). One device = one inactive slot, so a +// singleton is fine. +var ( + sharedUpdater abupdate.Updater + sharedUpdaterInit bool +) + +func ensureUpdaterInited() (*abupdate.Updater, error) { + if !sharedUpdaterInit { + if rc := sharedUpdater.Init(); rc != 0 { + return nil, errFromRC("updater_init", rc) + } + sharedUpdaterInit = true + } + return &sharedUpdater, nil +} + +// abupdateSink streams verified payload bytes straight into the +// inactive A/B slot via abupdate.WriteChunk. Commit() pads + writes +// the final partial page; Abort() leaves the slot in its current +// state (the next BeginUpdate erases sectors lazily as the next image +// is written). +type abupdateSink struct { + u *abupdate.Updater + closed bool +} + +func (s *abupdateSink) Write(p []byte) (int, error) { + if s.closed { + return 0, errors.New("abupdate_sink: closed") + } + if rc := s.u.WriteChunk(p); rc != 0 { + return 0, errFromRC("write_chunk", rc) + } + return len(p), nil +} + +func (s *abupdateSink) Commit() error { + if s.closed { + return nil + } + s.closed = true + if rc := s.u.FlushFinal(); rc != 0 { + return errFromRC("flush_final", rc) + } + return nil +} + +func (s *abupdateSink) Abort() error { + s.closed = true + return nil +} + +// newSlotSink resolves the inactive slot, calls BeginUpdate(totalSize) +// so abupdate knows when to stop erasing, and hands back a sink that +// streams into flash. The staging path creates one of these per Verify +// call. 
+func newSlotSink(totalSize uint32) (SlotSink, error) { + u, err := ensureUpdaterInited() + if err != nil { + return nil, err + } + if rc := u.BeginUpdate(totalSize); rc != 0 { + return nil, errFromRC("begin_update", rc) + } + return &abupdateSink{u: u}, nil +} + +func errFromRC(op string, rc int32) error { + return &rcError{op: op, rc: rc} +} + +type rcError struct { + op string + rc int32 +} + +func (e *rcError) Error() string { + return e.op + ":" + i32s(e.rc) +} + +func i32s(v int32) string { + if v == 0 { + return "0" + } + neg := false + if v < 0 { + neg = true + v = -v + } + var buf [12]byte + pos := len(buf) + for v > 0 { + pos-- + buf[pos] = byte('0' + v%10) + v /= 10 + } + if neg { + pos-- + buf[pos] = '-' + } + return string(buf[pos:]) +} diff --git a/services/updater/types.go b/services/updater/types.go new file mode 100644 index 0000000..7fc1231 --- /dev/null +++ b/services/updater/types.go @@ -0,0 +1,151 @@ +package updater + +// State enumerates the canonical updater states from the current MCU +// contract. Empty string is accepted as the nil/unset state for local +// callers that have not published a fact yet. +type State string + +const ( + StateRunning State = "running" + StateReady State = "ready" + StatePreparing State = "preparing" + StateReceiving State = "receiving" + StateStaged State = "staged" + StateCommitting State = "committing" + StateRebooting State = "rebooting" + StateFailed State = "failed" + StateRollbackDetected State = "rollback_detected" +) + +func (s State) Allowed() bool { + switch s { + case "", + StateRunning, StateReady, StatePreparing, StateReceiving, + StateStaged, StateCommitting, StateRebooting, + StateFailed, StateRollbackDetected: + return true + } + return false +} + +const ( + PrepareTargetMCU = "mcu" + TargetUpdaterMain = "updater/main" + DigestAlgXXHash32 = "xxhash32" + DefaultMaxChunkSize uint32 = 2048 +) + +// PrepareRequest mirrors the current prepare-update payload. 
+// metadata is intentionally opaque so the CM5 can add fields without +// requiring a device firmware rebuild. +type PrepareRequest struct { + JobID string `json:"job_id,omitempty"` + Target string `json:"target,omitempty"` + ExpectedImageID string `json:"expected_image_id,omitempty"` + Metadata any `json:"metadata,omitempty"` +} + +// CommitRequest mirrors commit-update. +type CommitRequest struct { + JobID string `json:"job_id,omitempty"` + ExpectedImageID string `json:"expected_image_id,omitempty"` + Metadata any `json:"metadata,omitempty"` +} + +type PrepareReply struct { + Ready bool `json:"ready"` + Target string `json:"target"` + MaxChunkSize uint32 `json:"max_chunk_size"` +} + +type CommitReply struct { + Accepted bool `json:"accepted"` + RebootRequired bool `json:"reboot_required,omitempty"` +} + +// Reply is retained for refusal/error replies. Successful prepare/commit +// calls use the contract-specific PrepareReply and CommitReply shapes. +type Reply struct { + OK bool `json:"ok"` + Accepted bool `json:"accepted,omitempty"` + Error string `json:"error,omitempty"` +} + +// Refusal error strings — the Lua side compares against these. +const ( + ErrBusy = "busy" + ErrNothingStaged = "nothing_staged" + ErrTargetMismatch = "target_mismatch" + // ErrApplyUnavailable is returned when the commit RPC sees a valid + // staged descriptor but no Applier is wired to actually trigger + // the slot-switch + reboot. fabric-update ships with a refusing + // Applier so we never lie to the CM5 about apply success on a + // branch where the apply path doesn't exist; fabric-security + // supplies a real Applier and the refusal goes away. + ErrApplyUnavailable = "apply_unavailable" +) + +// SoftwareFact is the retained payload at state/self/software per +// docs/firmware-alignment-update.md §"Identity facts". `boot_id` is +// generated per boot (W6, RAM-only); `payload_sha256` is bare 64-char +// lower-hex sourced from the abupdate metadata block. 
+type SoftwareFact struct { + Version string `json:"version"` + BuildID string `json:"build_id"` + ImageID string `json:"image_id"` + BootID string `json:"boot_id"` + PayloadSHA256 string `json:"payload_sha256,omitempty"` +} + +// UpdaterFact is the retained payload at state/self/updater. Nullable +// fields are pointers so JSON publishes explicit nulls, not omitted +// properties, when no value is present. +type UpdaterFact struct { + State State `json:"state"` + LastError *string `json:"last_error"` + PendingVersion *string `json:"pending_version"` + PendingImageID *string `json:"pending_image_id"` + StagedImageID *string `json:"staged_image_id"` + JobID *string `json:"job_id"` +} + +// HealthFact is the retained payload at state/self/health. Lua extracts +// `state`; Reason is optional. +type HealthFact struct { + State string `json:"state"` + Reason string `json:"reason,omitempty"` +} + +// StagedDescriptor is the metadata about a staged image, persisted in +// the abupdate metadata block by updater/main staging after the verifier +// accepts. Read at the next prepare/commit RPC to know what's actually +// stageable. +type StagedDescriptor struct { + Version string `json:"version"` + BuildID string `json:"build_id"` + ImageID string `json:"image_id"` + Length uint32 `json:"length"` + Slot uint8 `json:"slot"` + PayloadSHA256 string `json:"payload_sha256"` +} + +// StagePayload is the local updater/main staging RPC invoked by fabric +// after xfer_commit has verified size and transfer digest. It replaces +// the older meta.receiver/raw-member receive path; the CM5 supplies only +// target="updater/main" on the wire. 
+type StagePayload struct { + LinkID string `json:"link_id"` + XferID string `json:"xfer_id"` + Target string `json:"target"` + Size uint32 `json:"size"` + DigestAlg string `json:"digest_alg"` + Digest string `json:"digest"` + Meta any `json:"meta,omitempty"` + Artefact []byte `json:"artefact,omitempty"` +} + +type StageReply struct { + OK bool `json:"ok"` + Err string `json:"err,omitempty"` + Stage string `json:"stage,omitempty"` +} diff --git a/services/updater/updater.go b/services/updater/updater.go new file mode 100644 index 0000000..b5dc335 --- /dev/null +++ b/services/updater/updater.go @@ -0,0 +1,475 @@ +package updater + +import ( + "context" + "encoding/json" + "sync" + + "devicecode-go/bus" +) + +// Local-bus topics the updater binds to. Fabric routes wire +// prepare-update/commit-update calls here. The staging path is a local RPC +// called by fabric after xfer_commit for +// target="updater/main"; raw/member topic names are not wire contract. +var ( + TopicPrepareRPC = bus.T("rpc", "updater", "prepare") + TopicCommitRPC = bus.T("rpc", "updater", "commit") + TopicStageRPC = bus.T("rpc", "updater", "stage") + + TopicSoftwareFact = bus.T("state", "self", "software") + TopicUpdaterFact = bus.T("state", "self", "updater") + TopicHealthFact = bus.T("state", "self", "health") + + // TopicFabricLink is the wildcard the updater watches to drive the + // post-hello_ack republish (W10). The fabric session retains a + // payload at state/fabric/link/ on every link-state edge; + // we pick out Ready-true transitions and call Republish() so the + // CM5 sees fresh state/self/* facts on every newly established + // session, warm or cold. + TopicFabricLink = bus.T("state", "fabric", "link", "+") +) + +// Identity carries the build-time stamp the software fact publishes. +// Filled in main.go (or tests) when constructing the updater. 
+type Identity struct { + Version string + Build string + ImageID string +} + +// MetadataReader is the read side of the abupdate metadata block — the +// updater pulls payload_sha256 and the staged descriptor (if any) from +// here at boot. The fabric-update branch only requires reads from this +// interface; the matching MetadataWriter handles staging-side +// persistence in W11. +type MetadataReader interface { + PayloadSHA256() string + StagedDescriptor() (StagedDescriptor, bool) +} + +// MetadataWriter is the write side: updater/main staging hands a verified +// StagedDescriptor + payload_sha256 here so the next boot's +// MetadataReader observes them. A default in-memory implementation is +// supplied (NewMemoryMetadata) for the fabric-update branch; the +// pico2-a-b/abupdate flash-backed implementation lands later (it +// touches the metadata sector at offset 0x000FF000 — see master +// plan §abupdate metadata block). +type MetadataWriter interface { + WriteStagedDescriptor(d StagedDescriptor) error + ClearStagedDescriptor() error +} + +// MemoryMetadata is the default in-memory MetadataReader+Writer used by host +// tests and non-persistent builds. +// +// Two separate payload-hash fields are intentional: +// - runningPayloadSHA — the hash of the IMAGE THAT IS RUNNING. Set +// once at boot from the active slot's metadata block. Read by +// SoftwareFact.PayloadSHA256. +// - stagedPayloadSHA — carried inside StagedDescriptor; lives only +// when a staged image is present. Cleared by +// ClearStagedDescriptor; never bleeds into the running fact. +// +// Sharing a single field would let prepare/stage-failure leave a +// stale staged hash sitting on the wire-visible software fact even +// after the descriptor was cleared. +type MemoryMetadata struct { + mu sync.Mutex + runningPayloadSHA string + desc StagedDescriptor + hasDesc bool +} + +// NewMemoryMetadata returns an empty MemoryMetadata. 
runningPayloadSHA +// stays "" until the caller calls SetRunningPayloadSHA from the boot +// path (typically reading the active slot's metadata block); the +// staged descriptor stays empty until updater/main staging writes it. +func NewMemoryMetadata() *MemoryMetadata { return &MemoryMetadata{} } + +// SetRunningPayloadSHA records the hash of the currently-running +// image. fabric-security wires this from the active slot's flash +// metadata at boot; tests can call it directly. Bare 64-char +// lower-hex per the spec. +func (m *MemoryMetadata) SetRunningPayloadSHA(sha string) { + m.mu.Lock() + defer m.mu.Unlock() + m.runningPayloadSHA = sha +} + +func (m *MemoryMetadata) PayloadSHA256() string { + m.mu.Lock() + defer m.mu.Unlock() + return m.runningPayloadSHA +} + +func (m *MemoryMetadata) StagedDescriptor() (StagedDescriptor, bool) { + m.mu.Lock() + defer m.mu.Unlock() + return m.desc, m.hasDesc +} + +func (m *MemoryMetadata) WriteStagedDescriptor(d StagedDescriptor) error { + m.mu.Lock() + defer m.mu.Unlock() + m.desc = d + m.hasDesc = true + // Note: running hash is NOT updated here. The staged hash lives + // inside the descriptor; it only becomes the running hash after + // a successful boot into the staged slot, at which point the + // next boot's SetRunningPayloadSHA pulls it from flash metadata. + return nil +} + +func (m *MemoryMetadata) ClearStagedDescriptor() error { + m.mu.Lock() + defer m.mu.Unlock() + m.desc = StagedDescriptor{} + m.hasDesc = false + return nil +} + +// nullMetadata is the zero-value default when the caller doesn't +// provide a MetadataReader. Read-only — no Write methods. +type nullMetadata struct{} + +func (nullMetadata) PayloadSHA256() string { return "" } +func (nullMetadata) StagedDescriptor() (StagedDescriptor, bool) { return StagedDescriptor{}, false } + +type applyRebootResult struct { + desc StagedDescriptor + err error +} + +// Service is the updater state machine + RPC binder. 
Constructed once +// in reactor.go and run in its own goroutine. +type Service struct { + conn *bus.Connection + verifier Verifier + applier Applier + identity Identity + metadata MetadataReader + metadataWrite MetadataWriter + + mu sync.Mutex + state State + lastError string + pendingVersion string + pendingImageID string + stagedImageID string + jobID string + preparing bool + + applyResults chan applyRebootResult + + // Logger seam — left as a small helper so tests can plug in. nil in + // tests means stderr-style println. + logf func(string, ...any) +} + +// Options bundle the constructor parameters so Service can grow new +// dependencies without churning callers. +type Options struct { + Conn *bus.Connection + Verifier Verifier + Applier Applier + Identity Identity + Metadata MetadataReader + MetadataWrite MetadataWriter +} + +// New builds a Service. Verifier defaults to the rejecting StubVerifier +// and Applier defaults to RefusingApplier so the production wiring +// never claims an apply succeeded when the apply path isn't +// implemented yet. Metadata defaults to a fresh in-memory +// implementation that's both reader and writer — fine for tests and +// for the rejecting-stub production path where nothing ever writes +// anyway. +func New(opts Options) *Service { + v := opts.Verifier + if v == nil { + v = StubVerifier() + } + a := opts.Applier + if a == nil { + a = RefusingApplier() + } + mr := opts.Metadata + mw := opts.MetadataWrite + if mr == nil && mw == nil { + shared := NewMemoryMetadata() + mr = shared + mw = shared + } else if mr == nil { + mr = nullMetadata{} + } else if mw == nil { + // Reader-only: writes from staging become no-ops. 
+ mw = noopMetadataWriter{} + } + return &Service{ + conn: opts.Conn, + verifier: v, + applier: a, + identity: opts.Identity, + metadata: mr, + metadataWrite: mw, + state: StateRunning, + applyResults: make(chan applyRebootResult, 1), + } +} + +// noopMetadataWriter is the writer-side fallback when the caller +// supplied a MetadataReader without a matching writer. +type noopMetadataWriter struct{} + +func (noopMetadataWriter) WriteStagedDescriptor(d StagedDescriptor) error { + return nil +} +func (noopMetadataWriter) ClearStagedDescriptor() error { + return nil +} + +// Run binds the RPC + staging topics, publishes the initial fact +// surface, and watches the fabric link-state retain for ready-true +// edges (W10). Blocks until ctx is cancelled. +func (s *Service) Run(ctx context.Context) { + prepareSub := s.conn.Subscribe(TopicPrepareRPC) + defer s.conn.Unsubscribe(prepareSub) + + commitSub := s.conn.Subscribe(TopicCommitRPC) + defer s.conn.Unsubscribe(commitSub) + + stageSub := s.conn.Subscribe(TopicStageRPC) + defer s.conn.Unsubscribe(stageSub) + + linkSub := s.conn.Subscribe(TopicFabricLink) + defer s.conn.Unsubscribe(linkSub) + + // Initial fact publish: tells the CM5 we're alive and reports + // build identity + the freshly generated boot_id. + s.PublishSoftware() + s.PublishUpdater() + s.PublishHealth("ok", "") + + // Track per-link ready state so we only republish on the + // !Ready -> Ready edge, not on every retain churn. 
+ prevReady := map[string]bool{} + + for { + select { + case <-ctx.Done(): + return + case msg, ok := <-prepareSub.Channel(): + if !ok || msg == nil { + continue + } + s.handlePrepare(msg) + case msg, ok := <-commitSub.Channel(): + if !ok || msg == nil { + continue + } + s.handleCommit(msg) + case msg, ok := <-stageSub.Channel(): + if !ok || msg == nil { + continue + } + s.handleStage(msg) + case result := <-s.applyResults: + s.failRebootIfCurrent(result.desc, result.err) + case msg, ok := <-linkSub.Channel(): + if !ok || msg == nil { + continue + } + linkID, ready := decodeLinkState(msg) + if linkID == "" { + continue + } + was := prevReady[linkID] + if ready && !was { + // W10: post-hello_ack republish. Mirrors the spec line + // "republished after every successful boot AND on every + // newly established session (hello_ack), warm or cold". + s.Republish() + } + prevReady[linkID] = ready + } + } +} + +// Republish re-emits all retained `state/self/*` facts. Wired up to +// fabric's session lifecycle so every new hello_ack triggers a fresh +// retain — required by the spec for warm-and-cold session resumes. +func (s *Service) Republish() { + s.PublishSoftware() + s.PublishUpdater() + s.PublishHealth("ok", "") +} + +// transitionTo updates state under the lock and publishes the updater +// fact. Returns the previous state for callers that want to log or +// confirm a precondition. 
+func (s *Service) transitionTo(next State, lastError, pendingVersion string) State { + s.mu.Lock() + prev := s.state + s.state = next + if lastError != "" || (next != StateFailed && next != StateRollbackDetected) { + s.lastError = lastError + } + if pendingVersion != "" { + s.pendingVersion = pendingVersion + } else if next == StatePreparing || next == StateReady || next == StateReceiving { + s.pendingVersion = "" + } + s.mu.Unlock() + s.PublishUpdater() + return prev +} + +func (s *Service) failRebootIfCurrent(desc StagedDescriptor, err error) bool { + if err == nil { + return false + } + s.mu.Lock() + matches := s.state == StateRebooting && + s.pendingVersion == desc.Version && + s.stagedImageID == desc.ImageID + s.mu.Unlock() + if !matches { + return false + } + s.transitionTo(StateFailed, err.Error(), desc.Version) + return true +} + +func (s *Service) setJobContext(jobID, pendingImageID string) { + s.mu.Lock() + s.jobID = jobID + s.pendingImageID = pendingImageID + s.stagedImageID = "" + s.mu.Unlock() +} + +func (s *Service) setStagedImage(imageID, version string) { + s.mu.Lock() + s.stagedImageID = imageID + if version != "" { + s.pendingVersion = version + } + s.mu.Unlock() +} + +func (s *Service) clearStagedImage() { + s.mu.Lock() + s.stagedImageID = "" + s.pendingVersion = "" + s.mu.Unlock() +} + +// markPrepareDone clears the preparing flag. handlePrepare/handleCommit +// guard re-entry through this. +func (s *Service) markPrepareDone() { + s.mu.Lock() + s.preparing = false + s.mu.Unlock() +} + +// boot-time initialization helper — main.go calls this before opening +// fabric so the first software-fact publish has a non-empty boot_id. +func (s *Service) ensureBootID() string { + id := BootID() + if id == "" { + id = GenerateBootID() + } + return id +} + +// reply is a thin convenience wrapper that tolerates nil msg (defensive +// against bus quirks observed during fabric-protocol bring-up where a +// ctx cancel could land a nil message on the channel). 
+func (s *Service) reply(msg *bus.Message, payload any) { + if msg == nil || !msg.CanReply() { + return + } + s.conn.Reply(msg, payload, false) +} + +// decodeLinkState extracts the link_id and ready flag from a +// state/fabric/link/ retain. Tolerates both the typed payload +// shape published by services/fabric/session.go and a generic +// map[string]any (in-process test harnesses). Returns ("", false) +// for any payload it can't make sense of — the caller treats that +// as "no edge". +func decodeLinkState(msg *bus.Message) (string, bool) { + if msg == nil { + return "", false + } + // Pull link_id from the topic tail (state/fabric/link/). + t := msg.Topic + if t == nil || t.Len() < 4 { + return "", false + } + last := t.At(t.Len() - 1) + linkID, _ := last.(string) + if linkID == "" { + return "", false + } + switch p := msg.Payload.(type) { + case nil: + return linkID, false + case map[string]any: + ready, _ := p["ready"].(bool) + return linkID, ready + } + // Fall back to JSON probe for the typed-struct payload that + // fabric publishes via its linkStatePayload type. + b, err := json.Marshal(msg.Payload) + if err != nil { + return linkID, false + } + var probe struct { + Ready bool `json:"ready"` + } + if err := json.Unmarshal(b, &probe); err != nil { + return linkID, false + } + return linkID, probe.Ready +} + +// jsonDecode is a small helper that tolerates both already-typed +// payloads (Go-side test wiring) and raw JSON payloads (real wire). +// Returns the decoded value or false on a hopeless mismatch. 
func jsonDecode[T any](payload any) (T, bool) {
	// zero is what every "nothing decoded" path returns. In particular a
	// failed decode yields the zero value rather than whatever fields
	// json.Unmarshal managed to fill in before it hit the error, so a
	// caller that forgets to check the bool never acts on half a payload.
	var zero T

	var raw []byte
	switch v := payload.(type) {
	case nil:
		// An absent payload decodes to the zero value successfully.
		return zero, true
	case T:
		// Already the requested type (Go-side test wiring).
		return v, true
	case json.RawMessage:
		raw = v
	case []byte:
		raw = v
	default:
		// Unknown shape: round-trip through JSON. Covers the test path
		// where callers pass map[string]any.
		b, err := json.Marshal(payload)
		if err != nil {
			return zero, false
		}
		raw = b
	}

	// Empty raw bytes are treated the same as a nil payload.
	if len(raw) == 0 {
		return zero, true
	}
	var out T
	if err := json.Unmarshal(raw, &out); err != nil {
		return zero, false
	}
	return out, true
}
return f.staged, f.has } + +// fakeApplier always succeeds — used by tests that need the commit RPC +// to drive the state machine through committing/rebooting without +// actually rebooting (production wiring uses RefusingApplier so the +// commit RPC returns apply_unavailable until fabric-security supplies +// the real abupdate-backed implementation). +// +// canCalls and rebootCalls are kept separate so tests can verify the commit +// ordering: CanApply first, publish rebooting + reply accepted, then ArmReboot. +type fakeApplier struct { + mu sync.Mutex + canCalls []StagedDescriptor + rebootCalls []StagedDescriptor + rebootErr error + rebootCh chan StagedDescriptor +} + +func (f *fakeApplier) CanApply(d StagedDescriptor) error { + f.mu.Lock() + defer f.mu.Unlock() + f.canCalls = append(f.canCalls, d) + return nil +} + +func (f *fakeApplier) ArmReboot(d StagedDescriptor) error { + f.mu.Lock() + f.rebootCalls = append(f.rebootCalls, d) + err := f.rebootErr + ch := f.rebootCh + f.mu.Unlock() + if ch != nil { + select { + case ch <- d: + default: + } + } + return err +} + +func (f *fakeApplier) callCounts() (int, int) { + f.mu.Lock() + defer f.mu.Unlock() + return len(f.canCalls), len(f.rebootCalls) +} + +func (f *fakeApplier) rebootCall(i int) StagedDescriptor { + f.mu.Lock() + defer f.mu.Unlock() + return f.rebootCalls[i] +} + +// ---- boot_id (W6) --------------------------------------------------- + +func TestBootIDIs16HexChars(t *testing.T) { + resetBootIDForTest() + id := GenerateBootID() + if len(id) != 16 { + t.Fatalf("len = %d, want 16", len(id)) + } + if _, err := hex.DecodeString(id); err != nil { + t.Fatalf("not hex: %v", err) + } +} + +func TestBootIDIsCachedAcrossCalls(t *testing.T) { + // Within a process boot, GenerateBootID is idempotent — multiple + // callers see the same value. 
+ resetBootIDForTest() + a := GenerateBootID() + b := GenerateBootID() + if a != b { + t.Fatalf("non-idempotent: %s vs %s", a, b) + } +} + +func TestBootIDChangesAfterReset(t *testing.T) { + // resetBootIDForTest mimics a successful boot. 10 successive boots + // must all produce unique values (master R3 failure-mode list: + // "RNG-never-seeded / from-constant" guard). + seen := make(map[string]struct{}) + for i := 0; i < 10; i++ { + resetBootIDForTest() + id := GenerateBootID() + if _, dup := seen[id]; dup { + t.Fatalf("boot %d duplicated id %s", i, id) + } + seen[id] = struct{}{} + } +} + +func TestBootIDIsNotAllZero(t *testing.T) { + // "Generated-before-entropy" guard: all-zero sentinel should never + // be returned. The fallback path explicitly walks past it. + for i := 0; i < 20; i++ { + resetBootIDForTest() + id := GenerateBootID() + if id == "0000000000000000" { + t.Fatal("got all-zero boot_id") + } + } +} + +// ---- state machine + RPC handlers (W4) ------------------------------ + +func waitForFact[T any](t *testing.T, sub *bus.Subscription, want func(T) bool) T { + t.Helper() + deadline := time.After(2 * time.Second) + for { + select { + case msg := <-sub.Channel(): + if msg == nil { + continue + } + fact, ok := msg.Payload.(T) + if !ok { + continue + } + if want == nil || want(fact) { + return fact + } + case <-deadline: + t.Fatal("timeout waiting for fact") + } + } +} + +func strValue(p *string) string { + if p == nil { + return "" + } + return *p +} + +func testStagePayload(id string, artefact []byte) StagePayload { + return StagePayload{ + LinkID: "mcu-uart0", + XferID: id, + Target: TargetUpdaterMain, + Size: uint32(len(artefact)), + DigestAlg: DigestAlgXXHash32, + Digest: "deadbeef", + Artefact: artefact, + } +} + +func runService(t *testing.T, b *bus.Bus, opts Options) (*Service, context.CancelFunc) { + t.Helper() + resetBootIDForTest() + if opts.Conn == nil { + t.Fatal("Options.Conn is required") + } + if opts.Identity.Version == "" { + 
opts.Identity = Identity{Version: "0.0.0-test", Build: "build-test", ImageID: "img-test"} + } + // Subscribe to the software-fact topic BEFORE starting Run, so we + // catch the initial publish without racing the goroutine's bus + // subscriptions. The probe lives on its own connection so it + // doesn't interfere with the caller's subscriptions. + probeConn := b.NewConnection("updater-probe") + probe := probeConn.Subscribe(TopicSoftwareFact) + svc := New(opts) + ctx, cancel := context.WithCancel(context.Background()) + go svc.Run(ctx) + select { + case msg := <-probe.Channel(): + if msg == nil { + t.Fatal("nil software fact at boot") + } + case <-time.After(2 * time.Second): + cancel() + t.Fatal("updater service did not publish initial software fact") + } + probeConn.Unsubscribe(probe) + return svc, cancel +} + +func TestPublishesInitialFactsOnRun(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + observer := b.NewConnection("observer") + swSub := observer.Subscribe(TopicSoftwareFact) + defer observer.Unsubscribe(swSub) + upSub := observer.Subscribe(TopicUpdaterFact) + defer observer.Unsubscribe(upSub) + hSub := observer.Subscribe(TopicHealthFact) + defer observer.Unsubscribe(hSub) + + _, cancel := runService(t, b, Options{ + Conn: conn, + Identity: Identity{Version: "1.2.3", Build: "abc", ImageID: "img-1"}, + }) + defer cancel() + + sw := waitForFact[SoftwareFact](t, swSub, nil) + if sw.Version != "1.2.3" || sw.BuildID != "abc" || sw.ImageID != "img-1" { + t.Fatalf("software identity wrong: %+v", sw) + } + if len(sw.BootID) != 16 { + t.Fatalf("boot_id len = %d, want 16 chars: %q", len(sw.BootID), sw.BootID) + } + + up := waitForFact[UpdaterFact](t, upSub, nil) + if up.State != StateRunning { + t.Fatalf("updater state = %q, want %q", up.State, StateRunning) + } + + h := waitForFact[HealthFact](t, hSub, nil) + if h.State != "ok" { + t.Fatalf("health state = %q, want ok", h.State) + } +} + +func TestPrepareTransitionsToReady(t *testing.T) { + 
b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + _, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + + // drain initial running fact + _ = waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateRunning }) + + req := caller.NewMessage(TopicPrepareRPC, PrepareRequest{Target: PrepareTargetMCU}, false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + + select { + case msg := <-replySub.Channel(): + reply, ok := msg.Payload.(PrepareReply) + if !ok { + t.Fatalf("reply payload type = %T", msg.Payload) + } + if !reply.Ready || reply.Target != TargetUpdaterMain || reply.MaxChunkSize != DefaultMaxChunkSize { + t.Fatalf("prepare reply = %+v, want ready target max_chunk_size", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for prepare reply") + } + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateReady }) + if up.LastError != nil { + t.Fatalf("last_error not cleared on prepare: %q", strValue(up.LastError)) + } +} + +func TestCommitWithoutStagedReturnsNothingStaged(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + _, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + + req := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + + select { + case msg := <-replySub.Channel(): + reply, ok := msg.Payload.(Reply) + if !ok { + t.Fatalf("reply payload type = %T", msg.Payload) + } + if reply.OK { + t.Fatalf("commit unexpectedly OK without staged image: %+v", reply) + } + if reply.Error != ErrNothingStaged { + t.Fatalf("commit error = %q, want %q", reply.Error, ErrNothingStaged) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for commit 
reply") + } +} + +func TestCommitWithoutStagedStateRefusesEvenWithDescriptor(t *testing.T) { + // Both halves of the staged condition are required: a descriptor + // in metadata AND state == staged. A descriptor without the + // matching state means the receiver didn't actually finish, so + // commit must refuse rather than push into committing/rebooting. + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + md := &fakeMetadata{ + has: true, + staged: StagedDescriptor{Version: "9.9.9", BuildID: "bx", ImageID: "ix", Length: 4096, Slot: 1, PayloadSHA256: strings.Repeat("a", 64)}, + } + _, cancel := runService(t, b, Options{Conn: conn, Metadata: md, Applier: &fakeApplier{}}) + defer cancel() + + req := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + select { + case msg := <-replySub.Channel(): + reply, _ := msg.Payload.(Reply) + if reply.OK || reply.Error != ErrNothingStaged { + t.Fatalf("commit reply = %+v, want refusal=nothing_staged", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for commit reply") + } +} + +func TestCommitWithoutApplierReturnsApplyUnavailable(t *testing.T) { + // Spec safety: the commit RPC must not claim success when the MCU + // has no apply hook wired (the production default RefusingApplier + // returns ErrApplyUnavailable). State stays at staged; the + // receiver-staged descriptor remains valid for a subsequent + // commit once a real Applier is wired. 
+ b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + verif := &fakeVerifierAccept{manifest: Manifest{Version: "9.9.9", PayloadSHA256: strings.Repeat("a", 64), PayloadLength: 4}} + memMD := NewMemoryMetadata() + _, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: verif, + Metadata: memMD, + MetadataWrite: memMD, + // No Applier supplied — defaults to RefusingApplier. + }) + defer cancel() + + // Drive updater/main staging to staged state. + rreq := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-x", []byte("blob")), false) + rsub := caller.Request(rreq) + defer caller.Unsubscribe(rsub) + <-rsub.Channel() + + creq := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) + csub := caller.Request(creq) + defer caller.Unsubscribe(csub) + select { + case msg := <-csub.Channel(): + reply, _ := msg.Payload.(Reply) + if reply.OK || reply.Error != ErrApplyUnavailable { + t.Fatalf("commit reply = %+v, want refusal=apply_unavailable", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for commit reply") + } + + // State must NOT have transitioned to committing/rebooting — that would lie. + settle := time.After(150 * time.Millisecond) + for { + select { + case msg := <-upSub.Channel(): + fact, _ := msg.Payload.(UpdaterFact) + if fact.State == StateCommitting || fact.State == StateRebooting { + t.Fatalf("state transitioned to %s despite refusing applier", fact.State) + } + case <-settle: + return + } + } +} + +func TestCommitWithFakeApplierTransitionsToRebooting(t *testing.T) { + // With a real Applier supplied, the staged descriptor in metadata and state + // drives commit through committing to rebooting. 
+ b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + verif := &fakeVerifierAccept{manifest: Manifest{Version: "9.9.9", PayloadSHA256: strings.Repeat("a", 64), PayloadLength: 4}} + memMD := NewMemoryMetadata() + app := &fakeApplier{rebootCh: make(chan StagedDescriptor, 1)} + _, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: verif, + Applier: app, + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancel() + + // Stage via updater/main. + rreq := caller.NewMessage(TopicStageRPC, testStagePayload("x", []byte("blob")), false) + <-caller.Request(rreq).Channel() + + // Commit. + creq := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) + csub := caller.Request(creq) + defer caller.Unsubscribe(csub) + select { + case msg := <-csub.Channel(): + reply, _ := msg.Payload.(CommitReply) + if !reply.Accepted || !reply.RebootRequired { + t.Fatalf("commit reply = %+v", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout") + } + + select { + case <-app.rebootCh: + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for ArmReboot") + } + + canCalls, rebootCalls := app.callCounts() + if canCalls != 1 || rebootCalls != 1 { + t.Fatalf("Applier hooks fired wrong: can=%d reboot=%d, want 1+1", + canCalls, rebootCalls) + } + if got := app.rebootCall(0).Version; got != "9.9.9" { + t.Fatalf("ArmReboot got descriptor.Version = %q, want 9.9.9", got) + } + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateRebooting }) + if strValue(up.PendingVersion) != "9.9.9" { + t.Fatalf("pending_version = %q", strValue(up.PendingVersion)) + } +} + +func TestCommitApplyRebootErrorPublishesFailedAfterAcceptedReply(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer 
caller.Unsubscribe(upSub) + + verif := &fakeVerifierAccept{manifest: Manifest{ + Version: "9.9.9", + BuildID: "build-9.9.9", + ImageID: "mcu-dev-9.9.9", + PayloadSHA256: strings.Repeat("a", 64), + PayloadLength: 4, + }} + memMD := NewMemoryMetadata() + app := &fakeApplier{rebootErr: errors.New("apply_reboot_failed:reboot_into_slot:-1")} + _, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: verif, + Applier: app, + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancel() + + rreq := caller.NewMessage(TopicStageRPC, testStagePayload("x", []byte("blob")), false) + <-caller.Request(rreq).Channel() + + creq := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) + csub := caller.Request(creq) + defer caller.Unsubscribe(csub) + select { + case msg := <-csub.Channel(): + reply, _ := msg.Payload.(CommitReply) + if !reply.Accepted || !reply.RebootRequired { + t.Fatalf("commit reply = %+v", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for commit reply") + } + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateFailed }) + if got := strValue(up.LastError); got != "apply_reboot_failed:reboot_into_slot:-1" { + t.Fatalf("last_error = %q", got) + } +} + +func TestApplyRebootErrorIgnoredWhenContextNoLongerMatches(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + svc, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + + desc := StagedDescriptor{Version: "9.9.9", ImageID: "mcu-dev-9.9.9"} + svc.setStagedImage(desc.ImageID, desc.Version) + svc.transitionTo(StateRebooting, "", desc.Version) + svc.transitionTo(StateReady, "", "") + svc.applyResults <- applyRebootResult{ + desc: desc, + err: errors.New("apply_reboot_failed:reboot_into_slot:-1"), + } + + settle := time.After(150 * time.Millisecond) + for { + select { + case msg := 
<-upSub.Channel(): + fact, _ := msg.Payload.(UpdaterFact) + if fact.State == StateFailed { + t.Fatalf("stale apply result unexpectedly failed updater: %+v", fact) + } + case <-settle: + return + } + } +} + +// ---- updater/main staging path with fakes ---------------------------- + +func TestStageStubVerifierPublishesFailed(t *testing.T) { + // Production stub: any artefact is rejected. State must transition + // to failed with last_error matching the sentinel. + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + _, cancel := runService(t, b, Options{Conn: conn, Verifier: StubVerifier()}) + defer cancel() + + req := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-1", []byte("blob")), false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + + select { + case msg := <-replySub.Channel(): + reply, ok := msg.Payload.(StageReply) + if !ok || reply.OK { + t.Fatalf("stage unexpectedly OK with stub: %+v", reply) + } + if !strings.Contains(reply.Err, "verifier_stub") { + t.Fatalf("stage err = %q, want stub sentinel", reply.Err) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for stage reply") + } + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateFailed }) + if !strings.Contains(strValue(up.LastError), "verifier_stub") { + t.Fatalf("last_error = %q, want stub sentinel", strValue(up.LastError)) + } +} + +func TestStageFakeAcceptWritesStagedDescriptor(t *testing.T) { + // W11: on verifier success staging writes the manifest's + // fields to the metadata writer. A subsequent commit RPC reads + // the descriptor back via the matching reader and transitions + // to rebooting with the same pending_version. 
+ b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + verif := &fakeVerifierAccept{manifest: Manifest{ + Version: "9.9.9", + BuildID: "bx", + ImageID: "ix", + PayloadSHA256: "deadbeef", + PayloadLength: 4, + }} + memMD := NewMemoryMetadata() + _, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: verif, + Applier: &fakeApplier{}, // success path; production default refuses + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancel() + + // Drive updater/main staging to verifier success. + req := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-w11", []byte("blob")), false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + select { + case msg := <-replySub.Channel(): + reply, _ := msg.Payload.(StageReply) + if !reply.OK { + t.Fatalf("stage reply not ok: %+v", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for stage reply") + } + + // Reader sees the staged descriptor + its embedded payload hash. + desc, ok := memMD.StagedDescriptor() + if !ok { + t.Fatal("staged descriptor not persisted") + } + if desc.Version != "9.9.9" || desc.PayloadSHA256 != "deadbeef" || desc.Length != 4 { + t.Fatalf("descriptor wrong: %+v", desc) + } + // WriteStagedDescriptor must not promote the staged hash into the + // running-image hash. Running hash stays "" until SetRunningPayloadSHA is + // called at the next boot. + if got := memMD.PayloadSHA256(); got != "" { + t.Fatalf("running payload_sha256 leaked from staged descriptor: %q", got) + } + + // Commit RPC now succeeds because the reader sees the descriptor + // AND state is staged AND a real Applier is wired. 
+ creq := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) + csub := caller.Request(creq) + defer caller.Unsubscribe(csub) + select { + case msg := <-csub.Channel(): + reply, _ := msg.Payload.(CommitReply) + if !reply.Accepted { + t.Fatalf("commit reply not ok: %+v", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for commit reply") + } + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateRebooting }) + if strValue(up.PendingVersion) != "9.9.9" { + t.Fatalf("pending_version = %q, want 9.9.9", strValue(up.PendingVersion)) + } +} + +func TestStageFailureClearsStaleStagedDescriptor(t *testing.T) { + // A (stage A) -> (prepare for B) -> (stage B fails) flow must not leave + // descriptor A persisted. The next commit should return nothing_staged + // rather than committing stale firmware. + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + // Pre-stage: a real descriptor sitting in metadata from an earlier + // successful flow. + memMD := NewMemoryMetadata() + _ = memMD.WriteStagedDescriptor(StagedDescriptor{Version: "1.0.0", PayloadSHA256: "old"}) + + // Service uses a verifier that always rejects. + verif := &fakeVerifierReject{err: errString("bad_signature")} + _, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: verif, + Applier: &fakeApplier{}, + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancel() + + // Drive updater/main staging to failure. + rreq := caller.NewMessage(TopicStageRPC, testStagePayload("x", []byte("blob")), false) + rsub := caller.Request(rreq) + defer caller.Unsubscribe(rsub) + select { + case <-rsub.Channel(): + case <-time.After(2 * time.Second): + t.Fatal("timeout") + } + + // The stale descriptor must have been cleared. 
+ if _, ok := memMD.StagedDescriptor(); ok { + t.Fatalf("stale staged descriptor survived receiver failure") + } + + // Commit must refuse with nothing_staged rather than commit the + // stale image. + creq := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) + csub := caller.Request(creq) + defer caller.Unsubscribe(csub) + select { + case msg := <-csub.Channel(): + reply, _ := msg.Payload.(Reply) + if reply.OK || reply.Error != ErrNothingStaged { + t.Fatalf("commit reply = %+v, want refusal=nothing_staged", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout") + } +} + +func TestPrepareClearsStaleStagedDescriptor(t *testing.T) { + // A new prepare invalidates any prior persisted stage so a partial- + // failure subsequent transfer can't accidentally commit the + // previously-staged image. + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + memMD := NewMemoryMetadata() + _ = memMD.WriteStagedDescriptor(StagedDescriptor{Version: "1.0.0", PayloadSHA256: "old"}) + + _, cancel := runService(t, b, Options{ + Conn: conn, + Metadata: memMD, + MetadataWrite: memMD, + Applier: &fakeApplier{}, + }) + defer cancel() + + preq := caller.NewMessage(TopicPrepareRPC, PrepareRequest{Target: PrepareTargetMCU}, false) + psub := caller.Request(preq) + defer caller.Unsubscribe(psub) + select { + case msg := <-psub.Channel(): + reply, _ := msg.Payload.(PrepareReply) + if !reply.Ready { + t.Fatalf("prepare reply = %+v", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout") + } + + if _, ok := memMD.StagedDescriptor(); ok { + t.Fatalf("stale staged descriptor survived prepare") + } +} + +func TestStageFakeAcceptPublishesStaged(t *testing.T) { + // Test fake exercises the success path that fabric-security will + // flesh out in production. State -> staged, pending_version mirrors + // the manifest's build version, reply.OK = true. 
+ b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + verif := &fakeVerifierAccept{manifest: Manifest{Version: "9.9.9", BuildID: "bx", ImageID: "ix", PayloadSHA256: strings.Repeat("a", 64), PayloadLength: 4}} + _, cancel := runService(t, b, Options{Conn: conn, Verifier: verif}) + defer cancel() + + req := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-2", []byte("blob")), false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + + select { + case msg := <-replySub.Channel(): + reply, ok := msg.Payload.(StageReply) + if !ok || !reply.OK || reply.Stage != "staged" { + t.Fatalf("stage reply = %+v ok-type=%v", reply, ok) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for stage reply") + } + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateStaged }) + if strValue(up.PendingVersion) != "9.9.9" { + t.Fatalf("pending_version = %q, want 9.9.9", strValue(up.PendingVersion)) + } +} + +func TestStageFakeRejectPublishesFailed(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + verif := &fakeVerifierReject{err: errString("manifest_check_failed")} + _, cancel := runService(t, b, Options{Conn: conn, Verifier: verif}) + defer cancel() + + req := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-3", []byte("blob")), false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + + select { + case msg := <-replySub.Channel(): + reply, ok := msg.Payload.(StageReply) + if !ok || reply.OK { + t.Fatalf("stage unexpectedly OK: %+v", reply) + } + if reply.Err != "manifest_check_failed" { + t.Fatalf("stage err = %q, want manifest_check_failed", reply.Err) + } + case <-time.After(2 * time.Second): + 
t.Fatal("timeout waiting for stage reply") + } + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateFailed }) + if strValue(up.LastError) != "manifest_check_failed" { + t.Fatalf("last_error = %q, want manifest_check_failed", strValue(up.LastError)) + } +} + +func TestRepublishOnLinkReadyEdge(t *testing.T) { + // W10 contract: the updater republishes its retained state/self/* + // surface on every !Ready -> Ready transition observed on + // state/fabric/link/. Verifies the edge is detected without + // double-firing on subsequent retains that keep Ready=true. + b := newTestBus() + conn := b.NewConnection("updater") + observer := b.NewConnection("observer") + swSub := observer.Subscribe(TopicSoftwareFact) + defer observer.Unsubscribe(swSub) + + _, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + + // Drain the initial software fact emitted on Run start. + _ = waitForFact[SoftwareFact](t, swSub, nil) + + // Publish a link-state retain with Ready=false first; should not + // trigger a republish. + publisher := b.NewConnection("test-fabric") + publisher.Publish(publisher.NewMessage( + bus.T("state", "fabric", "link", "mcu-uart0"), + map[string]any{"ready": false, "established": false}, + true, + )) + // Brief wait then drop everything that's already in the channel. + time.Sleep(50 * time.Millisecond) + for len(swSub.Channel()) > 0 { + <-swSub.Channel() + } + + // Now flip Ready to true: the !Ready -> Ready edge MUST trigger a + // software-fact republish. + publisher.Publish(publisher.NewMessage( + bus.T("state", "fabric", "link", "mcu-uart0"), + map[string]any{"ready": true, "established": true, "peer_sid": "cm5-x"}, + true, + )) + _ = waitForFact[SoftwareFact](t, swSub, nil) + + // Subsequent Ready=true retain (no edge) should NOT trigger another + // republish. We assert by checking the channel is empty after a + // short settle window. 
+ publisher.Publish(publisher.NewMessage( + bus.T("state", "fabric", "link", "mcu-uart0"), + map[string]any{"ready": true, "established": true, "peer_sid": "cm5-x", "last_rx_ms": int64(123)}, + true, + )) + settled := time.After(150 * time.Millisecond) + for { + select { + case <-swSub.Channel(): + t.Fatal("unexpected republish on subsequent Ready=true retain") + case <-settled: + return + } + } +} + +// ---- jsonDecode robustness ------------------------------------------ + +func TestJSONDecodeAcceptsTypedAndRaw(t *testing.T) { + t1, ok := jsonDecode[PrepareRequest](PrepareRequest{Target: "x"}) + if !ok || t1.Target != "x" { + t.Fatalf("typed: %v %v", ok, t1) + } + raw := json.RawMessage(`{"target":"y"}`) + t2, ok := jsonDecode[PrepareRequest](raw) + if !ok || t2.Target != "y" { + t.Fatalf("raw: %v %v", ok, t2) + } + t3, ok := jsonDecode[PrepareRequest](nil) + if !ok || t3.Target != "" { + t.Fatalf("nil: %v %v", ok, t3) + } + t4, ok := jsonDecode[PrepareRequest]([]byte(`{"target":"z"}`)) + if !ok || t4.Target != "z" { + t.Fatalf("bytes: %v %v", ok, t4) + } +} + +// ---- memorySink behaviour ------------------------------------------- + +func TestMemorySinkAbortClearsBuffer(t *testing.T) { + s := &memorySink{} + _, _ = s.Write([]byte("hello")) + _ = s.Abort() + if got := s.buf.Len(); got != 0 { + t.Fatalf("after abort buf len = %d, want 0", got) + } +} + +func TestMemorySinkCommitClosesWrites(t *testing.T) { + s := &memorySink{} + _, _ = s.Write([]byte("hello")) + if err := s.Commit(); err != nil { + t.Fatalf("commit: %v", err) + } + _, err := s.Write([]byte("more")) + if err != io.ErrClosedPipe { + t.Fatalf("write after commit err = %v, want io.ErrClosedPipe", err) + } +} + +// errString is a tiny error type for tests that don't want to import +// the standard errors package twice. +type errString string + +func (e errString) Error() string { return string(e) } + +// Compile-time assert that bytes.NewReader satisfies the verifier API. 
+var _ io.Reader = bytes.NewReader(nil)
diff --git a/services/updater/verifier.go b/services/updater/verifier.go
new file mode 100644
index 0000000..4cfac9f
--- /dev/null
+++ b/services/updater/verifier.go
@@ -0,0 +1,113 @@
+package updater
+
+import (
+	"errors"
+	"io"
+)
+
+// Manifest is the small subset of image metadata that updater staging needs
+// after verification succeeds. fabric-update fills it from the bring-up
+// passthrough verifier; fabric-security fills the same struct from
+// pico2-a-b/imagev1.
+type Manifest struct {
+	Version       string
+	BuildID       string
+	ImageID       string
+	PayloadSHA256 string
+	PayloadLength uint32
+}
+
+// SlotSink is what the verifier writes verified payload bytes into.
+// In production this lands in the inactive abupdate slot; in tests it
+// can be backed by a bytes.Buffer or similar. Keep the interface tiny.
+type SlotSink interface {
+	io.Writer
+	// Commit finalises the staged write. Called after the verifier has
+	// finished streaming and confirms the payload SHA-256 matches the
+	// manifest. Returns only an error; descriptor fields come from Verify.
+	Commit() error
+	// Abort rolls back any partial write so the next prepare/commit
+	// starts from a clean slot.
+	Abort() error
+}
+
+// Verifier is updater/main staging's verification hook. Tests may pass fakes,
+// production wiring supplies an explicit verifier, and nil Options.Verifier
+// falls back to the rejecting StubVerifier.
+type Verifier interface {
+	// Verify reads the artefact bytes from r, validates the signed
+	// envelope (header + manifest + signature), and on success streams
+	// the verified payload into sink. Returns the trusted manifest the
+	// staging path propagates to the staged descriptor and software fact.
+	//
+	// On failure: sink.Abort is called by the verifier itself before
+	// returning so staging doesn't have to special-case it.
+	Verify(r io.Reader, sink SlotSink) (Manifest, error)
+}
+
+// ErrUnsignedNotSupported is the sentinel returned by the production
+// stub on this branch. The wire `last_error` value is set to its
+// Error() string so Lua-side test harnesses can grep for it.
+var ErrUnsignedNotSupported = errors.New("verifier_stub: unsigned images not supported on this build")
+
+// Applier is the slot-switch + reboot hook for the commit RPC. Split in two so
+// handleCommit can publish the rebooting retain and reply accepted before the
+// reboot fires; an implementation that reboots inside Apply would otherwise
+// skip both the wire reply and the state/self/updater retain.
+//
+// New() still defaults to RefusingApplier so tests and host builds never claim
+// apply success without an explicit production applier. Reactor wiring supplies
+// the abupdate-backed implementation that triggers REBOOT_TYPE_FLASH_UPDATE into
+// the staged slot.
+type Applier interface {
+	// CanApply validates that the apply path is wired and the
+	// descriptor is acceptable. Quick, no side effects beyond minimal
+	// validation. Errors here surface in the commit reply as
+	// {ok:false, error:<error string>}; the canonical committing/rebooting
+	// retains are NOT published.
+	CanApply(d StagedDescriptor) error
+
+	// ArmReboot schedules the slot-switch + reboot. Called only AFTER
+	// handleCommit has published state=rebooting and replied accepted to the
+	// caller. Real implementations may reboot inside this call (it
+	// won't return); the spec contract is that callers must do their
+	// pre-reboot work first. If it returns an error, the updater service
+	// publishes that failure from its own Run loop.
+	ArmReboot(d StagedDescriptor) error
+}
+
+// refusingApplier is the production default. CanApply always returns
+// ErrApplyUnavailable so commit refuses with
+// `error: "apply_unavailable"` and never reaches ArmReboot.
+type refusingApplier struct{}
+
+// RefusingApplier returns the safe-default Applier for this branch.
+func RefusingApplier() Applier { return refusingApplier{} }
+
+// CanApply rejects every descriptor with ErrApplyUnavailable.
+func (refusingApplier) CanApply(_ StagedDescriptor) error {
+	return errors.New(ErrApplyUnavailable)
+}
+
+// ArmReboot fails closed. CanApply rejects every descriptor, so the commit
+// handler should never get here; if a caller skips CanApply anyway, report
+// ErrApplyUnavailable rather than a nil that would claim a reboot was armed.
+func (refusingApplier) ArmReboot(_ StagedDescriptor) error {
+	return errors.New(ErrApplyUnavailable)
+}
+
+// stubVerifier is the safe default when no verifier is wired. It always rejects
+// so no unsigned firmware can stage accidentally.
+type stubVerifier struct{}
+
+// StubVerifier returns the rejecting verifier used as New's default.
+func StubVerifier() Verifier { return stubVerifier{} }
+
+// Verify never reads r: it aborts sink (when non-nil) and returns
+// ErrUnsignedNotSupported.
+func (stubVerifier) Verify(_ io.Reader, sink SlotSink) (Manifest, error) {
+	if sink != nil {
+		_ = sink.Abort() // best effort; the rejection below is what gets reported
+	}
+	return Manifest{}, ErrUnsignedNotSupported
+}
diff --git a/services/updater/verifier_passthrough.go b/services/updater/verifier_passthrough.go
new file mode 100644
index 0000000..e1e6f91
--- /dev/null
+++ b/services/updater/verifier_passthrough.go
@@ -0,0 +1,62 @@
+package updater
+
+import (
+	"crypto/sha256"
+	"encoding/hex"
+	"errors"
+	"io"
+)
+
+// passthroughVerifier accepts any artefact, streams its bytes straight
+// into sink while computing SHA-256, and returns a synthetic manifest
+// with the artefact length + computed hash. Intended for the bringup
+// stack on this branch where the signed-image v1 envelope (header +
+// canonical manifest + Ed25519 signature) is not yet implemented.
+//
+// Replace with a real verifier when fabric-security lands; this exists
+// so fw-update-e2e can drive the staging → applier → reboot path
+// end-to-end without the signed-image scaffolding in place.
+type passthroughVerifier struct {
+	identity Identity
+}
+
+// PassthroughVerifier returns a Verifier that accepts any artefact and
+// fills the manifest with identity (caller-supplied), the artefact
+// length, and the SHA-256 of the streamed payload. Reboot-time apply
+// is gated by the Applier; with the default RefusingApplier the commit
+// RPC is still refused with apply_unavailable.
+func PassthroughVerifier(identity Identity) Verifier {
+	return passthroughVerifier{identity: identity}
+}
+
+// Verify copies r into sink while hashing it and returns a manifest built
+// from the configured identity, the byte count and the hex SHA-256. It
+// returns an error for a nil sink; on a copy error, or when the artefact is
+// too long for Manifest.PayloadLength, it aborts sink before returning.
+func (v passthroughVerifier) Verify(r io.Reader, sink SlotSink) (Manifest, error) {
+	if sink == nil {
+		return Manifest{}, errors.New("passthrough_verifier: nil sink")
+	}
+	hasher := sha256.New()
+	mw := io.MultiWriter(sink, hasher)
+	n, err := io.Copy(mw, r)
+	if err != nil {
+		_ = sink.Abort() // best effort; the copy error is what gets reported
+		return Manifest{}, err
+	}
+	// Manifest.PayloadLength is a uint32; uint32(n) would silently wrap for a
+	// longer artefact and record a length that does not match the streamed
+	// bytes, so refuse instead of truncating.
+	const maxPayloadLength = int64(^uint32(0))
+	if n > maxPayloadLength {
+		_ = sink.Abort() // best effort; the size error is what gets reported
+		return Manifest{}, errors.New("passthrough_verifier: payload too large")
+	}
+	return Manifest{
+		Version:       v.identity.Version,
+		BuildID:       v.identity.Build,
+		ImageID:       v.identity.ImageID,
+		PayloadSHA256: hex.EncodeToString(hasher.Sum(nil)),
+		PayloadLength: uint32(n),
+	}, nil
+}
From 230c29646e826ad256aa00fc7278042d30d7db6c Mon Sep 17 00:00:00 2001
From: cpunt
Date: Wed, 27 May 2026 16:05:38 +0000
Subject: [PATCH 2/6] fabric: add updater transfer and wire contract
---
 services/fabric/config.go | 104 +--
 services/fabric/fabric.go | 12 +-
 services/fabric/fabric_test.go | 964 +++++++-----------------
 services/fabric/protocol.go | 102 ++-
 services/fabric/remap.go | 67 +-
 services/fabric/session.go | 307 +++++---
 services/fabric/trace.go | 14 +
 services/fabric/transfer.go | 360 ++++++---
 services/fabric/transfer_sink_buffer.go | 80 ++
 services/fabric/transfer_sink_rp2350.go | 65 +-
 services/fabric/transfer_sink_stub.go | 10 +-
 services/fabric/transfer_test.go | 598 +++++++++++----
 services/fabric/transport_limits.go | 4 +-
 services/fabric/writer.go | 43 +-
 14 files changed, 1483 insertions(+), 1247 deletions(-)
 create mode 100644 services/fabric/transfer_sink_buffer.go
diff --git a/services/fabric/config.go
b/services/fabric/config.go index 28b7cb1..ac8dff6 100644 --- a/services/fabric/config.go +++ b/services/fabric/config.go @@ -1,88 +1,15 @@ package fabric -import ( - "encoding/json" - - "devicecode-go/types" -) - -// decodeHALConfig extracts a HALConfig from an arbitrary payload, -// normalizing Lua empty-table encoding ({} → []) for known slice fields. -func decodeHALConfig(payload any) (types.HALConfig, string) { - switch v := payload.(type) { - case types.HALConfig: - return v, "" - case *types.HALConfig: - if v == nil { - return types.HALConfig{}, "nil_hal_config" - } - return *v, "" - case json.RawMessage: - return decodeHALConfigBytes(v) - case []byte: - return decodeHALConfigBytes(v) - default: - b, err := json.Marshal(v) - if err != nil { - return types.HALConfig{}, "payload_marshal_failed: " + err.Error() - } - return decodeHALConfigBytes(b) - } -} - -func decodeHALConfigBytes(b []byte) (types.HALConfig, string) { - var probe map[string]json.RawMessage - if err := json.Unmarshal(b, &probe); err != nil { - return types.HALConfig{}, "json_unmarshal_failed: " + err.Error() + "; raw=" + truncateRawJSON(b) - } - if _, ok := probe["devices"]; !ok { - return types.HALConfig{}, "missing_devices_field; raw=" + truncateRawJSON(b) - } - - // Lua encodes empty tables as {} (object) not [] (array). - // Normalize known slice fields so Go unmarshal accepts them. 
- for _, key := range []string{"devices", "pollers"} { - if raw, ok := probe[key]; ok && len(raw) == 2 && raw[0] == '{' && raw[1] == '}' { - probe[key] = json.RawMessage("[]") - } - } - fixed, err := json.Marshal(probe) - if err != nil { - return types.HALConfig{}, "normalize_failed: " + err.Error() - } - - var out types.HALConfig - if err := json.Unmarshal(fixed, &out); err != nil { - return types.HALConfig{}, "hal_config_unmarshal_failed: " + err.Error() + "; raw=" + truncateRawJSON(fixed) - } - return out, "" -} - -func decodeHALState(payload any) (types.HALState, bool) { - switch v := payload.(type) { - case types.HALState: - return v, true - case *types.HALState: - if v == nil { - return types.HALState{}, false - } - return *v, true - case json.RawMessage: - var out types.HALState - return out, json.Unmarshal(v, &out) == nil - case []byte: - var out types.HALState - return out, json.Unmarshal(v, &out) == nil - default: - b, err := json.Marshal(v) - if err != nil { - return types.HALState{}, false - } - var out types.HALState - return out, json.Unmarshal(b, &out) == nil - } -} - +import "encoding/json" + +// decodePayload normalises whatever shape the bus delivered into a +// reasonable Go value for the reply path. The wire delivers +// json.RawMessage; in-process callers may pass already-typed values. +// Used by session.onReply when forwarding RPC replies onto the +// originating Request's reply path. +// +// This file intentionally contains only reply-payload decoding; legacy +// config/device and rpc/hal/dump glue is no longer part of the MCU contract. func decodePayload(payload any) any { switch v := payload.(type) { case nil: @@ -111,14 +38,3 @@ func decodePayload(payload any) any { return v } } - -func truncateRawJSON(b []byte) string { - if len(b) == 0 { - return "" - } - const max = 160 - if len(b) <= max { - return string(b) - } - return string(b[:max]) + "..." 
-} diff --git a/services/fabric/fabric.go b/services/fabric/fabric.go index 0a2ab70..3a62b7e 100644 --- a/services/fabric/fabric.go +++ b/services/fabric/fabric.go @@ -6,6 +6,7 @@ import ( "time" "devicecode-go/bus" + "devicecode-go/services/updater" "devicecode-go/x/strconvx" ) @@ -16,12 +17,11 @@ type Transport interface { Close() error } -const protoVersion = 1 -const defaultLinkID = "mcu0" +const defaultLinkID = "mcu-uart0" // LinkConfig carries the fabric link parameters that the CM5 publishes // alongside its own session/transfer-mgr instances. Mirrors the relevant -// keys in `bigbox-v1-cm-2.json` `service.fabric.links.` for the +// keys in `bigbox-v1-cm-2.json` `fabric.data.links.` for the // MCU-facing link. Missing fields fall back to release defaults via // applyDefaults so callers can pass `LinkConfig{}` to mean "release". type LinkConfig struct { @@ -95,7 +95,11 @@ func (c *LinkConfig) applyDefaults() { var nextSessionID atomic.Uint64 func newLocalSID() string { - return "mcu-sid-" + strconvx.Utoa64(nextSessionID.Add(1)) + bootID := updater.BootID() + if bootID == "" { + bootID = updater.GenerateBootID() + } + return "mcu-sid-" + bootID + "-" + strconvx.Utoa64(nextSessionID.Add(1)) } // Run starts the fabric session. 
Blocks until ctx is cancelled or the diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 361495c..0aa15a1 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -3,6 +3,7 @@ package fabric import ( "bytes" "context" + "encoding/base64" "encoding/json" "errors" "io" @@ -11,7 +12,6 @@ import ( "time" "devicecode-go/bus" - "devicecode-go/types" "devicecode-go/x/shmring" ) @@ -67,10 +67,10 @@ const testCM5SID = "s1" func bringUp(t *testing.T, cm5 Transport) protoHelloAck { t.Helper() sendMsg(t, cm5, protoHello{ - Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: testCM5SID, Proto: protoVersion, + Type: "hello", Proto: protocolName, Node: "bigbox-cm5", SID: testCM5SID, }) ack := readMsg[protoHelloAck](t, cm5) - if !ack.OK || ack.Node != "mcu-1" || ack.SID == "" || ack.Proto != protoVersion { + if ack.Node != "mcu" || ack.SID == "" || ack.Proto != protocolName { t.Fatalf("bad hello_ack: %+v", ack) } time.Sleep(50 * time.Millisecond) @@ -89,7 +89,7 @@ func unlockExports(t *testing.T, cm5 Transport) { // ---- codec ---- func TestCodecRoundTrip(t *testing.T) { - orig := protoHello{Type: "hello", Node: "mcu-1", Peer: "cm5-local", SID: "abc", Proto: protoVersion} + orig := protoHello{Type: "hello", Proto: protocolName, Node: "bigbox-cm5", SID: "abc"} data := marshal(orig) if !bytes.HasSuffix(data, []byte("\n")) { t.Error("marshal should end with newline") @@ -103,7 +103,7 @@ func TestCodecRoundTrip(t *testing.T) { } var dec protoHello json.Unmarshal(jsonPart, &dec) - if dec != orig { + if dec.Type != orig.Type || dec.Proto != orig.Proto || dec.Node != orig.Node || dec.SID != orig.SID { t.Errorf("round-trip: %+v vs %+v", dec, orig) } } @@ -211,6 +211,20 @@ func TestOversizeLineRecovery(t *testing.T) { } } +func TestReleaseTransferChunkFitsLineLimit(t *testing.T) { + raw := bytes.Repeat([]byte{'x'}, int(DefaultLinkConfig().ChunkSize)) + line := marshal(protoXferChunk{ + Type: msgXferChunk, + XferID: 
"xfer-line-limit", + Offset: 0, + Data: base64.RawURLEncoding.EncodeToString(raw), + ChunkDigest: "00000000", + }) + if got := len(line) - 1; got > maxLineLen { + t.Fatalf("%d-byte raw transfer chunk frame len = %d, max %d", len(raw), got, maxLineLen) + } +} + // ---- shmring transport ---- func TestShmringTransportRoundTrip(t *testing.T) { @@ -346,13 +360,13 @@ func TestHandshake(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) sendMsg(t, cm5, protoHello{ - Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s1", Proto: protoVersion, + Type: "hello", Proto: protocolName, Node: "bigbox-cm5", SID: "s1", }) ack := readMsg[protoHelloAck](t, cm5) - if !ack.OK || ack.Node != "mcu-1" || ack.SID == "" || ack.Proto != protoVersion { + if ack.Node != "mcu" || ack.SID == "" || ack.Proto != protocolName { t.Errorf("bad ack: %+v", ack) } time.Sleep(50 * time.Millisecond) @@ -368,13 +382,13 @@ func TestSessionReset(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) bringUp(t, cm5) - sendMsg(t, cm5, protoHello{Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Proto: protocolName, Node: "bigbox-cm5", SID: "s2"}) ack := readMsg[protoHelloAck](t, cm5) - if !ack.OK || ack.SID == "" || ack.Proto != protoVersion { - t.Error("hello_ack.OK = false") + if ack.SID == "" || ack.Proto != protocolName { + t.Errorf("bad hello_ack: %+v", ack) } sendMsg(t, cm5, protoPing{Type: "ping", TS: 55, SID: "s2"}) pong := readMsg[protoPong](t, cm5) @@ -383,14 +397,14 @@ func 
TestSessionReset(t *testing.T) { } } -func TestRejectsWrongPeer(t *testing.T) { +func TestRejectsWrongNode(t *testing.T) { mcu, cm5 := pipePair() b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) - sendMsg(t, cm5, protoHello{Type: "hello", Node: "cm5-local", Peer: "mcu-999", SID: "s1", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Proto: protocolName, Node: "cm5-wrong", SID: "s1"}) gotLine := make(chan readResult, 1) go func() { line, err := cm5.ReadLine() @@ -398,10 +412,10 @@ func TestRejectsWrongPeer(t *testing.T) { }() select { case <-gotLine: - t.Fatal("got response to wrong-peer hello") + t.Fatal("got response to wrong-node hello") case <-time.After(200 * time.Millisecond): } - sendMsg(t, cm5, protoHello{Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Proto: protocolName, Node: "bigbox-cm5", SID: "s2"}) select { case res := <-gotLine: if res.err != nil { @@ -411,8 +425,8 @@ func TestRejectsWrongPeer(t *testing.T) { if err := json.Unmarshal(res.line, &ack); err != nil { t.Fatalf("expected hello_ack: %v", err) } - if !ack.OK { - t.Fatal("hello_ack.OK = false") + if ack.Proto != protocolName { + t.Fatalf("bad hello_ack: %+v", ack) } case <-time.After(2 * time.Second): t.Fatal("no hello_ack for correct peer") @@ -424,7 +438,7 @@ func TestRejectsMissingNodeWhenPeerPinned(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) gotLine := make(chan readResult, 1) go func() { @@ -432,14 +446,14 @@ func TestRejectsMissingNodeWhenPeerPinned(t 
*testing.T) { gotLine <- readResult{line: line, err: err} }() - sendMsg(t, cm5, protoHello{Type: "hello", Peer: "mcu-1", SID: "s1", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Proto: protocolName, SID: "s1"}) select { case <-gotLine: t.Fatal("got response to hello without node") case <-time.After(200 * time.Millisecond): } - sendMsg(t, cm5, protoHello{Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Proto: protocolName, Node: "bigbox-cm5", SID: "s2"}) select { case res := <-gotLine: if res.err != nil { @@ -449,8 +463,8 @@ func TestRejectsMissingNodeWhenPeerPinned(t *testing.T) { if err := json.Unmarshal(res.line, &ack); err != nil { t.Fatalf("expected hello_ack: %v", err) } - if !ack.OK { - t.Fatal("hello_ack.OK = false") + if ack.Proto != protocolName { + t.Fatalf("bad hello_ack: %+v", ack) } case <-time.After(2 * time.Second): t.Fatal("no hello_ack for correct peer") @@ -462,7 +476,7 @@ func TestPingPong(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) ack := bringUp(t, cm5) sendMsg(t, cm5, protoPing{Type: "ping", TS: 42, SID: "s1"}) pong := readMsg[protoPong](t, cm5) @@ -471,6 +485,40 @@ func TestPingPong(t *testing.T) { } } +func TestEchoedPingIgnored(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) + ack := bringUp(t, cm5) + + sendMsg(t, cm5, protoPing{Type: "ping", TS: 41, SID: ack.SID}) + sendMsg(t, cm5, protoPing{Type: "ping", TS: 42, SID: testCM5SID}) + + pong := readMsg[protoPong](t, cm5) + if pong.TS != 42 || pong.SID != ack.SID { + t.Errorf("bad pong after echoed 
ping: %+v ack=%+v", pong, ack) + } +} + +func TestEchoedTransferControlIgnored(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) + bringUp(t, cm5) + + sendMsg(t, cm5, protoXferNeed{Type: msgXferNeed, XferID: "echoed", Next: 0}) + sendMsg(t, cm5, protoPing{Type: "ping", TS: 42, SID: testCM5SID}) + + pong := readMsg[protoPong](t, cm5) + if pong.TS != 42 { + t.Errorf("bad pong after echoed transfer control: %+v", pong) + } +} + func TestMCUNeverInitiates(t *testing.T) { // Pre-handshake the MCU is silent; tickPing only fires once the link // is up. Active outbound pings post-handshake are covered by @@ -479,7 +527,7 @@ func TestMCUNeverInitiates(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) gotLine := make(chan struct{}) go func() { cm5.ReadLine(); close(gotLine) }() select { @@ -498,7 +546,7 @@ func TestSessionPingsUnconditionally(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", LinkConfig{PingInterval: 150 * time.Millisecond}) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", LinkConfig{PingInterval: 150 * time.Millisecond}) bringUp(t, cm5) for i := 0; i < 3; i++ { @@ -516,12 +564,12 @@ func TestReadyHeldUntilExportHoldoff(t *testing.T) { mcu, cm5 := pipePair() b := newBus() observer := b.NewConnection("observer") - sub := observer.Subscribe(bus.T("state", "fabric", "link", "mcu0")) + sub := observer.Subscribe(bus.T("state", "fabric", "link", "mcu-uart0")) defer observer.Unsubscribe(sub) ctx, cancel := 
context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) bringUp(t, cm5) var sawNotReady, sawReady bool @@ -554,23 +602,35 @@ func TestSessionResetUnretainsImports(t *testing.T) { // in promoteLink/teardownImportedRetained: each tracked local topic // gets a nil-payload retained publish that clears the bus's retain // store, so consumers don't see stale CM5-session data. + // importPublishRules is empty in the production contract, so this test + // installs a scoped temp rule. The mechanism under test is the generic + // retain-tracking + session-reset teardown chain, not the specific topic. + prev := importPublishRules + importPublishRules = append([]importRule{}, prev...) + importPublishRules = append(importPublishRules, importRule{ + wire: []string{"test", "wire", "config"}, + local: []string{"test", "local", "config"}, + }) + t.Cleanup(func() { importPublishRules = prev }) + cfgTopic := bus.T("test", "local", "config") + mcu, cm5 := pipePair() b := newBus() observer := b.NewConnection("observer") - cfgSub := observer.Subscribe(tConfigHAL) + cfgSub := observer.Subscribe(cfgTopic) defer observer.Unsubscribe(cfgSub) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) bringUp(t, cm5) - // Push a config via the import pub path so config/hal becomes a - // tracked imported retain. + // Push a payload via the temp import path so the local topic + // becomes a tracked imported retain. 
sendMsg(t, cm5, protoPub{ Type: msgPub, - Topic: []string{"config", "device"}, - Payload: json.RawMessage(`{"devices":[]}`), + Topic: []string{"test", "wire", "config"}, + Payload: json.RawMessage(`{"hello":"world"}`), Retain: true, }) @@ -584,7 +644,7 @@ func TestSessionResetUnretainsImports(t *testing.T) { gotInitial = true } case <-deadline: - t.Fatal("timeout waiting for initial config/hal retain") + t.Fatal("timeout waiting for initial imported retain") } } @@ -595,13 +655,13 @@ func TestSessionResetUnretainsImports(t *testing.T) { // not run. go func() { _ = readMsg[protoHelloAck](t, cm5) }() sendMsg(t, cm5, protoHello{ - Type: msgHello, - Node: "cm5-local", - Peer: "mcu-1", - SID: "cm5-sid-new", + Type: msgHello, + Proto: protocolName, + Node: "bigbox-cm5", + SID: "cm5-sid-new", }) - // Expect a nil-payload retained publish on config/hal. + // Expect a nil-payload retained publish on the imported topic. deadline = time.After(2 * time.Second) for { select { @@ -637,7 +697,7 @@ func TestSessionResetUnretainsImportsAfterTransientPub(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) bringUp(t, cm5) // 1) Retained import — establishes the bus retain + tracking entry. @@ -683,10 +743,10 @@ func TestSessionResetUnretainsImportsAfterTransientPub(t *testing.T) { // 3) Session reset → expect the original retain to be cleared. 
go func() { _ = readMsg[protoHelloAck](t, cm5) }() sendMsg(t, cm5, protoHello{ - Type: msgHello, - Node: "cm5-local", - Peer: "mcu-1", - SID: "cm5-sid-new", + Type: msgHello, + Proto: protocolName, + Node: "bigbox-cm5", + SID: "cm5-sid-new", }) deadline = time.After(2 * time.Second) @@ -760,7 +820,7 @@ func TestInboundCallBusyAtCapacity(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", LinkConfig{MaxInboundHelpers: 1}) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", LinkConfig{MaxInboundHelpers: 1}) bringUp(t, cm5) // First call holds the only helper slot. The bus has no handler, so @@ -795,7 +855,7 @@ func TestUnknownTypeIgnored(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) bringUp(t, cm5) cm5.WriteLine([]byte(`{"type":"future_msg"}`)) sendMsg(t, cm5, protoPing{Type: "ping", TS: 1}) @@ -810,7 +870,7 @@ func TestMalformedJSONIgnored(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) bringUp(t, cm5) cm5.WriteLine([]byte("not json")) sendMsg(t, cm5, protoPing{Type: "ping", TS: 2}) @@ -826,7 +886,7 @@ func TestCancelClosesCleanly(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) done := make(chan struct{}) go func() { - Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) close(done) }() bringUp(t, cm5) @@ -842,12 +902,12 @@ func 
TestLinkStatePublishedOnHandshake(t *testing.T) { mcu, cm5 := pipePair() b := newBus() observer := b.NewConnection("observer") - sub := observer.Subscribe(bus.T("state", "fabric", "link", "mcu0")) + sub := observer.Subscribe(bus.T("state", "fabric", "link", "mcu-uart0")) defer observer.Unsubscribe(sub) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) ack := bringUp(t, cm5) @@ -867,14 +927,14 @@ func TestLinkStatePublishedOnHandshake(t *testing.T) { sawOpening = true } if payload.Status == "ready" { - if payload.LinkID != "mcu0" { - t.Fatalf("link_id = %q, want mcu0", payload.LinkID) + if payload.LinkID != "mcu-uart0" { + t.Fatalf("link_id = %q, want mcu-uart0", payload.LinkID) } if !payload.Ready || !payload.Established { t.Fatalf("expected ready/established link state, got %+v", payload) } - if payload.PeerID != "cm5-local" { - t.Fatalf("peer_id = %q, want cm5-local", payload.PeerID) + if payload.PeerID != "bigbox-cm5" { + t.Fatalf("peer_id = %q, want bigbox-cm5", payload.PeerID) } if payload.LocalSID != ack.SID { t.Fatalf("local_sid = %q, want %q", payload.LocalSID, ack.SID) @@ -907,28 +967,30 @@ func topicString(t bus.Topic) string { } func TestImportPublishTopic(t *testing.T) { - for _, tc := range []struct { - wire []string - want string - }{ - {[]string{"config", "device"}, "config/hal"}, - {[]string{"config", "other"}, ""}, - {[]string{"unknown", "x"}, ""}, - {nil, ""}, + // importPublishRules is empty. Anything queried returns nil. 
+ for _, tc := range [][]string{ + {"config", "device"}, // legacy gone + {"config", "other"}, + {"unknown", "x"}, + nil, } { - got := importPublishTopic(tc.wire) - if gotStr := topicString(got); gotStr != tc.want { - t.Errorf("importPublishTopic(%v) = %q, want %q", tc.wire, gotStr, tc.want) + if got := importPublishTopic(tc); got != nil { + t.Errorf("importPublishTopic(%v) = %v, want nil", tc, got) } } } func TestImportCallTopic(t *testing.T) { + // The current Lua migration wire surface uses cap/self/updater/main/rpc/*. for _, tc := range []struct { wire []string want string }{ - // rpc/hal/dump is handled directly by onCall, not via import rules. + {[]string{"cap", "self", "updater", "main", "rpc", "prepare-update"}, "rpc/updater/prepare"}, + {[]string{"cap", "self", "updater", "main", "rpc", "commit-update"}, "rpc/updater/commit"}, + {[]string{"cmd", "self", "updater", "prepare"}, ""}, + {[]string{"cmd", "self", "updater", "commit"}, ""}, + {[]string{"rpc", "hal", "dump"}, ""}, {[]string{"rpc", "hal", "other"}, ""}, {[]string{"config", "device"}, ""}, {nil, ""}, @@ -941,14 +1003,16 @@ func TestImportCallTopic(t *testing.T) { } func TestExportTopic(t *testing.T) { + // The current wire surface exports state/self/* and event/self/* only. 
for _, tc := range []struct { bus bus.Topic want []string }{ - {bus.T("hal", "cap", "env", "temperature", "core", "value"), []string{"state", "env", "temperature", "core", "value"}}, - {bus.T("hal", "cap", "power", "battery", "internal", "value"), []string{"state", "power", "battery", "internal", "value"}}, - {bus.T("hal", "state"), []string{"state", "hal"}}, - {bus.T("hal", "cap", "gpio", "fan", "value"), nil}, + {bus.T("state", "self", "software"), []string{"state", "self", "software"}}, + {bus.T("state", "self", "power", "battery"), []string{"state", "self", "power", "battery"}}, + {bus.T("event", "self", "power", "charger", "alert"), []string{"event", "self", "power", "charger", "alert"}}, + {bus.T("hal", "cap", "env", "temperature", "core", "value"), nil}, // legacy gone + {bus.T("hal", "state"), nil}, // legacy gone {bus.T("other", "topic"), nil}, } { got := exportTopic(tc.bus) @@ -956,41 +1020,23 @@ func TestExportTopic(t *testing.T) { if got != nil { t.Errorf("exportTopic(%v) = %v, want nil", tc.bus, got) } - } else { - if !slicesEqual(got, tc.want) { - t.Errorf("exportTopic(%v) = %v, want %v", tc.bus, got, tc.want) - } + } else if !slicesEqual(got, tc.want) { + t.Errorf("exportTopic(%v) = %v, want %v", tc.bus, got, tc.want) } } } func TestExportCallTopic(t *testing.T) { - for _, tc := range []struct { - bus bus.Topic - want []string - }{ - {bus.T("fabric", "out", "rpc", "hal", "dump"), []string{"rpc", "hal", "dump"}}, - {bus.T("fabric", "out", "rpc", "hal"), nil}, - {bus.T("other", "topic"), nil}, - } { - got := exportCallTopic(tc.bus) - if tc.want == nil { - if got != nil { - t.Errorf("exportCallTopic(%v) = %v, want nil", tc.bus, got) - } - } else if !slicesEqual(got, tc.want) { - t.Errorf("exportCallTopic(%v) = %v, want %v", tc.bus, got, tc.want) - } + // exportCallRules is empty; the MCU does not originate outbound RPC calls. 
+ if got := exportCallTopic(bus.T("fabric", "out", "rpc", "hal", "dump")); got != nil { + t.Errorf("exportCallTopic(legacy dump path) = %v, want nil", got) } } func TestExportCallPatterns(t *testing.T) { patterns := exportCallPatterns() - if len(patterns) != 1 { - t.Fatalf("len(exportCallPatterns()) = %d, want 1", len(patterns)) - } - if got := topicString(patterns[0]); got != "fabric/out/rpc/hal/dump" { - t.Fatalf("exportCallPatterns()[0] = %q, want fabric/out/rpc/hal/dump", got) + if len(patterns) != 0 { + t.Fatalf("len(exportCallPatterns()) = %d, want 0", len(patterns)) } } @@ -1006,107 +1052,8 @@ func slicesEqual(a, b []string) bool { return true } -// ---- pub import ---- - -func TestPubImport(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - conn := b.NewConnection("fabric") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, conn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - - reader := b.NewConnection("test") - sub := reader.Subscribe(bus.T("config", "hal")) - - sendMsg(t, cm5, protoPub{ - Type: "pub", - Topic: []string{"config", "device"}, - Payload: json.RawMessage(`{"devices":[],"pollers":[]}`), - Retain: true, - }) - - select { - case m := <-sub.Channel(): - if m == nil { - t.Fatal("nil message") - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for imported config on config/hal") - } -} - // ---- pub export ---- -func TestPubExport(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - publishConn := b.NewConnection("hal") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - unlockExports(t, cm5) - - publishConn.Publish(publishConn.NewMessage( - bus.T("hal", "cap", "env", "temperature", "core", "value"), - map[string]int{"deci_c": 412}, - true, - )) - - msg := readMsg[protoPub](t, cm5) - if msg.Type != 
"pub" { - t.Fatalf("expected pub, got %q", msg.Type) - } - want := []string{"state", "env", "temperature", "core", "value"} - if !slicesEqual(msg.Topic, want) { - t.Errorf("topic = %v, want %v", msg.Topic, want) - } - if !msg.Retain { - t.Error("expected retain=true") - } -} - -func TestUnretainExport(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - publishConn := b.NewConnection("hal") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - unlockExports(t, cm5) - - // Publish retained value first. - publishConn.Publish(publishConn.NewMessage( - bus.T("hal", "cap", "env", "temperature", "core", "value"), - map[string]int{"deci_c": 412}, - true, - )) - pub := readMsg[protoPub](t, cm5) - if pub.Type != "pub" || !pub.Retain { - t.Fatalf("expected retained pub, got t=%q retain=%v", pub.Type, pub.Retain) - } - - // Clear retained state (retain=true, payload=nil). 
- publishConn.Publish(publishConn.NewMessage( - bus.T("hal", "cap", "env", "temperature", "core", "value"), - nil, - true, - )) - unr := readMsg[protoUnretain](t, cm5) - if unr.Type != "unretain" { - t.Fatalf("expected unretain, got %q", unr.Type) - } - want := []string{"state", "env", "temperature", "core", "value"} - if !slicesEqual(unr.Topic, want) { - t.Errorf("topic = %v, want %v", unr.Topic, want) - } -} - func TestDrainExportsReturnsWhenSubscriptionClosed(t *testing.T) { b := newBus() conn := b.NewConnection("fabric") @@ -1166,6 +1113,132 @@ func TestDrainExportsWaitsForStartupHoldoff(t *testing.T) { } } +func TestDrainExportsPausesDuringIncomingTransfer(t *testing.T) { + b := newBus() + fabricConn := b.NewConnection("fabric") + pubConn := b.NewConnection("publisher") + tr := &captureTransport{} + s := session{ + conn: fabricConn, + tr: tr, + link: linkUp, + exportsEnabled: true, + incomingTransfer: &incomingTransfer{}, + } + + s.setupExports() + defer s.teardownExports() + + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "runtime", "memory"), + map[string]int{"alloc_bytes": 241376}, + true, + )) + s.drainExports() + + if len(tr.writes) != 0 { + t.Fatalf("writes during transfer = %d, want 0", len(tr.writes)) + } + + s.incomingTransfer = nil + s.drainExports() + + if len(tr.writes) != 1 { + t.Fatalf("writes after transfer = %d, want 1", len(tr.writes)) + } +} + +func TestDrainExportsPausesAfterPrepareCall(t *testing.T) { + b := newBus() + fabricConn := b.NewConnection("fabric") + pubConn := b.NewConnection("publisher") + tr := &captureTransport{} + cfg := DefaultLinkConfig() + s := session{ + conn: fabricConn, + tr: tr, + cfg: cfg, + link: linkUp, + exportsEnabled: true, + exportReadyAt: time.Now().Add(-time.Second), + } + + s.setupExports() + defer s.teardownExports() + defer s.teardownInbound() + + s.onCall(&protoCall{ + Type: msgCall, + ID: "prepare-1", + Topic: []string{"cap", "self", "updater", "main", "rpc", "prepare-update"}, + }) + + 
pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "updater"), + map[string]any{ + "state": "ready", + "pending_image_id": "mcu-dev-13.0", + "job_id": "job-1", + }, + true, + )) + s.drainExports() + + if len(tr.writes) != 0 { + t.Fatalf("writes during prepare quiet = %d, want 0", len(tr.writes)) + } + + s.transferQuietUntil = time.Time{} + s.transferQuietReason = "" + s.drainExports() + + if len(tr.writes) != 1 { + t.Fatalf("writes after prepare quiet = %d, want 1", len(tr.writes)) + } +} + +func TestPongAllowedDuringIncomingTransfer(t *testing.T) { + tr := &captureTransport{} + s := session{ + tr: tr, + link: linkUp, + localSID: "mcu-sid-test", + incomingTransfer: &incomingTransfer{ + meta: transferMeta{ID: "xfer-1"}, + }, + } + + s.onPing(&protoPing{Type: msgPing, TS: 42, SID: "cm5-sid"}) + + if len(tr.writes) != 1 { + t.Fatalf("pong writes during transfer = %d, want 1", len(tr.writes)) + } + var pong protoPong + if err := json.Unmarshal(tr.writes[0], &pong); err != nil { + t.Fatalf("pong decode failed: %v", err) + } + if pong.Type != msgPong || pong.SID != "mcu-sid-test" || pong.TS != 42 { + t.Fatalf("bad pong: %+v", pong) + } +} + +func TestPongSuppressedDuringPrepareQuiet(t *testing.T) { + tr := &captureTransport{} + s := session{ + tr: tr, + link: linkUp, + localSID: "mcu-sid-test", + transferQuietUntil: time.Now().Add(time.Second), + transferQuietReason: "prepare_call_rx", + } + + s.onPing(&protoPing{Type: msgPing, TS: 42, SID: "cm5-sid"}) + + if len(tr.writes) != 0 { + t.Fatalf("pong writes during prepare quiet = %d, want 0", len(tr.writes)) + } +} + // ---- unretain ---- func TestPubIgnoredBeforeHandshake(t *testing.T) { @@ -1173,7 +1246,7 @@ func TestPubIgnoredBeforeHandshake(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", 
DefaultLinkConfig()) sendMsg(t, cm5, protoPub{ Type: "pub", Topic: []string{"config", "device"}, @@ -1196,7 +1269,7 @@ func TestUnretainIgnoredBeforeHandshake(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) writer := b.NewConnection("writer") writer.Publish(writer.NewMessage(bus.T("config", "device"), json.RawMessage(`{"v":1}`), true)) @@ -1221,34 +1294,6 @@ func TestUnretainIgnoredBeforeHandshake(t *testing.T) { } } -func TestUnretain(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - conn := b.NewConnection("fabric") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, conn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - - sendMsg(t, cm5, protoPub{ - Type: "pub", Topic: []string{"config", "device"}, - Payload: json.RawMessage(`{"v":1}`), Retain: true, - }) - time.Sleep(50 * time.Millisecond) - sendMsg(t, cm5, protoUnretain{Type: "unretain", Topic: []string{"config", "device"}}) - time.Sleep(50 * time.Millisecond) - - reader := b.NewConnection("test") - sub := reader.Subscribe(bus.T("config", "device")) - select { - case m := <-sub.Channel(): - if m != nil && m.Payload != nil { - t.Errorf("expected no retained message, got %+v", m) - } - case <-time.After(100 * time.Millisecond): - } -} - // ---- call import ---- func TestCallIgnoredBeforeHandshake(t *testing.T) { @@ -1257,7 +1302,7 @@ func TestCallIgnoredBeforeHandshake(t *testing.T) { fabricConn := b.NewConnection("fabric") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, fabricConn, "mcu", "bigbox-cm5", DefaultLinkConfig()) handler := b.NewConnection("handler") sub := handler.Subscribe(bus.T("rpc", "hal", 
"dump")) @@ -1276,16 +1321,18 @@ func TestCallIgnoredBeforeHandshake(t *testing.T) { } func TestCallImport(t *testing.T) { + // Test the canonical inbound call route: cap/self/updater/main/rpc/prepare-update + // maps to local rpc/updater/prepare where services/updater binds. mcu, cm5 := pipePair() b := newBus() fabricConn := b.NewConnection("fabric") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, fabricConn, "mcu", "bigbox-cm5", DefaultLinkConfig()) bringUp(t, cm5) handler := b.NewConnection("handler") - sub := handler.Subscribe(bus.T("rpc", "hal", "dump")) + sub := handler.Subscribe(bus.T("rpc", "updater", "prepare")) go func() { for m := range sub.Channel() { handler.Reply(m, map[string]string{"result": "ok"}, false) @@ -1293,7 +1340,7 @@ func TestCallImport(t *testing.T) { }() sendMsg(t, cm5, protoCall{ - Type: "call", ID: "test-corr-1", Topic: []string{"rpc", "hal", "dump"}, + Type: "call", ID: "test-corr-1", Topic: []string{"cap", "self", "updater", "main", "rpc", "prepare-update"}, Payload: json.RawMessage(`{}`), TimeoutMs: 5000, }) @@ -1311,7 +1358,7 @@ func TestCallNoRoute(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) bringUp(t, cm5) sendMsg(t, cm5, protoCall{ @@ -1331,239 +1378,6 @@ func TestCallNoRoute(t *testing.T) { } } -func TestDumpCallReturnsConfigState(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - - // Send config first so the session has state. 
- sendMsg(t, cm5, protoPub{ - Type: "pub", - Topic: []string{"config", "device"}, - Payload: json.RawMessage(`{"devices":[],"pollers":[]}`), - Retain: true, - }) - time.Sleep(100 * time.Millisecond) - - // Call dump. - sendMsg(t, cm5, protoCall{ - Type: "call", ID: "dump-1", Topic: []string{"rpc", "hal", "dump"}, - Payload: json.RawMessage(`{"ask":"status"}`), TimeoutMs: 5000, - }) - - reply := readMsg[protoReply](t, cm5) - if reply.Corr != "dump-1" { - t.Errorf("corr = %q", reply.Corr) - } - if !reply.OK { - t.Errorf("expected ok=true, got err=%q", reply.Err) - } - var dump dumpReply - if err := json.Unmarshal(reply.Value, &dump); err != nil { - t.Fatalf("unmarshal dump reply: %v", err) - } - if !dump.Applied { - t.Error("expected applied=true") - } - if dump.ConfigCount != 1 { - t.Errorf("config_count = %d, want 1", dump.ConfigCount) - } -} - -func TestDumpCallDoesNotBlockPing(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - - // Send dump call and ping back-to-back. 
- sendMsg(t, cm5, protoCall{ - Type: "call", ID: "dump-1", Topic: []string{"rpc", "hal", "dump"}, - Payload: json.RawMessage(`{}`), TimeoutMs: 1000, - }) - sendMsg(t, cm5, protoPing{Type: "ping", TS: 77, SID: testCM5SID}) - - type readResult struct { - line []byte - err error - } - type wireHeader struct { - Type string `json:"type"` - } - var gotReply, gotPong bool - for i := 0; i < 2; i++ { - msg := readMsg[wireHeader](t, cm5) - switch msg.Type { - case msgReply: - gotReply = true - case msgPong: - gotPong = true - default: - t.Fatalf("unexpected message type %q", msg.Type) - } - } - if !gotReply { - t.Error("missing dump reply") - } - if !gotPong { - t.Error("missing pong") - } -} - -func TestCallExport(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - unlockExports(t, cm5) - - type result struct { - msg *bus.Message - err error - } - done := make(chan result, 1) - go func() { - msg, err := reqConn.RequestWait(context.Background(), reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "dump"), - map[string]string{"ask": "status"}, - false, - )) - done <- result{msg: msg, err: err} - }() - - call := readMsg[protoCall](t, cm5) - if call.Type != "call" { - t.Fatalf("expected call, got %q", call.Type) - } - want := []string{"rpc", "hal", "dump"} - if !slicesEqual(call.Topic, want) { - t.Fatalf("topic = %v, want %v", call.Topic, want) - } - var payload map[string]string - if err := json.Unmarshal(call.Payload, &payload); err != nil { - t.Fatalf("Unmarshal payload: %v", err) - } - if payload["ask"] != "status" { - t.Fatalf("payload.ask = %q, want status", payload["ask"]) - } - - sendMsg(t, cm5, protoReply{ - Type: "reply", - Corr: call.ID, - OK: true, - Value: json.RawMessage(`{"ok":true,"remote":"cm5"}`), - }) 
- - select { - case res := <-done: - if res.err != nil { - t.Fatalf("RequestWait: %v", res.err) - } - if res.msg == nil { - t.Fatal("nil bus reply") - } - reply, ok := res.msg.Payload.(map[string]any) - if !ok { - t.Fatalf("payload type = %T, want map[string]any", res.msg.Payload) - } - if reply["remote"] != "cm5" { - t.Fatalf("reply.remote = %#v", reply["remote"]) - } - if reply["ok"] != true { - t.Fatalf("reply.ok = %#v", reply["ok"]) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for local reply") - } -} - -func TestCallExportOnlyConfiguredRule(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - unlockExports(t, cm5) - - // Use an unconfigured topic — only fabric/out/rpc/hal/dump is routed. - reqCtx, reqCancel := context.WithTimeout(context.Background(), 250*time.Millisecond) - defer reqCancel() - go func() { - _, _ = reqConn.RequestWait(reqCtx, reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "not_configured"), - map[string]string{"ask": "status"}, - false, - )) - }() - - gotLine := make(chan struct{}) - go func() { - _, _ = cm5.ReadLine() - close(gotLine) - }() - - select { - case <-gotLine: - t.Fatal("got wire call for unconfigured export rule") - case <-time.After(200 * time.Millisecond): - } -} - -func TestPendingWireCallsTimeout(t *testing.T) { - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - msg := reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "dump"), - map[string]string{"ask": "status"}, - false, - ) - sub := reqConn.Request(msg) - defer reqConn.Unsubscribe(sub) - - s := session{ - conn: fabricConn, - outboundCalls: []*outboundCall{ - {id: "wire-1", req: msg, deadline: time.Now().Add(-time.Millisecond)}, - }, 
- } - - s.drainOutbound(time.Now()) - - select { - case reply := <-sub.Channel(): - if reply == nil { - t.Fatal("nil timeout reply") - } - out, ok := reply.Payload.(types.ErrorReply) - if !ok { - t.Fatalf("payload type = %T, want types.ErrorReply", reply.Payload) - } - if out.OK { - t.Fatal("expected ok=false") - } - if out.Error != "timeout" { - t.Fatalf("error = %q, want timeout", out.Error) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for timeout reply") - } -} - func TestDrainExportsDropsUnmarshalablePayload(t *testing.T) { b := newBus() fabricConn := b.NewConnection("fabric") @@ -1637,225 +1451,3 @@ func TestDrainPendingCallsReportsMarshalFailure(t *testing.T) { t.Fatalf("err = %q, want %q", reply.Err, errPayloadMarshal) } } - -func TestDrainOutgoingWireCallsReportsMarshalFailure(t *testing.T) { - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - tr := &captureTransport{} - s := session{ - conn: fabricConn, - tr: tr, - link: linkUp, - } - - s.setupExports() - defer s.teardownExports() - - msg := reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "dump"), - make(chan int), - false, - ) - replySub := reqConn.Request(msg) - defer reqConn.Unsubscribe(replySub) - - s.drainOutbound(time.Now()) - - if len(tr.writes) != 0 { - t.Fatalf("writes = %d, want 0", len(tr.writes)) - } - if len(s.outboundCalls) != 0 { - t.Fatalf("outboundCalls = %d, want 0", len(s.outboundCalls)) - } - - select { - case reply := <-replySub.Channel(): - if reply == nil { - t.Fatal("nil reply") - } - out, ok := reply.Payload.(types.ErrorReply) - if !ok { - t.Fatalf("payload type = %T, want types.ErrorReply", reply.Payload) - } - if out.OK { - t.Fatal("expected ok=false") - } - if out.Error != errPayloadMarshal { - t.Fatalf("error = %q, want %q", out.Error, errPayloadMarshal) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for marshal failure reply") - } -} - -func 
TestDrainOutgoingWireCallsReportsWriteFailure(t *testing.T) { - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - tr := &captureTransport{writeErr: errors.New("boom")} - s := session{ - conn: fabricConn, - tr: tr, - link: linkUp, - } - - s.setupExports() - defer s.teardownExports() - - msg := reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "dump"), - map[string]string{"ask": "status"}, - false, - ) - replySub := reqConn.Request(msg) - defer reqConn.Unsubscribe(replySub) - - s.drainOutbound(time.Now()) - - if s.link != linkDown { - t.Fatalf("link = %v, want %v", s.link, linkDown) - } - if len(s.outboundCalls) != 0 { - t.Fatalf("outboundCalls = %d, want 0", len(s.outboundCalls)) - } - - select { - case reply := <-replySub.Channel(): - if reply == nil { - t.Fatal("nil reply") - } - out, ok := reply.Payload.(types.ErrorReply) - if !ok { - t.Fatalf("payload type = %T, want types.ErrorReply", reply.Payload) - } - if out.OK { - t.Fatal("expected ok=false") - } - if out.Error != "transport_write_failed" { - t.Fatalf("error = %q, want transport_write_failed", out.Error) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for write failure reply") - } -} - -func TestCallExportPeerReset(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - unlockExports(t, cm5) - - type result struct { - msg *bus.Message - err error - } - done := make(chan result, 1) - go func() { - msg, err := reqConn.RequestWait(context.Background(), reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "dump"), - map[string]string{"ask": "status"}, - false, - )) - done <- result{msg: msg, err: err} - }() - - call := readMsg[protoCall](t, cm5) - if call.Type != "call" { - 
t.Fatalf("expected call, got %q", call.Type) - } - - sendMsg(t, cm5, protoHello{ - Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "fresh-session", Proto: protoVersion, - }) - _ = readMsg[protoHelloAck](t, cm5) - - select { - case res := <-done: - if res.err != nil { - t.Fatalf("RequestWait: %v", res.err) - } - if res.msg == nil { - t.Fatal("nil bus reply") - } - out, ok := res.msg.Payload.(types.ErrorReply) - if !ok { - t.Fatalf("payload type = %T, want types.ErrorReply", res.msg.Payload) - } - if out.OK { - t.Fatal("expected ok=false") - } - if out.Error != "session_reset" { - t.Fatalf("error = %q, want session_reset", out.Error) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for peer-reset reply") - } -} - -func TestEchoedHelloAckIgnoredDuringOutgoingCall(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - ack := bringUp(t, cm5) - unlockExports(t, cm5) - - type result struct { - msg *bus.Message - err error - } - done := make(chan result, 1) - go func() { - msg, err := reqConn.RequestWait(context.Background(), reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "dump"), - map[string]string{"ask": "status"}, - false, - )) - done <- result{msg: msg, err: err} - }() - - call := readMsg[protoCall](t, cm5) - if call.Type != "call" { - t.Fatalf("expected call, got %q", call.Type) - } - - // Send an echoed hello_ack (our own SID) — should be ignored. 
- sendMsg(t, cm5, protoHelloAck{ - Type: "hello_ack", Node: "mcu-1", SID: ack.SID, Proto: protoVersion, OK: true, - }) - - sendMsg(t, cm5, protoReply{ - Type: "reply", - Corr: call.ID, - OK: true, - Value: json.RawMessage(`{"ok":true,"remote":"cm5"}`), - }) - - select { - case res := <-done: - if res.err != nil { - t.Fatalf("RequestWait: %v", res.err) - } - if res.msg == nil { - t.Fatal("nil bus reply") - } - reply, ok := res.msg.Payload.(map[string]any) - if !ok { - t.Fatalf("payload type = %T, want map[string]any", res.msg.Payload) - } - if reply["remote"] != "cm5" || reply["ok"] != true { - t.Fatalf("unexpected reply payload: %#v", reply) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for local reply after echoed hello_ack") - } -} diff --git a/services/fabric/protocol.go b/services/fabric/protocol.go index 023b0ec..a5d3bca 100644 --- a/services/fabric/protocol.go +++ b/services/fabric/protocol.go @@ -4,14 +4,15 @@ import "encoding/json" // ---- Wire message type identifiers ---- // -// Wire schema mirrors devicecode-lua/src/services/fabric/protocol.lua at -// update-migration tip (commit 2c88090). The frame discriminator field is -// "type" (not "t"). Reply frames carry {id, ok, value, err}. Transfer frames -// use xfer_id/offset/checksum/data with a minimal xfer_chunk shape and -// xxHash32 hex wire integrity (no algorithm field; Lua source treats checksum -// as opaque hex). +// Wire schema mirrors ../docs/updating.md and +// devicecode-lua/src/services/fabric/protocol.lua. The frame discriminator is +// "type". Replies carry {id, ok, payload, err}. Transfers use explicit +// digest_alg/digest fields and required per-chunk chunk_digest. const ( + protocolName = "fabric-jsonl/1" + digestAlg = "xxhash32" + msgHello = "hello" msgHelloAck = "hello_ack" msgPing = "ping" @@ -31,28 +32,22 @@ const ( // ---- Wire message structs ---- -// protoCaps is carried in hello for forward compatibility. 
The Lua side -// sends caps but neither side enforces them in v1. -type protoCaps struct { - Pub bool `json:"pub,omitempty"` - Call bool `json:"call,omitempty"` -} - type protoHello struct { - Type string `json:"type"` - Node string `json:"node"` - Peer string `json:"peer"` - SID string `json:"sid"` - Proto int `json:"proto,omitempty"` - Caps *protoCaps `json:"caps,omitempty"` + Type string `json:"type"` + Proto string `json:"proto"` + SID string `json:"sid"` + Node string `json:"node"` + Identity json.RawMessage `json:"identity,omitempty"` + Auth json.RawMessage `json:"auth,omitempty"` } type protoHelloAck struct { - Type string `json:"type"` - Node string `json:"node"` - SID string `json:"sid,omitempty"` - Proto int `json:"proto,omitempty"` - OK bool `json:"ok"` + Type string `json:"type"` + Proto string `json:"proto"` + SID string `json:"sid"` + Node string `json:"node"` + Identity json.RawMessage `json:"identity,omitempty"` + Auth json.RawMessage `json:"auth,omitempty"` } type protoPing struct { @@ -87,28 +82,27 @@ type protoCall struct { TimeoutMs int `json:"timeout_ms"` } -// protoReply mirrors Lua's reply frame: {type, id, ok, value, err}. The Go +// protoReply mirrors Lua's reply frame: {type, id, ok, payload, err}. The Go // field for the correlation id keeps the name "Corr" for readability — the // wire spelling is "id" because the reply correlates to a prior call.id. type protoReply struct { - Type string `json:"type"` - Corr string `json:"id"` - OK bool `json:"ok"` - Value json.RawMessage `json:"value,omitempty"` - Err string `json:"err,omitempty"` + Type string `json:"type"` + Corr string `json:"id"` + OK bool `json:"ok"` + Payload json.RawMessage `json:"payload,omitempty"` + Err string `json:"err,omitempty"` } -// protoXferBegin (control lane) — required fields per protocol.lua -// validate_control: xfer_id, size, checksum (xxHash32 hex). 
meta is -// optional but source-used: transfer_mgr.lua sends it on xfer_begin and -// later does conn:call(meta.receiver, …) before xfer_done. Preserve the -// blob opaquely so fabric-update's receiver can pull meta.receiver out. +// protoXferBegin starts an incoming transfer to a named target. The only +// supported digest for fabric-jsonl/1 is xxhash32 seed 0, lower-hex. type protoXferBegin struct { - Type string `json:"type"` - XferID string `json:"xfer_id"` - Size uint32 `json:"size"` - Checksum string `json:"checksum"` - Meta json.RawMessage `json:"meta,omitempty"` + Type string `json:"type"` + XferID string `json:"xfer_id"` + Target string `json:"target"` + Size uint32 `json:"size"` + DigestAlg string `json:"digest_alg"` + Digest string `json:"digest"` + Meta json.RawMessage `json:"meta,omitempty"` } // protoXferReady (control) carries only xfer_id; success/failure is implicit @@ -118,29 +112,31 @@ type protoXferReady struct { XferID string `json:"xfer_id"` } -// protoXferChunk (bulk) — minimal {xfer_id, offset, data}. No chunk-level -// checksum, no sequence number; ack is by byte offset via xfer_need.next. +// protoXferChunk carries unpadded base64url data plus a required xxhash32 +// digest over the raw decoded chunk bytes. type protoXferChunk struct { - Type string `json:"type"` - XferID string `json:"xfer_id"` - Offset uint32 `json:"offset"` - Data string `json:"data"` + Type string `json:"type"` + XferID string `json:"xfer_id"` + Offset uint32 `json:"offset"` + Data string `json:"data"` + ChunkDigest string `json:"chunk_digest"` } -// protoXferNeed (control) acks the receiver's expected next byte offset. +// protoXferNeed (control) acks the MCU's expected next byte offset. type protoXferNeed struct { Type string `json:"type"` XferID string `json:"xfer_id"` Next uint32 `json:"next"` } -// protoXferCommit (control) carries the same wire-integrity shape as -// xfer_begin: xfer_id, size, checksum (xxHash32 hex over the payload bytes). 
+// protoXferCommit repeats the whole-object digest so begin/commit/streamed +// content can be reconciled before the target accepts the object. type protoXferCommit struct { - Type string `json:"type"` - XferID string `json:"xfer_id"` - Size uint32 `json:"size"` - Checksum string `json:"checksum"` + Type string `json:"type"` + XferID string `json:"xfer_id"` + Size uint32 `json:"size"` + DigestAlg string `json:"digest_alg"` + Digest string `json:"digest"` } // protoXferDone (control) carries only xfer_id; failure is signalled via diff --git a/services/fabric/remap.go b/services/fabric/remap.go index 175c44d..c6993c3 100644 --- a/services/fabric/remap.go +++ b/services/fabric/remap.go @@ -9,16 +9,20 @@ import "devicecode-go/bus" // of routes. If new routes are required, add them here and on the Lua // config side. // -// CM5 -> MCU wire publish: -// ["config","device"] -> config/hal (with Lua empty-table normalization) +// The legacy MCU surface (config/device -> config/hal import, rpc/hal/dump +// inline handler, hal/cap/env/* and hal/cap/power/* exports, hal/state -> +// state/hal export, fabric/out/rpc/hal/dump call export) has been removed. The +// canonical surface is now: // // CM5 -> MCU wire call: -// ["rpc","hal","dump"] -> handled directly by session (not via import rules) +// ["cap","self","updater","main","rpc","prepare-update"] -> rpc/updater/prepare +// ["cap","self","updater","main","rpc","commit-update"] -> rpc/updater/commit +// xfer_begin target="updater/main" is handled by the transfer path +// and routed to the local updater staging RPC after xfer_commit. // // MCU local bus publish -> wire: -// hal/cap/env/# -> ["state","env",...] -// hal/cap/power/# -> ["state","power",...] -// hal/state -> ["state","hal"] +// state/self/# -> state/self/... (identity, telemetry, update facts) +// event/self/# -> event/self/... 
(sparse charger alerts) type importRule struct { wire []string @@ -31,39 +35,50 @@ type busExportRule struct { suffix bool } -var importPublishRules = []importRule{ +// importPublishRules is empty. Config-like data flows through the +// prepare-update metadata field instead of retained publishes. +var importPublishRules = []importRule{} + +var ( + wireUpdaterPrepare = []string{"cap", "self", "updater", "main", "rpc", "prepare-update"} + wireUpdaterCommit = []string{"cap", "self", "updater", "main", "rpc", "commit-update"} +) + +// cap/self/updater/main/rpc/{prepare-update,commit-update} land here from +// the wire and are routed to local rpc/updater/{prepare,commit} where the +// updater service binds. The updater package re-uses the same local topic +// strings (services/updater.TopicPrepareRPC / TopicCommitRPC) so callers +// stay consistent. +var importCallRules = []importRule{ { - wire: []string{"config", "device"}, - local: []string{"config", "hal"}, + wire: wireUpdaterPrepare, + local: []string{"rpc", "updater", "prepare"}, + }, + { + wire: wireUpdaterCommit, + local: []string{"rpc", "updater", "commit"}, }, } -// rpc/hal/dump is handled directly by onCall, not via import rules. -var importCallRules = []importRule{} - +// exportPublishRules is the minimal surface: local `state/self/*` retains and +// `event/self/*` events flow to the wire under the same name. Legacy HAL export +// topics are replaced by telemetry publishers under state/self/*. 
var exportPublishRules = []busExportRule{ { - localPrefix: []string{"hal", "cap", "env"}, - remotePrefix: []string{"state", "env"}, + localPrefix: []string{"state", "self"}, + remotePrefix: []string{"state", "self"}, suffix: true, }, { - localPrefix: []string{"hal", "cap", "power"}, - remotePrefix: []string{"state", "power"}, + localPrefix: []string{"event", "self"}, + remotePrefix: []string{"event", "self"}, suffix: true, }, - { - localPrefix: []string{"hal", "state"}, - remotePrefix: []string{"state", "hal"}, - }, } -var exportCallRules = []busExportRule{ - { - localPrefix: []string{"fabric", "out", "rpc", "hal", "dump"}, - remotePrefix: []string{"rpc", "hal", "dump"}, - }, -} +// exportCallRules is empty. The MCU does not originate outbound RPC calls for +// the current Fabric/update contract. +var exportCallRules = []busExportRule{} func importPublishTopic(wire []string) bus.Topic { return importMatch(wire, importPublishRules) diff --git a/services/fabric/session.go b/services/fabric/session.go index 91a4908..3ad29b5 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "errors" + "strings" "time" "devicecode-go/bus" @@ -48,6 +49,17 @@ const ( exportMaxPerTick = 1 exportTickInterval = 50 * time.Millisecond errPayloadMarshal = "payload_marshal_failed" + + // The USB/UART path used during OTA echoes MCU-originated JSONL back into + // the MCU receiver. If exported retained state is in flight while CM5 starts + // an OTA transfer, the echoed line can contain CM5's xfer_begin spliced into + // the middle of the state pub. Hold exports quiet from prepare until either + // xfer_begin arrives or this window expires. + transferPrepareQuiet = 10 * time.Second + // Keep telemetry/state exports quiet long enough for the host to send the + // follow-up updater commit call after xfer_done. On echo-prone UART links, + // retained export backlog can otherwise splice into the commit JSONL frame. 
+ transferCompleteQuiet = 10 * time.Second ) // ---- link reasons and error strings ---- @@ -65,29 +77,14 @@ const ( reasonTimeout = "timeout" ) -// ---- bus topics for config handling ---- - -var ( - tConfigHAL = bus.T("config", "hal") - dumpCallTopic = []string{"rpc", "hal", "dump"} -) - // ---- types ---- -type dumpReply struct { - OK bool `json:"ok"` - Method string `json:"method"` - Echo any `json:"echo,omitempty"` - HAL *types.HALState `json:"hal,omitempty"` - Applied bool `json:"applied"` - ConfigCount int `json:"config_count,omitempty"` - ConfigError string `json:"config_error,omitempty"` -} - type inboundCall struct { - id string - sub *bus.Subscription - deadline time.Time + id string + topic []string + sub *bus.Subscription + deadline time.Time + transferPrepare bool } type outboundCall struct { @@ -110,7 +107,7 @@ type linkStatePayload struct { LocalSID string `json:"local_sid"` PeerSID string `json:"peer_sid,omitempty"` PeerNode string `json:"peer_node,omitempty"` - PeerProto int `json:"peer_proto,omitempty"` + PeerProto string `json:"peer_proto,omitempty"` LastRxUnixMilli int64 `json:"last_rx_unix_ms,omitempty"` LastTxUnixMilli int64 `json:"last_tx_unix_ms,omitempty"` LastPongUnixMilli int64 `json:"last_pong_unix_ms,omitempty"` @@ -137,38 +134,41 @@ type session struct { link linkState peerNode string peerSID string - peerProto int + peerProto string lastRxAt time.Time lastTxAt time.Time lastPongAt time.Time exportReadyAt time.Time exportsEnabled bool - exportSubs []*bus.Subscription - exportCallSubs []*bus.Subscription - inboundCalls []*inboundCall - outboundCalls []*outboundCall - nextOutboundID uint64 - nextPingAt time.Time - txControl txLane - txRPC txLane - txBulk txLane - importedRetained []bus.Topic // local topics currently retained on the bus due to wire imports - rpcReady bool // bridge replay complete; gates linkStatePayload.Ready - incomingTransfer *incomingTransfer - beginTransfer func(transferMeta) (transferSink, error) - - // Config 
state — tracks config/device → config/hal translation. - configApplied bool - configCount int - lastConfigErr string + exportSubs []*bus.Subscription + exportCallSubs []*bus.Subscription + inboundCalls []*inboundCall + outboundCalls []*outboundCall + nextOutboundID uint64 + nextPingAt time.Time + txControl txLane + txRPC txLane + txBulk txLane + importedRetained []bus.Topic // local topics currently retained on the bus due to wire imports + rpcReady bool // bridge replay complete; gates linkStatePayload.Ready + incomingTransfer *incomingTransfer + transferQuietUntil time.Time + transferQuietReason string + beginTransfer func(transferMeta) (transferSink, error) } func (s *session) log(msg string) { + if !fabricTraceEnabled { + return + } println("[fabric]", "sid", s.localSID, msg) } func (s *session) logKV(msg, key, value string) { + if !fabricTraceEnabled { + return + } println("[fabric]", "sid", s.localSID, msg, key, value) } @@ -334,10 +334,12 @@ func (s *session) handleLinkDown(reason, err string) { s.link = linkDown s.peerNode = "" s.peerSID = "" - s.peerProto = 0 + s.peerProto = "" s.exportReadyAt = time.Time{} s.exportsEnabled = false s.rpcReady = false + s.transferQuietUntil = time.Time{} + s.transferQuietReason = "" s.teardownExports() s.teardownInbound() s.teardownOutbound(pendingReason) @@ -468,6 +470,8 @@ func (s *session) dispatch(line []byte) { typedDispatch(s, line, s.onTransferCommit) case msgXferAbort: typedDispatch(s, line, s.onTransferAbort) + case msgXferReady, msgXferNeed, msgXferDone: + s.logKV("echoed transfer control ignored", "type", t) default: s.logKV("unknown message type dropped", "type", t) } @@ -495,21 +499,40 @@ func (s *session) logMalformed(line []byte, err error) { if err != nil { errStr = err.Error() } - println( - "[fabric]", "sid", s.localSID, - "malformed frame dropped", - "line_len", strconvx.Itoa(len(line)), - "line_head", tracePreview(line), - "err", errStr, - ) + if fabricTraceEnabled { + println( + "[fabric]", "sid", 
s.localSID, + "malformed frame dropped", + "line_len", strconvx.Itoa(len(line)), + "line_xxhash32", traceHash(line), + "line_head", tracePreview(line), + "line_tail", traceTail(line), + "err", errStr, + ) + } + + // If a transfer is in flight, the dropped frame was very likely a + // corrupted xfer_chunk. Without an explicit signal CM5 keeps + // streaming chunks past the gap and the receiver silently drops + // them as out-of-order; the transfer eventually fails on the + // phase timeout. Re-request the next expected byte so CM5 + // retransmits from the gap. Cheap if it wasn't actually a chunk + // (the sender just gets one stale need frame and ignores it once + // it has caught up). + if cur := s.incomingTransfer; cur != nil { + s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) + // Refresh the idle-chunk deadline so a stream of malformed frames can + // recover instead of tripping phase_timeout mid-retry. + cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) + } } -// notePeerIdentity records the remote peer's node, SID, and proto version. +// notePeerIdentity records the remote peer's node, SID, and protocol name. // If the SID changes mid-session, the returned reason triggers a full // teardown of exports and pending calls on the Go side. Note: the Lua // side only tears down pending calls on SID change, not exports — this // asymmetry is intentional since the CM5 re-subscribes on reconnect. 
-func (s *session) notePeerIdentity(node, sid string, proto int) string { +func (s *session) notePeerIdentity(node, sid string, proto string) string { reason := "" if s.link == linkUp && s.peerSID != "" && sid != "" && s.peerSID != sid { reason = reasonSessionReset @@ -520,7 +543,7 @@ func (s *session) notePeerIdentity(node, sid string, proto int) string { if sid != "" { s.peerSID = sid } - if proto > 0 { + if proto != "" { s.peerProto = proto } return reason @@ -548,9 +571,55 @@ func hasWirePrefix(topic, prefix []string) bool { return true } +func wireTopicEquals(topic, want []string) bool { + if len(topic) != len(want) { + return false + } + for i := range want { + if topic[i] != want[i] { + return false + } + } + return true +} + +func wireTopicString(topic []string) string { + if len(topic) == 0 { + return "" + } + return strings.Join(topic, "/") +} + +func (s *session) extendTransferQuiet(reason string, d time.Duration) { + now := time.Now() + until := now.Add(d) + if until.After(s.transferQuietUntil) { + s.transferQuietUntil = until + s.transferQuietReason = reason + } +} + +func (s *session) transferQuiet(now time.Time) (bool, string) { + if cur := s.incomingTransfer; cur != nil { + return true, "incoming_transfer:" + cur.meta.ID + } + if !s.transferQuietUntil.IsZero() && now.Before(s.transferQuietUntil) { + reason := s.transferQuietReason + if reason == "" { + reason = "quiet_window" + } + return true, reason + } + return false, "" +} + func (s *session) onHello(msg *protoHello) { - if msg.Peer != "" && msg.Peer != s.nodeID { - s.log("hello dropped: wrong peer") + if msg.Proto != protocolName { + s.log("hello dropped: unsupported proto") + return + } + if msg.SID == "" || msg.Node == "" { + s.log("hello dropped: missing identity") return } if s.peerID != "" && msg.Node != s.peerID { @@ -562,10 +631,9 @@ func (s *session) onHello(msg *protoHello) { if !s.sendControl(marshal(protoHelloAck{ Type: msgHelloAck, - Node: s.nodeID, + Proto: protocolName, SID: 
s.localSID, - Proto: protoVersion, - OK: true, + Node: s.nodeID, })) { return } @@ -578,9 +646,12 @@ func (s *session) onHelloAck(msg *protoHelloAck) { s.log("echoed hello_ack ignored") return } - if !msg.OK { - s.log("hello_ack rejected by peer") - s.handleLinkDown(reasonHelloRejected, "") + if msg.Proto != protocolName { + s.log("hello_ack dropped: unsupported proto") + return + } + if msg.SID == "" || msg.Node == "" { + s.log("hello_ack dropped: missing identity") return } reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) @@ -589,6 +660,13 @@ func (s *session) onHelloAck(msg *protoHelloAck) { } func (s *session) onPing(msg *protoPing) { + if s.isSelfControlFrame("", msg.SID) { + s.log("echoed ping ignored") + return + } + if !s.transferQuietUntil.IsZero() && time.Now().Before(s.transferQuietUntil) { + return + } s.logKV("ping rx", "peer_sid", msg.SID) if !s.sendControl(marshal(protoPong{Type: msgPong, TS: msg.TS, SID: s.localSID})) { return @@ -603,6 +681,13 @@ func (s *session) tickPing(now time.Time) { if s.link != linkUp { return } + if quiet, _ := s.transferQuiet(now); quiet { + // Keep the UART quiet while CM5 is preparing or streaming a firmware + // image; chunk recovery depends on xfer_need being the only periodic + // MCU-originated frame on the fabric link. + s.nextPingAt = now.Add(s.cfg.PingInterval) + return + } if s.nextPingAt.IsZero() || now.Before(s.nextPingAt) { return } @@ -631,23 +716,6 @@ func (s *session) onPub(msg *protoPub) { return } - // config/device → config/hal: normalize and track. 
- if topicEquals(localTopic, tConfigHAL) { - cfg, err := decodeHALConfig(msg.Payload) - if err != "" { - s.lastConfigErr = err - s.log("config/device rejected: " + err) - return - } - s.configApplied = true - s.configCount++ - s.lastConfigErr = "" - s.log("config/device applied to config/hal") - s.conn.Publish(s.conn.NewMessage(localTopic, cfg, true)) - s.trackImportedRetain(localTopic) - return - } - s.conn.Publish(s.conn.NewMessage(localTopic, msg.Payload, msg.Retain)) if msg.Retain { s.trackImportedRetain(localTopic) @@ -670,34 +738,6 @@ func (s *session) onUnretain(msg *protoUnretain) { } func (s *session) onCall(msg *protoCall) { - // rpc/hal/dump: handle directly — reply with config and HAL state. - if slicesEqualStrings(msg.Topic, dumpCallTopic) { - var halState *types.HALState - sub := s.conn.Subscribe(bus.T("hal", "state")) - select { - case m := <-sub.Channel(): - if m != nil { - if st, ok := decodeHALState(m.Payload); ok { - halState = &st - } - } - default: - } - s.conn.Unsubscribe(sub) - - reply := dumpReply{ - OK: true, - Method: "dump", - Echo: decodePayload(msg.Payload), - HAL: halState, - Applied: s.configApplied, - ConfigCount: s.configCount, - ConfigError: s.lastConfigErr, - } - s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: true, Value: mustMarshal(reply)})) - return - } - if len(s.inboundCalls) >= s.cfg.MaxInboundHelpers { s.log("incoming call dropped: busy") s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: reasonBusy})) @@ -711,16 +751,24 @@ func (s *session) onCall(msg *protoCall) { return } + isTransferPrepare := wireTopicEquals(msg.Topic, wireUpdaterPrepare) + if isTransferPrepare { + s.extendTransferQuiet("prepare_call_rx", transferPrepareQuiet) + } + timeout := callTimeoutDef if msg.TimeoutMs > 0 { timeout = time.Duration(msg.TimeoutMs) * time.Millisecond } busMsg := s.conn.NewMessage(localTopic, msg.Payload, false) sub := s.conn.Request(busMsg) + topicCopy := append([]string(nil), msg.Topic...) 
s.inboundCalls = append(s.inboundCalls, &inboundCall{ - id: msg.ID, - sub: sub, - deadline: time.Now().Add(timeout), + id: msg.ID, + topic: topicCopy, + sub: sub, + deadline: time.Now().Add(timeout), + transferPrepare: isTransferPrepare, }) } @@ -737,7 +785,7 @@ func (s *session) onReply(msg *protoReply) { s.conn.Reply(call.req, types.ErrorReply{OK: false, Error: msg.Err}, false) return } - s.conn.Reply(call.req, decodePayload(msg.Value), false) + s.conn.Reply(call.req, decodePayload(msg.Payload), false) return } @@ -763,14 +811,6 @@ func checkBusError(payload any) string { return "" } -func mustMarshal(v any) json.RawMessage { - b, err := json.Marshal(v) - if err != nil { - return json.RawMessage(`{"error":"marshal_failed"}`) - } - return json.RawMessage(b) -} - func topicEquals(t bus.Topic, expected bus.Topic) bool { if t.Len() != expected.Len() { return false @@ -846,10 +886,17 @@ func (s *session) drainExports() { if s.link != linkUp { return } + now := time.Now() + if quiet, _ := s.transferQuiet(now); quiet { + // Avoid colliding telemetry/state exports with prepare/xfer traffic on + // echo-prone links. Queued retained state can be exported after the OTA + // control/data path has gone quiet. 
+ return + } if !s.exportsEnabled { return } - if !s.exportReadyAt.IsZero() && time.Now().Before(s.exportReadyAt) { + if !s.exportReadyAt.IsZero() && now.Before(s.exportReadyAt) { return } total := 0 @@ -882,6 +929,15 @@ func (s *session) drainExports() { s.logKV("export payload dropped", "err", err.Error()) continue } + if fabricTraceEnabled { + println( + "[fabric]", "sid", s.localSID, + "export pub tx", + "topic", wireTopicString(wire), + "retain", m.Retained, + "payload_len", strconvx.Itoa(len(payload)), + ) + } if !s.sendRPC(marshal(protoPub{ Type: msgPub, Topic: wire, @@ -911,12 +967,18 @@ func (s *session) drainInbound(now time.Time) { s.conn.Unsubscribe(call.sub) call.sub = nil // prevent double-unsubscribe in teardownInbound if !ok || reply == nil { + if call.transferPrepare { + s.extendTransferQuiet("prepare_reply_timeout", transferPrepareQuiet) + } if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { return } continue } if errStr := checkBusError(reply.Payload); errStr != "" { + if call.transferPrepare { + s.extendTransferQuiet("prepare_reply_error", transferPrepareQuiet) + } if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errStr})) { return } @@ -924,12 +986,18 @@ func (s *session) drainInbound(now time.Time) { } payload, err := marshalPayload(reply.Payload) if err != nil { + if call.transferPrepare { + s.extendTransferQuiet("prepare_reply_marshal_failed", transferPrepareQuiet) + } if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errPayloadMarshal})) { return } continue } - if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: true, Value: payload})) { + if call.transferPrepare { + s.extendTransferQuiet("prepare_reply_ok", transferPrepareQuiet) + } + if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: true, Payload: payload})) { return } continue @@ -939,6 +1007,9 @@ func (s *session) drainInbound(now time.Time) { if 
!now.Before(call.deadline) { s.conn.Unsubscribe(call.sub) call.sub = nil + if call.transferPrepare { + s.extendTransferQuiet("prepare_call_timeout", transferPrepareQuiet) + } if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { return } diff --git a/services/fabric/trace.go b/services/fabric/trace.go index 4c2637b..7453e17 100644 --- a/services/fabric/trace.go +++ b/services/fabric/trace.go @@ -1,5 +1,7 @@ package fabric +import "devicecode-go/x/xxhash" + func traceLine(dir string, data []byte) { if !fabricTraceEnabled { return @@ -36,6 +38,18 @@ func tracePreview(data []byte) string { return string(out) } +func traceTail(data []byte) string { + const max = 200 + if len(data) > max { + data = data[len(data)-max:] + } + return tracePreview(data) +} + +func traceHash(data []byte) string { + return xxhashHex(xxhash.Sum32(data, 0)) +} + func hexNibble(v byte) byte { v &= 0x0f if v < 10 { diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index 59919b8..b90aea3 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -7,28 +7,29 @@ import ( "strings" "time" + "devicecode-go/services/updater" "devicecode-go/x/strconvx" "devicecode-go/x/xxhash" ) -const postTransferDoneSettle = 250 * time.Millisecond -const transferProgressLogEvery = 32 +const transferTargetUpdaterMain = "updater/main" +const transferIdleRetryLimit = 3 -// transferMeta captures xfer_begin contents. The required Lua wire shape is -// {xfer_id, size, checksum}; meta is optional but source-used (transfer_mgr -// passes it through to the receiver, where meta.receiver names a local -// endpoint to call after xfer_commit and before xfer_done). Preserve meta -// as an opaque blob — interpretation lives in fabric-update. +// transferMeta captures xfer_begin contents. The transfer target is explicit +// on the wire; firmware update uses target="updater/main". meta remains opaque +// and informational to Fabric. 
type transferMeta struct { - ID string - Size uint32 - Checksum string // xxHash32 hex (8 lower-case hex chars), no algorithm field - Meta json.RawMessage + ID string + Target string + Size uint32 + DigestAlg string + Digest string // xxHash32 hex (8 lower-case hex chars), seed 0 + Meta json.RawMessage } // transferInfo is internal-only state returned by the sink on Commit. It is // no longer wire-visible — xfer_done carries only xfer_id in the canonical -// schema; size/checksum reconciliation lives on xfer_commit. +// schema; size/digest reconciliation lives on xfer_commit. type transferInfo struct { BytesWritten uint32 SlotXIPAddr uint32 @@ -38,11 +39,18 @@ type transferInfo struct { // WriteChunk receives bytes at the given byte offset (matching xfer_chunk's // canonical wire fields). No sequence number is passed — the caller has // already validated offset against expected progress. +// +// Bytes() returns the committed payload bytes for target invocation. +// Only valid after Commit() has succeeded. May return nil if the sink +// streamed the bytes elsewhere (e.g. the RP2350 sink writes directly to +// flash and doesn't keep a RAM copy); updater/main consumes that staged +// stream from the updater package. type transferSink interface { WriteChunk(offset uint32, data []byte) error Commit() (transferInfo, error) Apply() error Abort(reason string) error + Bytes() []byte } type incomingTransfer struct { @@ -51,6 +59,7 @@ type incomingTransfer struct { bytesWritten uint32 chunksSeen uint32 hasher *xxhash.Hasher + idleRetries uint8 // deadline is the idle-chunk watchdog: bumped on every accepted chunk // and on initial xfer_begin. checkTransferTimeout fires if now > deadline. // Mirrors transfer_mgr.lua: `active.deadline = runtime.now() + phase_timeout`. 
@@ -61,10 +70,39 @@ func lowerHex(s string) string { return strings.ToLower(strings.TrimSpace(s)) } +func canonicalXXHash32Hex(s string) (string, bool) { + digest := lowerHex(s) + return digest, s == digest && validXXHash32Hex(digest) +} + +func validXXHash32Hex(s string) bool { + if len(s) != 8 { + return false + } + for i := 0; i < len(s); i++ { + c := s[i] + if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')) { + return false + } + } + return true +} + func u32s(v uint32) string { return strconvx.Itoa(int(v)) } +func decodeChunkData(encoded string) ([]byte, string) { + raw, err := base64.RawURLEncoding.DecodeString(encoded) + if err != nil { + return nil, "invalid_chunk_encoding" + } + if base64.RawURLEncoding.EncodeToString(raw) != encoded { + return nil, "invalid_chunk_encoding" + } + return raw, "" +} + func (s *session) sendTransferReady(id string) bool { return s.sendControl(marshal(protoXferReady{ Type: msgXferReady, @@ -124,9 +162,14 @@ func (s *session) checkTransferTimeout(now time.Time) { if !now.After(cur.deadline) { return } + if cur.idleRetries < transferIdleRetryLimit { + cur.idleRetries++ + cur.deadline = now.Add(s.cfg.PhaseTimeout) + s.logKV("transfer idle retry", "offset", u32s(cur.bytesWritten)) + s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) + return + } id := cur.meta.ID - println("[fabric]", "sid", s.localSID, "xfer_phase_timeout", - "id", id, "phase_s", u32s(uint32(s.cfg.PhaseTimeout/time.Second))) s.abortTransfer("timeout") s.sendTransferAbort(id, "timeout") } @@ -135,21 +178,34 @@ func validateTransferBegin(msg *protoXferBegin) (transferMeta, string) { if msg.XferID == "" { return transferMeta{}, "xfer_begin.xfer_id" } + if msg.Target == "" { + return transferMeta{}, "missing_target" + } + if msg.Target != transferTargetUpdaterMain { + return transferMeta{}, "unsupported_target" + } if msg.Size == 0 { return transferMeta{}, "xfer_begin.size" } - if msg.Checksum == "" { - return transferMeta{}, "xfer_begin.checksum" + if 
msg.DigestAlg != digestAlg { + return transferMeta{}, "unsupported_digest_alg" + } + digest, ok := canonicalXXHash32Hex(msg.Digest) + if !ok { + return transferMeta{}, "invalid_digest" } return transferMeta{ - ID: msg.XferID, - Size: msg.Size, - Checksum: lowerHex(msg.Checksum), - Meta: append(json.RawMessage(nil), msg.Meta...), + ID: msg.XferID, + Target: msg.Target, + Size: msg.Size, + DigestAlg: msg.DigestAlg, + Digest: digest, + Meta: append(json.RawMessage(nil), msg.Meta...), }, "" } func (s *session) onTransferBegin(msg *protoXferBegin) { + s.extendTransferQuiet("xfer_begin_rx", transferPrepareQuiet) meta, errStr := validateTransferBegin(msg) if errStr != "" { if msg.XferID != "" { @@ -159,6 +215,19 @@ func (s *session) onTransferBegin(msg *protoXferBegin) { return } if s.incomingTransfer != nil { + cur := s.incomingTransfer + if cur.meta.ID == meta.ID && + cur.meta.Size == meta.Size && + cur.meta.Target == meta.Target && + cur.meta.DigestAlg == meta.DigestAlg && + cur.meta.Digest == meta.Digest { + s.logKV("xfer_begin duplicate", "id", meta.ID) + if s.sendTransferReady(meta.ID) { + s.sendTransferNeed(meta.ID, cur.bytesWritten) + } + cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) + return + } s.sendTransferAbort(meta.ID, "busy") return } @@ -177,14 +246,9 @@ func (s *session) onTransferBegin(msg *protoXferBegin) { hasher: xxhash.New(0), deadline: time.Now().Add(s.cfg.PhaseTimeout), } - println( - "[fabric]", "sid", s.localSID, - "xfer_begin accepted", - "id", meta.ID, - "size", u32s(meta.Size), - "checksum", meta.Checksum, - ) - s.sendTransferReady(meta.ID) + if s.sendTransferReady(meta.ID) { + s.sendTransferNeed(meta.ID, 0) + } } func (s *session) onTransferChunk(msg *protoXferChunk) { @@ -193,42 +257,65 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { s.logKV("xfer_chunk dropped", "id", msg.XferID) return } - // Lua transfer_mgr.lua aborts and clears the active transfer on any - // chunk-level fault (unexpected offset, decode failure, size 
mismatch). - // Match that — do not send xfer_need + keep alive. id := cur.meta.ID - if msg.Offset != cur.bytesWritten { - println("[fabric]", "sid", s.localSID, "xfer_chunk aborted", - "id", id, "err", "unexpected_offset", - "off", u32s(msg.Offset), "want_off", u32s(cur.bytesWritten)) - s.abortTransfer("unexpected_offset") - s.sendTransferAbort(id, "unexpected_offset") + if msg.Offset < cur.bytesWritten { + s.sendTransferNeed(id, cur.bytesWritten) + cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) return } - raw, err := base64.RawURLEncoding.DecodeString(msg.Data) - if err != nil { - println("[fabric]", "sid", s.localSID, "xfer_chunk aborted", - "id", id, "err", "decode_failed", - "off", u32s(msg.Offset), "data_len", u32s(uint32(len(msg.Data)))) - s.abortTransfer("decode_failed") - s.sendTransferAbort(id, "decode_failed") + if msg.Offset > cur.bytesWritten { + s.sendTransferNeed(id, cur.bytesWritten) + return + } + raw, errStr := decodeChunkData(msg.Data) + if errStr != "" { + s.logKV("xfer_chunk decode retry", "err", errStr) + s.sendTransferNeed(id, cur.bytesWritten) + cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) return } if len(raw) == 0 { - println("[fabric]", "sid", s.localSID, "xfer_chunk aborted", - "id", id, "err", "empty_chunk", "off", u32s(msg.Offset)) s.abortTransfer("empty_chunk") s.sendTransferAbort(id, "empty_chunk") return } if cur.bytesWritten+uint32(len(raw)) > cur.meta.Size { - println("[fabric]", "sid", s.localSID, "xfer_chunk aborted", - "id", id, "err", "size_overflow", - "bytes_written", u32s(cur.bytesWritten), - "raw_len", u32s(uint32(len(raw))), - "total", u32s(cur.meta.Size)) - s.abortTransfer("size_overflow") - s.sendTransferAbort(id, "size_overflow") + reason := "size_too_large" + s.abortTransfer(reason) + s.sendTransferAbort(id, reason) + return + } + // Per-chunk integrity is required by the current MCU contract. 
+ // JSON parsing alone misses single-byte UART corruption inside the + // base64url data string: the bytes still decode, just to the wrong + // values. On mismatch we ask the sender to resume at the current + // byte offset instead of clearing the transfer. + want, ok := canonicalXXHash32Hex(msg.ChunkDigest) + if !ok { + reason := "invalid_chunk_digest" + if msg.ChunkDigest == "" { + reason = "missing_chunk_digest" + } + println( + "[fabric-xfer]", "abort_tx", + "id", id, + "reason", reason, + "offset", u32s(msg.Offset), + "digest_len", strconvx.Itoa(len(msg.ChunkDigest)), + "digest", msg.ChunkDigest, + "data_len", strconvx.Itoa(len(msg.Data)), + ) + s.abortTransfer(reason) + s.sendTransferAbort(id, reason) + return + } + got := xxhashHex(xxhash.Sum32(raw, 0)) + if got != want { + s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) + // Recovery counts as progress — bump the deadline so a burst + // of digest-mismatched chunks doesn't trip the idle watchdog + // mid-recovery. + cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) return } if err := cur.sink.WriteChunk(msg.Offset, raw); err != nil { @@ -241,21 +328,12 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { _, _ = cur.hasher.Write(raw) cur.bytesWritten += uint32(len(raw)) cur.chunksSeen++ + cur.idleRetries = 0 cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) - if cur.chunksSeen == 1 || (cur.chunksSeen%transferProgressLogEvery) == 0 { - println( - "[fabric]", "sid", s.localSID, - "xfer_chunk accepted", - "id", cur.meta.ID, - "off", u32s(msg.Offset), - "data_len", u32s(uint32(len(raw))), - "bytes_written", u32s(cur.bytesWritten), - ) - } raw = nil - // Forced GC after each absorbed chunk eliminates firmware-transfer byte - // drops on the safe-window allocator. Do NOT remove this without - // reproducing the regression in firmware-mono/docs/old/FABRIC_TRANSFER_FIX.md. + // Keep transfer memory bounded on TinyGo. 
The receiver allocates while + // unmarshalling JSON and decoding base64 chunks; without regular collection + // long updates can run out of heap before commit. runtime.GC() s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) } @@ -268,28 +346,29 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { } id := cur.meta.ID if msg.Size != cur.meta.Size || cur.bytesWritten != cur.meta.Size { - println("[fabric]", "sid", s.localSID, "xfer_commit failed", - "id", id, "err", "size_mismatch", - "bytes_written", u32s(cur.bytesWritten), - "msg_size", u32s(msg.Size), "meta_size", u32s(cur.meta.Size)) - s.abortTransfer("size_mismatch") - s.sendTransferAbort(id, "size_mismatch") + reason := "short_transfer" + s.abortTransfer(reason) + s.sendTransferAbort(id, reason) + return + } + if msg.DigestAlg != digestAlg { + s.abortTransfer("unsupported_digest_alg") + s.sendTransferAbort(id, "unsupported_digest_alg") + return + } + commitDigest, ok := canonicalXXHash32Hex(msg.Digest) + if !ok { + s.abortTransfer("invalid_digest") + s.sendTransferAbort(id, "invalid_digest") return } streamedHex := xxhashHex(cur.hasher.Sum32()) - commitChecksum := lowerHex(msg.Checksum) - if commitChecksum != cur.meta.Checksum || streamedHex != cur.meta.Checksum { - println("[fabric]", "sid", s.localSID, "xfer_commit failed", - "id", id, "err", "checksum_mismatch", - "begin", cur.meta.Checksum, - "commit", commitChecksum, - "streamed", streamedHex, - ) - s.abortTransfer("checksum_mismatch") - s.sendTransferAbort(id, "checksum_mismatch") + if commitDigest != cur.meta.Digest || streamedHex != cur.meta.Digest { + s.abortTransfer("digest_mismatch") + s.sendTransferAbort(id, "digest_mismatch") return } - info, err := cur.sink.Commit() + _, err := cur.sink.Commit() if err != nil { s.logKV("transfer commit failed", "err", err.Error()) reason := err.Error() @@ -298,22 +377,112 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { return } sink := cur.sink + meta := cur.meta + 
s.extendTransferQuiet("xfer_commit_target", transferCompleteQuiet) s.clearTransfer() - println( - "[fabric]", "sid", s.localSID, - "xfer_commit accepted", - "id", id, - "bytes_written", u32s(info.BytesWritten), - ) - if !s.sendTransferDone(id) { + + bytesPayload := sink.Bytes() + ok, reason := s.invokeTransferTarget(meta, id, bytesPayload) + if !ok { + s.extendTransferQuiet("xfer_target_rejected", transferCompleteQuiet) + s.sendTransferAbort(id, reason) return } - time.Sleep(postTransferDoneSettle) - if err := sink.Apply(); err != nil { - s.logKV("transfer apply failed", "err", err.Error()) - return + s.extendTransferQuiet("xfer_done", transferCompleteQuiet) + s.sendTransferDone(id) +} + +const targetCallTimeout = 5 * time.Second + +// invokeTransferTarget calls the local updater staging RPC named by +// xfer_begin.target. The wire no longer carries raw/member receiver topics; +// target="updater/main" maps to an internal bus RPC owned by the updater +// service. The reply gates whether fabric sends xfer_done or xfer_abort. 
+func (s *session) invokeTransferTarget(meta transferMeta, xferID string, artefact []byte) (bool, string) { + if meta.Target != transferTargetUpdaterMain { + return false, "unsupported_target" + } + payload := updater.StagePayload{ + LinkID: s.linkID, + XferID: xferID, + Target: meta.Target, + Size: meta.Size, + DigestAlg: meta.DigestAlg, + Digest: meta.Digest, + Meta: meta.Meta, + Artefact: artefact, + } + msg := s.conn.NewMessage(updater.TopicStageRPC, payload, false) + replySub := s.conn.Request(msg) + defer s.conn.Unsubscribe(replySub) + + select { + case rep, ok := <-replySub.Channel(): + if !ok || rep == nil { + return false, "stage_no_reply" + } + ok, reason := decodeStageReply(rep.Payload) + if !ok { + return false, reason + } + return true, "" + case <-time.After(targetCallTimeout): + return false, "stage_timeout" + } +} + +func decodeStageReply(payload any) (bool, string) { + switch v := payload.(type) { + case nil: + return false, "stage_nil_payload" + case updater.StageReply: + if !v.OK { + if v.Err == "" { + return false, "stage_rejected" + } + return false, v.Err + } + return true, "" + case *updater.StageReply: + if v == nil { + return false, "stage_nil_payload" + } + if !v.OK { + if v.Err == "" { + return false, "stage_rejected" + } + return false, v.Err + } + return true, "" + case map[string]any: + ok, _ := v["ok"].(bool) + if !ok { + err, _ := v["err"].(string) + if err == "" { + err = "stage_rejected" + } + return false, err + } + return true, "" + } + b, err := json.Marshal(payload) + if err != nil { + return false, "stage_marshal_failed" + } + var probe struct { + OK bool `json:"ok"` + Err string `json:"err"` + } + if err := json.Unmarshal(b, &probe); err != nil { + return false, "stage_unmarshal_failed" + } + if !probe.OK { + if probe.Err == "" { + return false, "stage_rejected" + } + return false, probe.Err } - println("[fabric]", "sid", s.localSID, "transfer apply ok", "id", id) + return true, "" } func (s *session) onTransferAbort(msg 
*protoXferAbort) { @@ -326,7 +495,6 @@ func (s *session) onTransferAbort(msg *protoXferAbort) { if reason == "" { reason = "remote_abort" } - println("[fabric]", "sid", s.localSID, "xfer_abort received", "id", cur.meta.ID, "reason", reason) s.abortTransfer(reason) } diff --git a/services/fabric/transfer_sink_buffer.go b/services/fabric/transfer_sink_buffer.go new file mode 100644 index 0000000..9cf3832 --- /dev/null +++ b/services/fabric/transfer_sink_buffer.go @@ -0,0 +1,80 @@ +package fabric + +import "errors" + +// bufferSink is the default transferSink for the fabric-update branch: +// it buffers the verified-by-wire (xxHash32) artefact in RAM and exposes +// the bytes via Bytes() so onTransferCommit can hand them to the +// updater/main staging RPC. The updater is responsible for signed-image +// verification and staging. +// +// Size cap is deliberately conservative: the smoke tests target small +// artefacts and large firmware images need a streaming-into-flash +// sink, which is fabric-security's job. Hitting the cap aborts the +// transfer cleanly via WriteChunk -> ErrArtefactTooLarge. +const maxArtefactBytes = 64 * 1024 + +var ErrArtefactTooLarge = errors.New("artefact_too_large") + +type bufferSink struct { + meta transferMeta + buf []byte + closed bool + committed bool +} + +func newBufferSink(meta transferMeta) *bufferSink { + return &bufferSink{ + meta: meta, + buf: make([]byte, 0, sizeHint(meta.Size)), + } +} + +func sizeHint(announced uint32) int { + if announced == 0 || announced > maxArtefactBytes { + return maxArtefactBytes + } + return int(announced) +} + +func (s *bufferSink) WriteChunk(off uint32, data []byte) error { + if s.closed { + return errors.New("sink_closed") + } + if int(off) != len(s.buf) { + return errors.New("unexpected_offset") + } + if len(s.buf)+len(data) > maxArtefactBytes { + return ErrArtefactTooLarge + } + s.buf = append(s.buf, data...) 
+ return nil +} + +func (s *bufferSink) Commit() (transferInfo, error) { + if s.closed { + return transferInfo{}, errors.New("sink_closed") + } + s.committed = true + return transferInfo{BytesWritten: uint32(len(s.buf))}, nil +} + +// Apply is a no-op for the buffer sink — the staged-image apply +// (slot switch + reboot) belongs to the updater's commit RPC, not to +// fabric's transfer state machine. fabric-security wires the real +// apply path through `cap/self/updater/main/rpc/commit-update`. +func (s *bufferSink) Apply() error { return nil } + +func (s *bufferSink) Abort(reason string) error { + _ = reason + s.buf = nil + s.closed = true + return nil +} + +func (s *bufferSink) Bytes() []byte { + if !s.committed { + return nil + } + return s.buf +} diff --git a/services/fabric/transfer_sink_rp2350.go b/services/fabric/transfer_sink_rp2350.go index 3360aa6..7b8fdd3 100644 --- a/services/fabric/transfer_sink_rp2350.go +++ b/services/fabric/transfer_sink_rp2350.go @@ -1,19 +1,62 @@ //go:build tinygo && rp2350 -// Default RP2350 transfer sink for the fabric-protocol baseline. Rejects all -// transfers at xfer_begin: signed-image verification and staged flash writes -// land in fabric-update via the receiver topic -// `raw/member/mcu/cap/updater/main/rpc/receive` and `pico2-a-b/imagev1/`. Until -// that path lands, the safe default is to refuse incoming transfers rather -// than flash unverified bytes directly into the inactive slot. 
- package fabric -import "errors" +import ( + "errors" + + "devicecode-go/services/updater" +) -var errTransferUnsupported = errors.New("staging_unavailable: signed-image receiver not present in this build") +type streamedStageSink struct { + accepted uint32 + closed bool +} func beginTransfer(meta transferMeta) (transferSink, error) { - _ = meta - return nil, errTransferUnsupported + if err := updater.BeginStreamedStage(meta.Size); err != nil { + return nil, err + } + return &streamedStageSink{}, nil } + +func (s *streamedStageSink) WriteChunk(off uint32, data []byte) error { + if s.closed { + return errors.New("sink_closed") + } + if s.accepted != off { + return errors.New("unexpected_offset") + } + if err := updater.WriteStreamedStage(data); err != nil { + return err + } + s.accepted += uint32(len(data)) + return nil +} + +func (s *streamedStageSink) Commit() (transferInfo, error) { + if s.closed { + return transferInfo{}, errors.New("sink_closed") + } + written, err := updater.CommitStreamedStage() + if err != nil { + return transferInfo{}, err + } + s.closed = true + return transferInfo{BytesWritten: written}, nil +} + +func (s *streamedStageSink) Apply() error { return nil } + +func (s *streamedStageSink) Abort(reason string) error { + _ = reason + updater.AbortStreamedStage() + s.closed = true + return nil +} + +// Bytes returns nil because the TinyGo RP2350 default path verifies the signed +// container while streaming and writes only the authenticated payload into the +// inactive slot. fabric still calls updater/main staging; the updater consumes +// the verified staged descriptor instead of an in-RAM artefact. 
+func (s *streamedStageSink) Bytes() []byte { return nil } diff --git a/services/fabric/transfer_sink_stub.go b/services/fabric/transfer_sink_stub.go index 6386f0a..f07a074 100644 --- a/services/fabric/transfer_sink_stub.go +++ b/services/fabric/transfer_sink_stub.go @@ -1,11 +1,11 @@ //go:build !(tinygo && rp2350) -package fabric - -import "errors" +// Host build (tests, dev tooling): same buffer-sink behaviour as the +// default RP2350 build. Lets unit tests exercise updater/main staging +// without firmware stubs in the way. -var errTransferUnsupported = errors.New("unsupported") +package fabric func beginTransfer(meta transferMeta) (transferSink, error) { - return nil, errTransferUnsupported + return newBufferSink(meta), nil } diff --git a/services/fabric/transfer_test.go b/services/fabric/transfer_test.go index 7837980..d3e9bcf 100644 --- a/services/fabric/transfer_test.go +++ b/services/fabric/transfer_test.go @@ -9,6 +9,7 @@ import ( "time" "devicecode-go/bus" + "devicecode-go/services/updater" "devicecode-go/x/xxhash" ) @@ -51,11 +52,15 @@ func (s *fakeTransferSink) Abort(reason string) error { return nil } +// Bytes returns nil because the test fake doesn't retain a RAM copy +// of the transferred bytes — it tracks per-chunk writes instead. +func (s *fakeTransferSink) Bytes() []byte { return nil } + func runSessionWithSink(ctx context.Context, tr Transport, conn *bus.Connection, sink *fakeTransferSink) { s := session{ linkID: defaultLinkID, - nodeID: "mcu-1", - peerID: "cm5-local", + nodeID: "mcu", + peerID: "bigbox-cm5", localSID: "mcu-sid-test", tr: tr, conn: conn, @@ -70,15 +75,82 @@ func rawURL(data []byte) string { return base64.RawURLEncoding.EncodeToString(data) } -// xxhashStr is the wire-format checksum: lower-case hex, 8 chars, no algorithm -// field. Mirrors the Lua reference's M.digest_hex. +// xxhashStr is the wire-format digest: lower-case hex, 8 chars. Mirrors +// the Lua reference's M.digest_hex. 
func xxhashStr(data []byte) string { return xxhashHex(xxhash.Sum32(data, 0)) } +func xferBegin(id string, payload []byte, meta json.RawMessage) protoXferBegin { + return protoXferBegin{ + Type: msgXferBegin, + XferID: id, + Target: updater.TargetUpdaterMain, + Size: uint32(len(payload)), + DigestAlg: updater.DigestAlgXXHash32, + Digest: xxhashStr(payload), + Meta: meta, + } +} + +func xferChunk(id string, off uint32, payload []byte) protoXferChunk { + return protoXferChunk{ + Type: msgXferChunk, + XferID: id, + Offset: off, + Data: rawURL(payload), + ChunkDigest: xxhashStr(payload), + } +} + +func xferCommit(id string, payload []byte) protoXferCommit { + return protoXferCommit{ + Type: msgXferCommit, + XferID: id, + Size: uint32(len(payload)), + DigestAlg: updater.DigestAlgXXHash32, + Digest: xxhashStr(payload), + } +} + +func installStageResponder(t *testing.T, b *bus.Bus, reply updater.StageReply) <-chan updater.StagePayload { + t.Helper() + conn := b.NewConnection("test-stage") + sub := conn.Subscribe(updater.TopicStageRPC) + t.Cleanup(func() { conn.Unsubscribe(sub) }) + got := make(chan updater.StagePayload, 4) + go func() { + for msg := range sub.Channel() { + if msg == nil { + continue + } + if payload, ok := msg.Payload.(updater.StagePayload); ok { + select { + case got <- payload: + default: + } + } + conn.Reply(msg, reply, false) + } + }() + return got +} + +func readTransferReady(t *testing.T, tr Transport, id string, next uint32) { + t.Helper() + ready := readMsg[protoXferReady](t, tr) + if ready.Type != msgXferReady || ready.XferID != id { + t.Fatalf("bad xfer_ready: %+v", ready) + } + need := readMsg[protoXferNeed](t, tr) + if need.Type != msgXferNeed || need.XferID != id || need.Next != next { + t.Fatalf("bad initial xfer_need: %+v, want id=%s next=%d", need, id, next) + } +} + func TestTransferBeginPreservesMeta(t *testing.T) { // xfer_begin's meta is opaque to fabric-protocol but must be preserved - // for fabric-update's receiver, which pulls 
meta.receiver out of it. + // for updater/main staging diagnostics. b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -88,8 +160,8 @@ func TestTransferBeginPreservesMeta(t *testing.T) { sink := &fakeTransferSink{} s := session{ linkID: defaultLinkID, - nodeID: "mcu-1", - peerID: "cm5-local", + nodeID: "mcu", + peerID: "bigbox-cm5", localSID: "mcu-sid-test", tr: mcu, conn: b.NewConnection("fabric"), @@ -102,16 +174,10 @@ func TestTransferBeginPreservesMeta(t *testing.T) { bringUp(t, cm5) payload := []byte("abcd") - metaBlob := json.RawMessage(`{"receiver":["raw","member","mcu","cap","updater","main","rpc","receive"],"version":"1.2.3"}`) + metaBlob := json.RawMessage(`{"version":"1.2.3"}`) - sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-meta", - Size: uint32(len(payload)), - Checksum: xxhashStr(payload), - Meta: metaBlob, - }) - _ = readMsg[protoXferReady](t, cm5) + sendMsg(t, cm5, xferBegin("xfer-meta", payload, metaBlob)) + readTransferReady(t, cm5, "xfer-meta", 0) if string(captured.Meta) != string(metaBlob) { t.Fatalf("transferMeta.Meta = %q, want %q", captured.Meta, metaBlob) @@ -119,6 +185,29 @@ func TestTransferBeginPreservesMeta(t *testing.T) { if captured.ID != "xfer-meta" || captured.Size != uint32(len(payload)) { t.Fatalf("transferMeta basic fields wrong: %+v", captured) } + if captured.Target != updater.TargetUpdaterMain || captured.DigestAlg != updater.DigestAlgXXHash32 || captured.Digest != xxhashStr(payload) { + t.Fatalf("transferMeta contract fields wrong: %+v", captured) + } +} + +func TestTransferDuplicateBeginResendsReady(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + begin := xferBegin("xfer-dup", payload, nil) + + sendMsg(t, cm5, begin) + 
readTransferReady(t, cm5, "xfer-dup", 0) + + sendMsg(t, cm5, begin) + readTransferReady(t, cm5, "xfer-dup", 0) } func TestTransferReceiveSuccess(t *testing.T) { @@ -133,34 +222,19 @@ func TestTransferReceiveSuccess(t *testing.T) { SlotXIPAddr: 0x10280000, }, } + stageCalls := installStageResponder(t, b, updater.StageReply{OK: true, Stage: "staged"}) go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) bringUp(t, cm5) payload := []byte("abcdefghij") - checksum := xxhashStr(payload) - - sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-2", - Size: uint32(len(payload)), - Checksum: checksum, - }) - - ready := readMsg[protoXferReady](t, cm5) - if ready.Type != msgXferReady || ready.XferID != "xfer-2" { - t.Fatalf("bad xfer_ready: %+v", ready) - } + sendMsg(t, cm5, xferBegin("xfer-2", payload, nil)) + readTransferReady(t, cm5, "xfer-2", 0) parts := [][]byte{payload[:4], payload[4:8], payload[8:]} off := uint32(0) for i, part := range parts { - sendMsg(t, cm5, protoXferChunk{ - Type: msgXferChunk, - XferID: "xfer-2", - Offset: off, - Data: rawURL(part), - }) + sendMsg(t, cm5, xferChunk("xfer-2", off, part)) need := readMsg[protoXferNeed](t, cm5) want := off + uint32(len(part)) if need.Next != want { @@ -169,19 +243,21 @@ func TestTransferReceiveSuccess(t *testing.T) { off = want } - sendMsg(t, cm5, protoXferCommit{ - Type: msgXferCommit, - XferID: "xfer-2", - Size: uint32(len(payload)), - Checksum: checksum, - }) + sendMsg(t, cm5, xferCommit("xfer-2", payload)) done := readMsg[protoXferDone](t, cm5) if done.Type != msgXferDone || done.XferID != "xfer-2" { t.Fatalf("bad xfer_done: %+v", done) } - time.Sleep(postTransferDoneSettle + 50*time.Millisecond) + select { + case call := <-stageCalls: + if call.XferID != "xfer-2" || call.Target != updater.TargetUpdaterMain || call.Digest != xxhashStr(payload) { + t.Fatalf("stage payload wrong: %+v", call) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for stage call") + } if got 
:= string(sink.writes[0]) + string(sink.writes[1]) + string(sink.writes[2]); got != string(payload) { t.Fatalf("sink writes = %q, want %q", got, payload) @@ -189,55 +265,111 @@ func TestTransferReceiveSuccess(t *testing.T) { if !sink.committed { t.Fatal("sink.Commit was not called") } - if !sink.applied { - t.Fatal("sink.Apply was not called") + if sink.applied { + t.Fatal("sink.Apply should not be called by strict target staging") } } -func TestTransferChunkBadOffsetAborts(t *testing.T) { - // Lua transfer_mgr aborts and clears the active transfer on chunk faults - // (unexpected_offset, decode_failed, size_overflow). Match that — do not - // keep the transfer alive with an xfer_need. +func TestTransferChunkFutureOffsetRequestsCurrentAndCompletes(t *testing.T) { b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) defer cancel() sink := &fakeTransferSink{} + installStageResponder(t, b, updater.StageReply{OK: true, Stage: "staged"}) go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) bringUp(t, cm5) payload := []byte("abcd") - sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-3", - Size: uint32(len(payload)), - Checksum: xxhashStr(payload), - }) - _ = readMsg[protoXferReady](t, cm5) + sendMsg(t, cm5, xferBegin("xfer-future-offset", payload, nil)) + readTransferReady(t, cm5, "xfer-future-offset", 0) - // Send a chunk at the wrong byte offset; expect xfer_abort and - // sink.Abort, not an xfer_need retry. 
- sendMsg(t, cm5, protoXferChunk{ - Type: msgXferChunk, - XferID: "xfer-3", - Offset: 7, - Data: rawURL(payload), - }) + sendMsg(t, cm5, xferChunk("xfer-future-offset", 7, payload)) - abort := readMsg[protoXferAbort](t, cm5) - if abort.Type != msgXferAbort || abort.XferID != "xfer-3" || abort.Err != "unexpected_offset" { - t.Fatalf("bad xfer_abort: %+v", abort) + need := readMsg[protoXferNeed](t, cm5) + if need.Type != msgXferNeed || need.XferID != "xfer-future-offset" || need.Next != 0 { + t.Fatalf("future offset retry xfer_need = %+v, want next=0", need) } if len(sink.writes) != 0 { t.Fatalf("sink received %d writes, want 0", len(sink.writes)) } - if len(sink.abortReasons) == 0 { - t.Fatal("expected sink.Abort to be called on chunk fault") + if len(sink.abortReasons) != 0 { + t.Fatalf("sink.Abort called on recoverable future offset: %v", sink.abortReasons) + } + + sendMsg(t, cm5, xferChunk("xfer-future-offset", 0, payload)) + need = readMsg[protoXferNeed](t, cm5) + if need.Next != uint32(len(payload)) { + t.Fatalf("xfer_need.next after recovery = %d, want %d", need.Next, len(payload)) + } + + sendMsg(t, cm5, xferCommit("xfer-future-offset", payload)) + done := readMsg[protoXferDone](t, cm5) + if done.Type != msgXferDone || done.XferID != "xfer-future-offset" { + t.Fatalf("bad xfer_done: %+v", done) + } + if got := string(sink.writes[0]); got != string(payload) { + t.Fatalf("sink writes = %q, want %q", got, payload) + } + if !sink.committed { + t.Fatal("sink.Commit was not called") + } +} + +func TestTransferChunkStaleOffsetRequestsCurrentAndCompletes(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + installStageResponder(t, b, updater.StageReply{OK: true, Stage: "staged"}) + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcdef") + sendMsg(t, cm5, xferBegin("xfer-stale-offset", payload, 
nil)) + readTransferReady(t, cm5, "xfer-stale-offset", 0) + + sendMsg(t, cm5, xferChunk("xfer-stale-offset", 0, []byte("abc"))) + need := readMsg[protoXferNeed](t, cm5) + if need.Next != 3 { + t.Fatalf("xfer_need.next after first chunk = %d, want 3", need.Next) + } + + sendMsg(t, cm5, xferChunk("xfer-stale-offset", 0, []byte("abc"))) + need = readMsg[protoXferNeed](t, cm5) + if need.Type != msgXferNeed || need.XferID != "xfer-stale-offset" || need.Next != 3 { + t.Fatalf("stale offset retry xfer_need = %+v, want next=3", need) + } + if len(sink.writes) != 1 { + t.Fatalf("sink received %d writes after stale duplicate, want 1", len(sink.writes)) + } + if len(sink.abortReasons) != 0 { + t.Fatalf("sink.Abort called on recoverable stale offset: %v", sink.abortReasons) + } + + sendMsg(t, cm5, xferChunk("xfer-stale-offset", 3, []byte("def"))) + need = readMsg[protoXferNeed](t, cm5) + if need.Next != uint32(len(payload)) { + t.Fatalf("xfer_need.next after recovery = %d, want %d", need.Next, len(payload)) + } + + sendMsg(t, cm5, xferCommit("xfer-stale-offset", payload)) + done := readMsg[protoXferDone](t, cm5) + if done.Type != msgXferDone || done.XferID != "xfer-stale-offset" { + t.Fatalf("bad xfer_done: %+v", done) + } + if got := string(sink.writes[0]) + string(sink.writes[1]); got != string(payload) { + t.Fatalf("sink writes = %q, want %q", got, payload) + } + if !sink.committed { + t.Fatal("sink.Commit was not called") } } -func TestTransferChunkDecodeFailureAborts(t *testing.T) { +func TestTransferChunkDecodeFailureRequestsSameOffset(t *testing.T) { b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -248,28 +380,99 @@ func TestTransferChunkDecodeFailureAborts(t *testing.T) { bringUp(t, cm5) payload := []byte("abcd") - sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-d1", - Size: uint32(len(payload)), - Checksum: xxhashStr(payload), - }) - _ = readMsg[protoXferReady](t, cm5) + sendMsg(t, cm5, 
xferBegin("xfer-d1", payload, nil)) + readTransferReady(t, cm5, "xfer-d1", 0) // Bogus base64 (uses non-base64url chars). + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-d1", + Offset: 0, + Data: "!!!not-base64!!!", + ChunkDigest: xxhashStr(payload), + }) + + need := readMsg[protoXferNeed](t, cm5) + if need.Type != msgXferNeed || need.XferID != "xfer-d1" || need.Next != 0 { + t.Fatalf("bad retry xfer_need: %+v", need) + } + if len(sink.abortReasons) != 0 { + t.Fatalf("sink.Abort called on recoverable decode failure: %v", sink.abortReasons) + } + if len(sink.writes) != 0 { + t.Fatalf("sink received %d writes before decode passed", len(sink.writes)) + } + + sendMsg(t, cm5, xferChunk("xfer-d1", 0, payload)) + need = readMsg[protoXferNeed](t, cm5) + if need.Next != uint32(len(payload)) { + t.Fatalf("xfer_need.next after retry = %d, want %d", need.Next, len(payload)) + } +} + +func TestTransferChunkMissingDigestAborts(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-missing-digest", payload, nil)) + readTransferReady(t, cm5, "xfer-missing-digest", 0) + sendMsg(t, cm5, protoXferChunk{ Type: msgXferChunk, - XferID: "xfer-d1", + XferID: "xfer-missing-digest", Offset: 0, - Data: "!!!not-base64!!!", + Data: rawURL(payload), }) abort := readMsg[protoXferAbort](t, cm5) - if abort.Err != "decode_failed" { + if abort.Err != "missing_chunk_digest" { t.Fatalf("bad xfer_abort: %+v", abort) } if len(sink.abortReasons) == 0 { - t.Fatal("expected sink.Abort on decode failure") + t.Fatal("expected sink.Abort on missing chunk digest") + } +} + +func TestTransferChunkDigestMismatchRequestsSameOffset(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := 
context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-bad-chunk-digest", payload, nil)) + readTransferReady(t, cm5, "xfer-bad-chunk-digest", 0) + + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-bad-chunk-digest", + Offset: 0, + Data: rawURL(payload), + ChunkDigest: "00000000", + }) + need := readMsg[protoXferNeed](t, cm5) + if need.Next != 0 { + t.Fatalf("retry xfer_need.next = %d, want 0", need.Next) + } + if len(sink.writes) != 0 { + t.Fatalf("sink received %d writes before digest passed", len(sink.writes)) + } + + sendMsg(t, cm5, xferChunk("xfer-bad-chunk-digest", 0, payload)) + need = readMsg[protoXferNeed](t, cm5) + if need.Next != uint32(len(payload)) { + t.Fatalf("xfer_need.next after retry = %d, want %d", need.Next, len(payload)) } } @@ -285,28 +488,24 @@ func TestTransferChunkSizeOverflowAborts(t *testing.T) { payload := []byte("abcd") // Advertise size=4 but send 6 bytes in the first chunk. 
- sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-d2", - Size: uint32(len(payload)), - Checksum: xxhashStr(payload), - }) - _ = readMsg[protoXferReady](t, cm5) + sendMsg(t, cm5, xferBegin("xfer-d2", payload, nil)) + readTransferReady(t, cm5, "xfer-d2", 0) sendMsg(t, cm5, protoXferChunk{ - Type: msgXferChunk, - XferID: "xfer-d2", - Offset: 0, - Data: rawURL([]byte("abcdef")), + Type: msgXferChunk, + XferID: "xfer-d2", + Offset: 0, + Data: rawURL([]byte("abcdef")), + ChunkDigest: xxhashStr([]byte("abcdef")), }) abort := readMsg[protoXferAbort](t, cm5) - if abort.Err != "size_overflow" { + if abort.Err != "size_too_large" { t.Fatalf("bad xfer_abort: %+v", abort) } } -func TestTransferCommitChecksumMismatchAborts(t *testing.T) { +func TestTransferCommitDigestMismatchAborts(t *testing.T) { b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -317,40 +516,152 @@ func TestTransferCommitChecksumMismatchAborts(t *testing.T) { bringUp(t, cm5) payload := []byte("abcd") - // Begin with the wrong-checksum advertised. The only way to surface a - // commit-time mismatch is for begin/commit checksums to disagree, OR for - // the streamed bytes to disagree with the begin checksum. Use the - // latter: claim a bogus begin/commit checksum but stream the real bytes. + // Begin with the wrong digest advertised. The streamed bytes disagree + // with the begin/commit digest even though the frames agree. 
bogus := strings.Repeat("0", 8) sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-4", - Size: uint32(len(payload)), - Checksum: bogus, + Type: msgXferBegin, + XferID: "xfer-4", + Target: updater.TargetUpdaterMain, + Size: uint32(len(payload)), + DigestAlg: updater.DigestAlgXXHash32, + Digest: bogus, }) - _ = readMsg[protoXferReady](t, cm5) + readTransferReady(t, cm5, "xfer-4", 0) - sendMsg(t, cm5, protoXferChunk{ - Type: msgXferChunk, - XferID: "xfer-4", - Offset: 0, - Data: rawURL(payload), - }) + sendMsg(t, cm5, xferChunk("xfer-4", 0, payload)) _ = readMsg[protoXferNeed](t, cm5) sendMsg(t, cm5, protoXferCommit{ - Type: msgXferCommit, - XferID: "xfer-4", - Size: uint32(len(payload)), - Checksum: bogus, + Type: msgXferCommit, + XferID: "xfer-4", + Size: uint32(len(payload)), + DigestAlg: updater.DigestAlgXXHash32, + Digest: bogus, }) abort := readMsg[protoXferAbort](t, cm5) - if abort.Type != msgXferAbort || abort.Err != "checksum_mismatch" { + if abort.Type != msgXferAbort || abort.Err != "digest_mismatch" { t.Fatalf("bad xfer_abort: %+v", abort) } if len(sink.abortReasons) == 0 { - t.Fatal("expected sink abort on checksum mismatch") + t.Fatal("expected sink abort on digest mismatch") + } +} + +// bufferingSinkAdapter wraps the production bufferSink so transfer tests +// can assert the bytes passed to updater/main staging. +type bufferingSinkAdapter struct { + *bufferSink + abortReasons []string +} + +func (b *bufferingSinkAdapter) Abort(reason string) error { + b.abortReasons = append(b.abortReasons, reason) + return b.bufferSink.Abort(reason) +} + +func TestTransferTargetInvokedAfterCommit(t *testing.T) { + // With target=updater/main, fabric calls the local updater stage RPC + // after xfer_commit and before xfer_done. The wire never names a + // raw/member receiver topic. 
+ b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + gotPayload := installStageResponder(t, b, updater.StageReply{OK: true, Stage: "staged"}) + + sink := &bufferingSinkAdapter{bufferSink: newBufferSink(transferMeta{Size: 4})} + s := session{ + linkID: defaultLinkID, + nodeID: "mcu", + peerID: "bigbox-cm5", + localSID: "mcu-sid-test", + tr: mcu, + conn: b.NewConnection("fabric"), + beginTransfer: func(meta transferMeta) (transferSink, error) { + sink.bufferSink.meta = meta + return sink, nil + }, + } + go s.run(ctx) + bringUp(t, cm5) + + payload := []byte("abcd") + metaBlob := json.RawMessage(`{"version":"1.2.3"}`) + + sendMsg(t, cm5, xferBegin("xfer-stage", payload, metaBlob)) + readTransferReady(t, cm5, "xfer-stage", 0) + + sendMsg(t, cm5, xferChunk("xfer-stage", 0, payload)) + _ = readMsg[protoXferNeed](t, cm5) + + sendMsg(t, cm5, xferCommit("xfer-stage", payload)) + + select { + case p := <-gotPayload: + if p.XferID != "xfer-stage" { + t.Fatalf("stage xfer_id = %v, want xfer-stage", p.XferID) + } + if p.LinkID != defaultLinkID { + t.Fatalf("stage link_id = %q, want %q", p.LinkID, defaultLinkID) + } + if p.Target != updater.TargetUpdaterMain || p.DigestAlg != updater.DigestAlgXXHash32 || p.Digest != xxhashStr(payload) { + t.Fatalf("stage contract fields wrong: %+v", p) + } + if string(p.Artefact) != string(payload) { + t.Fatalf("stage artefact = %v, want %q", p.Artefact, payload) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for stage call") + } + + done := readMsg[protoXferDone](t, cm5) + if done.XferID != "xfer-stage" { + t.Fatalf("xfer_done xfer_id = %q, want xfer-stage", done.XferID) + } +} + +func TestTransferTargetRejectAbortsTransfer(t *testing.T) { + // updater/main stage replies {ok=false, err=...}. fabric must send + // xfer_abort with the stage reason rather than xfer_done. 
+ b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + _ = installStageResponder(t, b, updater.StageReply{OK: false, Err: "manifest_check_failed"}) + + sink := &bufferingSinkAdapter{bufferSink: newBufferSink(transferMeta{Size: 4})} + s := session{ + linkID: defaultLinkID, + nodeID: "mcu", + peerID: "bigbox-cm5", + localSID: "mcu-sid-test", + tr: mcu, + conn: b.NewConnection("fabric"), + beginTransfer: func(meta transferMeta) (transferSink, error) { + sink.bufferSink.meta = meta + return sink, nil + }, + } + go s.run(ctx) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-rej", payload, nil)) + readTransferReady(t, cm5, "xfer-rej", 0) + sendMsg(t, cm5, xferChunk("xfer-rej", 0, payload)) + _ = readMsg[protoXferNeed](t, cm5) + sendMsg(t, cm5, xferCommit("xfer-rej", payload)) + + abort := readMsg[protoXferAbort](t, cm5) + if abort.XferID != "xfer-rej" { + t.Fatalf("xfer_abort xfer_id = %q, want xfer-rej", abort.XferID) + } + if abort.Err != "manifest_check_failed" { + t.Fatalf("xfer_abort err = %q, want manifest_check_failed", abort.Err) } } @@ -367,8 +678,8 @@ func TestTransferIdleChunkWatchdog(t *testing.T) { sink := &fakeTransferSink{} s := session{ linkID: defaultLinkID, - nodeID: "mcu-1", - peerID: "cm5-local", + nodeID: "mcu", + peerID: "bigbox-cm5", localSID: "mcu-sid-test", tr: mcu, conn: b.NewConnection("fabric"), @@ -381,16 +692,18 @@ func TestTransferIdleChunkWatchdog(t *testing.T) { bringUp(t, cm5) payload := []byte("abcd") - sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-wd", - Size: uint32(len(payload)), - Checksum: xxhashStr(payload), - }) - _ = readMsg[protoXferReady](t, cm5) + sendMsg(t, cm5, xferBegin("xfer-wd", payload, nil)) + readTransferReady(t, cm5, "xfer-wd", 0) - // Stop sending chunks; watchdog should fire within ~PhaseTimeout + - // one exportTickInterval (50ms). + // Stop sending chunks. 
The watchdog should resend the current offset a + // bounded number of times before aborting, so a lost xfer_need does not + // strand both sides until the first idle timeout. + for i := 0; i < transferIdleRetryLimit; i++ { + need := readMsg[protoXferNeed](t, cm5) + if need.Type != msgXferNeed || need.XferID != "xfer-wd" || need.Next != 0 { + t.Fatalf("bad retry xfer_need[%d]: %+v", i, need) + } + } abort := readMsg[protoXferAbort](t, cm5) if abort.Type != msgXferAbort || abort.XferID != "xfer-wd" || abort.Err != "timeout" { t.Fatalf("bad xfer_abort: %+v", abort) @@ -400,8 +713,8 @@ func TestTransferIdleChunkWatchdog(t *testing.T) { } } -func TestTransferCommitChecksumMismatchOnCommitFrameAborts(t *testing.T) { - // xfer_begin and xfer_commit must agree on the checksum. If they +func TestTransferCommitDigestMismatchOnCommitFrameAborts(t *testing.T) { + // xfer_begin and xfer_commit must agree on the digest. If they // disagree (even when the streamed bytes match begin), commit aborts. b := newBus() cm5, mcu := pipePair() @@ -413,33 +726,22 @@ func TestTransferCommitChecksumMismatchOnCommitFrameAborts(t *testing.T) { bringUp(t, cm5) payload := []byte("abcd") - good := xxhashStr(payload) - sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-5", - Size: uint32(len(payload)), - Checksum: good, - }) - _ = readMsg[protoXferReady](t, cm5) + sendMsg(t, cm5, xferBegin("xfer-5", payload, nil)) + readTransferReady(t, cm5, "xfer-5", 0) - sendMsg(t, cm5, protoXferChunk{ - Type: msgXferChunk, - XferID: "xfer-5", - Offset: 0, - Data: rawURL(payload), - }) + sendMsg(t, cm5, xferChunk("xfer-5", 0, payload)) _ = readMsg[protoXferNeed](t, cm5) - // Commit advertises a different checksum than begin: must abort. + // Commit advertises a different digest than begin: must abort. 
sendMsg(t, cm5, protoXferCommit{ - Type: msgXferCommit, - XferID: "xfer-5", - Size: uint32(len(payload)), - Checksum: strings.Repeat("0", 8), + Type: msgXferCommit, + XferID: "xfer-5", + Size: uint32(len(payload)), + DigestAlg: updater.DigestAlgXXHash32, + Digest: strings.Repeat("0", 8), }) - abort := readMsg[protoXferAbort](t, cm5) - if abort.Type != msgXferAbort || abort.Err != "checksum_mismatch" { + if abort.Type != msgXferAbort || abort.Err != "digest_mismatch" { t.Fatalf("bad xfer_abort: %+v", abort) } } diff --git a/services/fabric/transport_limits.go b/services/fabric/transport_limits.go index 7f5afec..8d689ad 100644 --- a/services/fabric/transport_limits.go +++ b/services/fabric/transport_limits.go @@ -3,8 +3,8 @@ package fabric import "fmt" // maxLineLen caps a single fabric frame (line-delimited JSON) end-to-end. -// It must clear the release transfer chunk: 1024 raw bytes becomes about -// 1366 base64url chars, plus JSON envelope and newline. 4096 leaves margin +// It must clear the release transfer chunk: 2048 raw bytes becomes about +// 2731 base64url chars, plus JSON envelope and newline. 4096 leaves margin // while keeping malformed lines bounded. const maxLineLen = 4096 diff --git a/services/fabric/writer.go b/services/fabric/writer.go index 5d3f596..4ca94df 100644 --- a/services/fabric/writer.go +++ b/services/fabric/writer.go @@ -43,6 +43,18 @@ func (l *txLane) pop() []byte { // (e.g. drainExports + drainOutbound generating frames back-to-back). 
func (s *session) enqueueFrame(l lane, data []byte) bool { s.lane(l).push(data) + if fabricTraceEnabled { + println( + "[fabric]", "sid", s.localSID, + "enqueue_frame", + "lane", laneName(l), + "type", protoType(data), + "len", len(data), + "q_control", s.txControl.len(), + "q_rpc", s.txRPC.len(), + "q_bulk", s.txBulk.len(), + ) + } return s.flushWriter() } @@ -59,6 +71,19 @@ func (s *session) lane(l lane) *txLane { } } +func laneName(l lane) string { + switch l { + case laneControl: + return "control" + case laneRPC: + return "rpc" + case laneBulk: + return "bulk" + default: + return "unknown" + } +} + // flushWriter writes queued frames to the transport in priority order: // 1. drain controlQ fully (no fairness), // 2. weighted RR between rpcQ and bulkQ until both empty. @@ -75,18 +100,18 @@ func (s *session) flushWriter() bool { bulkQ = 1 } for s.txControl.len() > 0 { - if !s.writeFrame(s.txControl.pop()) { + if !s.writeFrame(laneControl, s.txControl.pop()) { return false } } for s.txRPC.len() > 0 || s.txBulk.len() > 0 { for i := 0; i < rpcQ && s.txRPC.len() > 0; i++ { - if !s.writeFrame(s.txRPC.pop()) { + if !s.writeFrame(laneRPC, s.txRPC.pop()) { return false } } for i := 0; i < bulkQ && s.txBulk.len() > 0; i++ { - if !s.writeFrame(s.txBulk.pop()) { + if !s.writeFrame(laneBulk, s.txBulk.pop()) { return false } } @@ -96,10 +121,20 @@ func (s *session) flushWriter() bool { // writeFrame is the actual transport write. Mirrors what the prior // sendFrame did inline; isolated so flushWriter can call it per-frame. 
-func (s *session) writeFrame(data []byte) bool { +func (s *session) writeFrame(l lane, data []byte) bool { if len(data) > 0 && data[len(data)-1] == '\n' { data = data[:len(data)-1] } + if fabricTraceEnabled { + println( + "[fabric]", "sid", s.localSID, + "tx_frame", + "lane", laneName(l), + "type", protoType(data), + "len", len(data), + "line", tracePreview(data), + ) + } if err := s.tr.WriteLine(data); err != nil { if errors.Is(err, ErrLineTooLong) { s.log("oversized write dropped") From ebfb52cebbc591734ec73d4b61561b63bcd28408 Mon Sep 17 00:00:00 2001 From: cpunt Date: Wed, 27 May 2026 16:05:50 +0000 Subject: [PATCH 3/6] reactor: wire updater and MCU telemetry facts --- services/reactor/reactor.go | 78 +++- services/reactor/reactor_test.go | 35 ++ services/telemetry/alerts.go | 199 ++++++++++ services/telemetry/telemetry.go | 521 +++++++++++++++++++++++++++ services/telemetry/telemetry_test.go | 410 +++++++++++++++++++++ 5 files changed, 1237 insertions(+), 6 deletions(-) create mode 100644 services/reactor/reactor_test.go create mode 100644 services/telemetry/alerts.go create mode 100644 services/telemetry/telemetry.go create mode 100644 services/telemetry/telemetry_test.go diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index 9429b2c..257eaa5 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -9,13 +9,49 @@ import ( "devicecode-go/bus" "devicecode-go/services/fabric" + "devicecode-go/services/telemetry" + "devicecode-go/services/updater" "devicecode-go/types" "devicecode-go/utilities" "devicecode-go/x/shmring" "devicecode-go/x/strconvx" ) -const fabricWaitLogInterval = 2 * time.Second +// FirmwareVersion/FirmwareBuild/FirmwareImageID are the stamps the updater +// publishes via state/self/software. main may override them before the reactor +// starts; defaults are development sentinels. 
+var ( + FirmwareVersion = "0.0.0-dev" + FirmwareBuild = "local" + FirmwareImageID = "img-dev" +) + +func firmwareIdentity() updater.Identity { + return updater.Identity{ + Version: FirmwareVersion, + Build: FirmwareBuild, + ImageID: FirmwareImageID, + } +} + +const ( + fabricWaitLogInterval = 2 * time.Second + fabricStopWaitTimeout = 500 * time.Millisecond +) + +func waitFabricDone(done <-chan struct{}, timeout time.Duration) bool { + if done == nil { + return true + } + timer := time.NewTimer(timeout) + defer timer.Stop() + select { + case <-done: + return true + case <-timer.C: + return false + } +} // ----------------------------------------------------------------------------- // Thresholds & timing @@ -162,6 +198,9 @@ type Reactor struct { // misc now time.Time + + // updater service handle used by the post-hello_ack republish hook. + updater *updater.Service } func NewReactor(b *bus.Bus, uiConn *bus.Connection) *Reactor { @@ -398,6 +437,31 @@ func (r *Reactor) emitMemSnapshot() { } func (r *Reactor) Run(ctx context.Context) { + // Updater service: state machine + updater prepare/commit RPC + // RPC handlers + updater/main staging + retained state/self/{software, + // updater, health} facts. Started early so the initial fact retains + // land before fabric establishes — that way the first hello_ack + // observer sees a populated retain store. + updaterConn := r.bus.NewConnection("updater") + identity := firmwareIdentity() + updaterSvc := updater.New(updater.Options{ + Conn: updaterConn, + Verifier: updater.PassthroughVerifier(identity), + Applier: updater.ProductionApplier(), + Identity: identity, + }) + go updaterSvc.Run(ctx) + r.updater = updaterSvc + + // Telemetry service: subscribes to HAL value topics and republishes + // at state/self/* with integer engineering units; runs the charger + // alert FSM and emits event/self/power/charger/alert on bit-set + // transitions. Started after the updater so the initial software/ + // updater retains land first. 
+ telemetryConn := r.bus.NewConnection("telemetry") + telemetrySvc := telemetry.New(telemetryConn) + go telemetrySvc.Run(ctx) + // Subscriptions (env + power) log.Println("[main] subscribing env + power …") tempSub := r.uiConn.Subscribe(tTempValue) @@ -426,11 +490,12 @@ func (r *Reactor) Run(ctx context.Context) { if fabricCancel == nil { return } + done := fabricDone fabricCancel() fabricCancel = nil - if fabricDone != nil { - <-fabricDone - fabricDone = nil + fabricDone = nil + if !waitFabricDone(done, fabricStopWaitTimeout) { + log.Println("[uart1] fabric session stop timed out") } } @@ -456,11 +521,12 @@ func (r *Reactor) Run(ctx context.Context) { fabricCancel = cancel fabricDone = done fabricSessionOpen = true + log.Println("[uart1] fabric session opening node=mcu peer=bigbox-cm5 link=mcu-uart0") go func() { defer close(done) - fabric.Run(fabricCtx, tr, fabricConn, "mcu-1", "cm5", fabric.DefaultLinkConfig()) + fabric.Run(fabricCtx, tr, fabricConn, "mcu", "bigbox-cm5", fabric.DefaultLinkConfig()) }() - log.Println("[uart1] fabric session opened") + log.Println("[uart1] fabric session opened node=mcu peer=bigbox-cm5 link=mcu-uart0") } case <-subSessClosedFabric.Channel(): // Ignore stale close events — the open handler already tears down diff --git a/services/reactor/reactor_test.go b/services/reactor/reactor_test.go new file mode 100644 index 0000000..5a9a1a3 --- /dev/null +++ b/services/reactor/reactor_test.go @@ -0,0 +1,35 @@ +//go:build !qa_reactor + +package reactor + +import ( + "testing" + "time" +) + +func TestWaitFabricDoneNil(t *testing.T) { + if !waitFabricDone(nil, time.Millisecond) { + t.Fatal("nil fabric done channel should be treated as stopped") + } +} + +func TestWaitFabricDoneClosed(t *testing.T) { + done := make(chan struct{}) + close(done) + + if !waitFabricDone(done, 50*time.Millisecond) { + t.Fatal("closed fabric done channel should report stopped") + } +} + +func TestWaitFabricDoneTimeout(t *testing.T) { + done := make(chan struct{}) + 
start := time.Now() + + if waitFabricDone(done, 10*time.Millisecond) { + t.Fatal("open fabric done channel should time out") + } + if elapsed := time.Since(start); elapsed > 250*time.Millisecond { + t.Fatalf("timeout wait took too long: %s", elapsed) + } +} diff --git a/services/telemetry/alerts.go b/services/telemetry/alerts.go new file mode 100644 index 0000000..ffb4982 --- /dev/null +++ b/services/telemetry/alerts.go @@ -0,0 +1,199 @@ +package telemetry + +import "devicecode-go/types" + +// chargerAlertFSM implements W8 from docs/firmware-alignment-update.md: +// hold previous bitfield state; on bit-set transition for a kind, emit +// one normal event with the canonical kind name. The 14 canonical kinds +// split into: +// - 11 bit-driven kinds (state[] + status[]), compared against the +// previous ChargerValue snapshot +// - 3 analog kinds (vin_lo / vin_hi / bsr_high), compared against +// the thresholds carried on state/self/power/charger/config. +// vin_lo + vin_hi observe ChargerValue.VIN_mV; bsr_high observes +// BatteryValue.BSR_uOhmPerCell. +// +// Each kind fires only on the boundary-crossing edge. While a value +// stays past its threshold (or a bit stays set), no further alerts. +type chargerAlertFSM struct { + prev types.ChargerValue + prevBSR uint32 + seen bool + seenBSR bool +} + +// AlertKind is the canonical alert kind name (snake_case) sent on +// the wire as event/self/power/charger/alert.kind. The 14 values +// are frozen by the spec; new kinds must be added here AND on the +// CM5 import side. 
+type AlertKind string + +const ( + AlertVinLo AlertKind = "vin_lo" + AlertVinHi AlertKind = "vin_hi" + AlertBsrHigh AlertKind = "bsr_high" + AlertBatMissing AlertKind = "bat_missing" + AlertBatShort AlertKind = "bat_short" + AlertMaxChargeTimeFault AlertKind = "max_charge_time_fault" + AlertAbsorb AlertKind = "absorb" + AlertEqualize AlertKind = "equalize" + AlertCccv AlertKind = "cccv" + AlertPrecharge AlertKind = "precharge" + AlertIinLimited AlertKind = "iin_limited" + AlertUvclActive AlertKind = "uvcl_active" + AlertCcPhase AlertKind = "cc_phase" + AlertCvPhase AlertKind = "cv_phase" +) + +// AllAlertKinds enumerates every canonical kind. Tests assert this is +// exactly 14 entries and that publishing rejects anything outside the +// set. +var AllAlertKinds = []AlertKind{ + AlertVinLo, AlertVinHi, AlertBsrHigh, + AlertBatMissing, AlertBatShort, AlertMaxChargeTimeFault, + AlertAbsorb, AlertEqualize, AlertCccv, AlertPrecharge, + AlertIinLimited, AlertUvclActive, AlertCcPhase, AlertCvPhase, +} + +// AlertEvent is the payload at event/self/power/charger/alert. Not +// retained — the publisher uses retained=false so subscribers only +// see live transitions, not stale alerts on reconnect. +type AlertEvent struct { + Kind AlertKind `json:"kind"` + Severity string `json:"severity"` + Source string `json:"source"` + StateBits uint16 `json:"state_bits"` + StatusBits uint16 `json:"status_bits"` + SystemBits uint16 `json:"system_bits"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +// alertSeverity returns the canonical severity for a kind. Faults +// surface as "warning"; charge-phase / control-loop transitions are +// "info". Splitting it out keeps the FSM's emit loop tiny and gives +// the spec one place to grow if severity refines later. 
+func alertSeverity(k AlertKind) string { + switch k { + case AlertBatMissing, AlertBatShort, AlertMaxChargeTimeFault, AlertBsrHigh: + return "warning" + default: + return "info" + } +} + +// observe runs one tick of the FSM against an incoming ChargerValue. +// On every bit-set transition we emit one event. Cleared bits do +// nothing (sparse stream — no clear-events). +func (f *chargerAlertFSM) observe(s *Service, v types.ChargerValue) { + if !f.seen { + f.prev = v + f.seen = true + return + } + + // State bits (CHARGER_STATE_ALERTS): 6 of the 11 bits map to + // canonical kinds. Bits with display name "suspended", "ntc_pause", + // "timer_term", "c_over_x_term" don't map to alert kinds in the + // spec — they're informational only. + f.fireOnSet(s, v, uint16(types.BatMissingFault), AlertBatMissing, + uint16(v.State), uint16(f.prev.State)) + f.fireOnSet(s, v, uint16(types.BatShortFault), AlertBatShort, + uint16(v.State), uint16(f.prev.State)) + f.fireOnSet(s, v, uint16(types.MaxChargeTimeFault), AlertMaxChargeTimeFault, + uint16(v.State), uint16(f.prev.State)) + f.fireOnSet(s, v, uint16(types.AbsorbCharge), AlertAbsorb, + uint16(v.State), uint16(f.prev.State)) + f.fireOnSet(s, v, uint16(types.EqualizeCharge), AlertEqualize, + uint16(v.State), uint16(f.prev.State)) + f.fireOnSet(s, v, uint16(types.CCCVCharge), AlertCccv, + uint16(v.State), uint16(f.prev.State)) + f.fireOnSet(s, v, uint16(types.Precharge), AlertPrecharge, + uint16(v.State), uint16(f.prev.State)) + + // Status bits (CHARGE_STATUS): all 4 map to kinds. 
+ f.fireOnSet(s, v, uint16(types.IinLimitActive), AlertIinLimited, + uint16(v.Status), uint16(f.prev.Status)) + f.fireOnSet(s, v, uint16(types.VinUvclActive), AlertUvclActive, + uint16(v.Status), uint16(f.prev.Status)) + f.fireOnSet(s, v, uint16(types.ConstCurrent), AlertCcPhase, + uint16(v.Status), uint16(f.prev.Status)) + f.fireOnSet(s, v, uint16(types.ConstVoltage), AlertCvPhase, + uint16(v.Status), uint16(f.prev.Status)) + + // Analog kinds — vin_lo and vin_hi compare ChargerValue.VIN_mV + // against the published thresholds on state/self/power/charger/ + // config. bsr_high comes from BatteryValue and is handled in + // observeBattery below. + th := s.chargerThresholds() + if th.VinLoMV > 0 { + // Edge from "vin >= threshold" to "vin < threshold". + if f.prev.VIN_mV >= th.VinLoMV && v.VIN_mV < th.VinLoMV { + s.emitAlert(v, AlertVinLo) + } + } + if th.VinHiMV > 0 { + // Edge from "vin <= threshold" to "vin > threshold". + if f.prev.VIN_mV <= th.VinHiMV && v.VIN_mV > th.VinHiMV { + s.emitAlert(v, AlertVinHi) + } + } + + f.prev = v +} + +// observeBattery feeds the bsr_high analog kind. BSR +// (battery-source-resistance) lives on BatteryValue, not +// ChargerValue, so it gets its own observer entry point. +func (f *chargerAlertFSM) observeBattery(s *Service, b types.BatteryValue) { + if !f.seenBSR { + f.prevBSR = b.BSR_uOhmPerCell + f.seenBSR = true + return + } + th := s.chargerThresholds() + if th.BSRHighUohmPerCell > 0 { + if f.prevBSR <= th.BSRHighUohmPerCell && b.BSR_uOhmPerCell > th.BSRHighUohmPerCell { + // bsr_high carries the latest charger snapshot for context; + // state/status/system bits are the most recent we saw. + s.emitAlert(f.prev, AlertBsrHigh) + } + } + f.prevBSR = b.BSR_uOhmPerCell +} + +// fireOnSet emits an alert if the bit went from clear to set between +// prev and curr. Bit is passed as a uint16 mask — call sites convert +// from their typed bit-flag (types.ChargerStateBits etc.) 
at the +// callsite to keep this helper free of generics overhead. +func (f *chargerAlertFSM) fireOnSet( + s *Service, + v types.ChargerValue, + mask uint16, + kind AlertKind, + curr, prev uint16, +) { + if mask == 0 { + return + } + wasSet := (prev & mask) != 0 + isSet := (curr & mask) != 0 + if !wasSet && isSet { + s.emitAlert(v, kind) + } +} + +func (s *Service) emitAlert(v types.ChargerValue, kind AlertKind) { + ev := AlertEvent{ + Kind: kind, + Severity: alertSeverity(kind), + Source: "ltc4015", + StateBits: v.State, + StatusBits: v.Status, + SystemBits: v.Sys, + Seq: s.seqChargerAlert.Add(1), + UptimeMs: s.uptimeMs(), + } + // Sparse alerts: NOT retained. + s.conn.Publish(s.conn.NewMessage(TopicChargerAlert, ev, false)) +} diff --git a/services/telemetry/telemetry.go b/services/telemetry/telemetry.go new file mode 100644 index 0000000..c298d28 --- /dev/null +++ b/services/telemetry/telemetry.go @@ -0,0 +1,521 @@ +// Package telemetry implements the W7/W8 retained-state and sparse- +// alert publishers from docs/firmware-alignment-update.md. It +// subscribes to the existing HAL value topics (hal/cap/env/..., +// hal/cap/power/...) and republishes them under the canonical +// state/self/* surface using integer engineering units, plus runs the +// charger alert FSM that emits event/self/power/charger/alert with +// 14 canonical kinds. +// +// Boundary: telemetry does NOT touch the updater state machine — it +// only consumes HAL data and produces fact retains + alert events. +// The fabric service then exports them onto the wire via the +// state/self/* + event/self/* export rules in services/fabric/remap.go. +package telemetry + +import ( + "context" + "encoding/json" + "runtime" + "sync/atomic" + "time" + + "devicecode-go/bus" + "devicecode-go/types" +) + +// Topic constants. Mirrors the canonical fact schema in +// docs/firmware-alignment-update.md §"Telemetry/state facts". 
+var ( + TopicBattery = bus.T("state", "self", "power", "battery") + TopicCharger = bus.T("state", "self", "power", "charger") + TopicChargerCfg = bus.T("state", "self", "power", "charger", "config") + TopicEnvTemp = bus.T("state", "self", "environment", "temperature") + TopicEnvHumidity = bus.T("state", "self", "environment", "humidity") + TopicRuntimeMem = bus.T("state", "self", "runtime", "memory") + + TopicChargerAlert = bus.T("event", "self", "power", "charger", "alert") + + // TopicFabricLink mirrors the updater's W10 watcher — telemetry + // republishes the charger config retain on every link-ready edge + // so the CM5 sees a fresh config fact on every newly established + // session, warm or cold. (Per-value retains like + // state/self/power/battery refresh naturally on the next HAL + // publish; the static-ish config fact needs an explicit re-emit.) + topicFabricLink = bus.T("state", "fabric", "link", "+") +) + +// HAL source topics — single point of truth for what we subscribe to. +var ( + halEnvTemp = bus.T("hal", "cap", "env", string(types.KindTemperature), "core", "value") + halEnvHum = bus.T("hal", "cap", "env", string(types.KindHumidity), "core", "value") + halPwrAny = bus.T("hal", "cap", "power", "+", "internal", "value") +) + +// MemSnapshotInterval is how often the runtime/memory fact republishes. +// Keep it on the order of the existing reactor mem-stat cadence to +// avoid burning UART bandwidth on changes that don't affect anything. +const MemSnapshotInterval = 3 * time.Second + +// ChargerThresholds carries the analog comparison thresholds used by +// both the state/self/power/charger/config retained fact (W7 finish) +// and the charger alert FSM's analog kinds (W8 finish — vin_lo, +// vin_hi, bsr_high). +// +// These ARE the LTC4015 effective config in production; on this +// branch they default to conservative bring-up values. 
+type ChargerThresholds struct { + VinLoMV int32 `json:"vin_lo_mV"` + VinHiMV int32 `json:"vin_hi_mV"` + BSRHighUohmPerCell uint32 `json:"bsr_high_uohm_per_cell"` +} + +// ChargerAlertMask is the 14-bool mask matching the 14 canonical alert +// kinds. Pre-fabric-security the mask is informational only — the +// alert FSM ignores it for emission. Once the LTC4015 driver +// programs the chip's alert-enable register from this and reports it +// back, masking can flow through to the FSM. Names here are +// spec-frozen to match docs/firmware-alignment-update.md. +type ChargerAlertMask struct { + VinLo bool `json:"vin_lo"` + VinHi bool `json:"vin_hi"` + BSRHigh bool `json:"bsr_high"` + BatMissing bool `json:"bat_missing"` + BatShort bool `json:"bat_short"` + MaxChargeTimeFault bool `json:"max_charge_time_fault"` + Absorb bool `json:"absorb"` + Equalize bool `json:"equalize"` + CCCV bool `json:"cccv"` + Precharge bool `json:"precharge"` + IinLimited bool `json:"iin_limited"` + UvclActive bool `json:"uvcl_active"` + CcPhase bool `json:"cc_phase"` + CvPhase bool `json:"cv_phase"` +} + +// ChargerConfig is the typed input into the publisher; the runtime +// fact wraps it inside ChargerConfigFact with seq + uptime_ms. +// +// Source is the value emitted on the wire as the fact's "source" +// field. Use "ltc4015" when the driver has read the effective +// programmed register state; use "ltc4015-default" (the +// DefaultChargerConfig value) to make it visible on the wire that +// these are fallback bring-up values, not what the chip is actually +// programmed with. The source string tracks the data's provenance so defaults +// are not presented as values read back from the chip. 
+type ChargerConfig struct { + Source string + Thresholds ChargerThresholds + AlertMaskBits uint16 + AlertMask ChargerAlertMask +} + +// DefaultChargerConfig returns conservative bring-up values labelled +// source="ltc4015-default" so consumers can spot that the LTC4015 +// driver has not supplied effective programmed values. VinLoMV / +// VinHiMV bracket a healthy USB-C / 12 V input; BSRHigh targets a +// typical lead-acid pack BSR. +func DefaultChargerConfig() ChargerConfig { + return ChargerConfig{ + Source: "ltc4015-default", + Thresholds: ChargerThresholds{ + VinLoMV: 10500, + VinHiMV: 17000, + BSRHighUohmPerCell: 5000, + }, + // Mask bits + booleans both zero — alerts unmasked at the + // chip level by default. The FSM emits regardless on this + // branch (informational mask only). + } +} + +// Service runs the telemetry publishers + charger alert FSM. Started +// from the reactor in its own goroutine. +type Service struct { + conn *bus.Connection + + // monotonic seq counters per topic — keeps the CM5 import side + // able to spot stalls without reading payload contents. + seqBattery atomic.Uint32 + seqCharger atomic.Uint32 + seqChargerCfg atomic.Uint32 + seqEnvTemp atomic.Uint32 + seqEnvHum atomic.Uint32 + seqRuntimeMem atomic.Uint32 + seqChargerAlert atomic.Uint32 + + startedAt time.Time + + // chargerCfg carries the analog thresholds the alert FSM uses for + // the vin_lo / vin_hi / bsr_high kinds, plus the alert mask the + // charger config fact retains to the wire. + chargerCfg ChargerConfig + + // alert FSM previous-bitfield state. Compared against incoming + // values to detect bit-set transitions. + alertFSM chargerAlertFSM +} + +// New constructs the service. conn must be a fresh bus connection +// dedicated to telemetry (not shared with the updater or fabric). 
+func New(conn *bus.Connection) *Service { + return &Service{ + conn: conn, + startedAt: time.Now(), + chargerCfg: DefaultChargerConfig(), + } +} + +func (s *Service) chargerThresholds() ChargerThresholds { + return s.chargerCfg.Thresholds +} + +// Run subscribes to HAL inputs and runs the publish loop. Blocks +// until ctx is cancelled. +func (s *Service) Run(ctx context.Context) { + tempSub := s.conn.Subscribe(halEnvTemp) + defer s.conn.Unsubscribe(tempSub) + humSub := s.conn.Subscribe(halEnvHum) + defer s.conn.Unsubscribe(humSub) + pwrSub := s.conn.Subscribe(halPwrAny) + defer s.conn.Unsubscribe(pwrSub) + + // Charger config retain at startup — the CM5-side import keys + // off this for the `update.components.mcu.charger_*` + // view, and the alert FSM analog kinds (vin_lo / vin_hi / + // bsr_high) depend on it being present to know what to compare + // against. + s.publishChargerConfig() + + linkSub := s.conn.Subscribe(topicFabricLink) + defer s.conn.Unsubscribe(linkSub) + + memTick := time.NewTicker(MemSnapshotInterval) + defer memTick.Stop() + + prevReady := map[string]bool{} + + for { + select { + case <-ctx.Done(): + return + case msg, ok := <-tempSub.Channel(): + if !ok || msg == nil { + continue + } + if v, ok := msg.Payload.(types.TemperatureValue); ok { + s.publishEnvTemp(v) + } + case msg, ok := <-humSub.Channel(): + if !ok || msg == nil { + continue + } + if v, ok := msg.Payload.(types.HumidityValue); ok { + s.publishEnvHum(v) + } + case msg, ok := <-pwrSub.Channel(): + if !ok || msg == nil { + continue + } + s.dispatchPower(msg) + case msg, ok := <-linkSub.Channel(): + if !ok || msg == nil { + continue + } + linkID, ready := decodeLinkReady(msg) + if linkID == "" { + continue + } + was := prevReady[linkID] + if ready && !was { + s.publishChargerConfig() + } + prevReady[linkID] = ready + case <-memTick.C: + s.publishRuntimeMem() + } + } +} + +// decodeLinkReady mirrors services/updater's helper but local to the +// telemetry package — kept 
duplicated rather than reaching into +// updater (cleaner package boundary). +func decodeLinkReady(msg *bus.Message) (string, bool) { + if msg == nil { + return "", false + } + t := msg.Topic + if t == nil || t.Len() < 4 { + return "", false + } + id, _ := t.At(t.Len() - 1).(string) + if id == "" { + return "", false + } + switch p := msg.Payload.(type) { + case nil: + return id, false + case map[string]any: + ready, _ := p["ready"].(bool) + return id, ready + } + // Probe via JSON for the typed-struct payload fabric publishes. + b, err := json.Marshal(msg.Payload) + if err != nil { + return id, false + } + var probe struct { + Ready bool `json:"ready"` + } + if err := json.Unmarshal(b, &probe); err != nil { + return id, false + } + return id, probe.Ready +} + +// dispatchPower splits the power-domain wildcard into per-kind +// publish paths. Kept tiny: BatteryValue and ChargerValue are the only +// shapes we consume on this branch (TemperatureValue from +// power/temperature/internal is intentionally NOT republished — the +// canonical contract puts thermal info under environment/temperature). +func (s *Service) dispatchPower(msg *bus.Message) { + switch v := msg.Payload.(type) { + case types.BatteryValue: + s.publishBattery(v) + s.alertFSM.observeBattery(s, v) + case types.ChargerValue: + s.publishCharger(v) + s.alertFSM.observe(s, v) + } +} + +// uptimeMs returns a service-monotonic uptime — close enough to a +// boot-uptime for the consumers' purposes (within a few HAL-init ms). +func (s *Service) uptimeMs() int64 { + return time.Since(s.startedAt).Milliseconds() +} + +// ---- W7: retained-state publishers --------------------------------- + +// BatteryFact is the retained payload at state/self/power/battery. +// All units are integer engineering units per the spec. 
+type BatteryFact struct { + PackMV int32 `json:"pack_mV"` + PerCellMV int32 `json:"per_cell_mV"` + IBatMA int32 `json:"ibat_mA"` + TempMC int32 `json:"temp_mC"` + BSRUOhmPerCell uint32 `json:"bsr_uohm_per_cell"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +func (s *Service) publishBattery(v types.BatteryValue) { + fact := BatteryFact{ + PackMV: v.PackMilliV, + PerCellMV: v.PerCellMilliV, + IBatMA: v.IBatMilliA, + TempMC: v.TempMilliC, + BSRUOhmPerCell: v.BSR_uOhmPerCell, + Seq: s.seqBattery.Add(1), + UptimeMs: s.uptimeMs(), + } + s.conn.Publish(s.conn.NewMessage(TopicBattery, fact, true)) +} + +// ChargerFact is the retained payload at state/self/power/charger. +// Carries raw bitfields AND 3 decoded boolean objects. +// +// The canonical key names below come from +// docs/firmware-alignment-update.md §"Telemetry/state facts" — they +// are NOT the existing display names in types.ChargerStateTable etc. +// (those drop the `_charge` / `_active` / `_fault` suffixes for +// log-line brevity). The wire-canonical names are spec-frozen because +// the Lua import side keys off them; renaming any of these is a +// wire-break. +type ChargerFact struct { + VinMV int32 `json:"vin_mV"` + VsysMV int32 `json:"vsys_mV"` + IinMA int32 `json:"iin_mA"` + StateBits uint16 `json:"state_bits"` + StatusBits uint16 `json:"status_bits"` + SystemBits uint16 `json:"system_bits"` + State map[string]bool `json:"state"` + Status map[string]bool `json:"status"` + System map[string]bool `json:"system"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +// Canonical name tables. Each entry is a (bit, canonical-name) pair. +// Counts match the spec's "27 booleans total: 11 + 4 + 12". 
+var chargerStateNames = []struct { + bit types.ChargerStateBits + name string +}{ + {types.EqualizeCharge, "equalize_charge"}, + {types.AbsorbCharge, "absorb_charge"}, + {types.ChargerSuspended, "charger_suspended"}, + {types.Precharge, "precharge"}, + {types.CCCVCharge, "cccv_charge"}, + {types.NTCPause, "ntc_pause"}, + {types.TimerTerm, "timer_term"}, + {types.COverXTerm, "c_over_x_term"}, + {types.MaxChargeTimeFault, "max_charge_time_fault"}, + {types.BatMissingFault, "bat_missing_fault"}, + {types.BatShortFault, "bat_short_fault"}, +} + +var chargerStatusNames = []struct { + bit types.ChargeStatusBits + name string +}{ + {types.VinUvclActive, "vin_uvcl_active"}, + {types.IinLimitActive, "iin_limit_active"}, + {types.ConstCurrent, "const_current"}, + {types.ConstVoltage, "const_voltage"}, +} + +var chargerSystemNames = []struct { + bit types.SystemStatus + name string +}{ + {types.ChargerEnabled, "charger_enabled"}, + {types.MpptEnPin, "mppt_en_pin"}, + {types.EqualizeReq, "equalize_req"}, + {types.DrvccGood, "drvcc_good"}, + {types.CellCountError, "cell_count_error"}, + {types.OkToCharge, "ok_to_charge"}, + {types.NoRt, "no_rt"}, + {types.ThermalShutdown, "thermal_shutdown"}, + {types.VinOvlo, "vin_ovlo"}, + {types.VinGtVbat, "vin_gt_vbat"}, + {types.IntvccGt4p3V, "intvcc_gt_4p3v"}, + {types.IntvccGt2p8V, "intvcc_gt_2p8v"}, +} + +func decodeChargerState(v uint16) map[string]bool { + out := make(map[string]bool, len(chargerStateNames)) + for _, e := range chargerStateNames { + out[e.name] = (v & uint16(e.bit)) != 0 + } + return out +} + +func decodeChargerStatus(v uint16) map[string]bool { + out := make(map[string]bool, len(chargerStatusNames)) + for _, e := range chargerStatusNames { + out[e.name] = (v & uint16(e.bit)) != 0 + } + return out +} + +func decodeChargerSystem(v uint16) map[string]bool { + out := make(map[string]bool, len(chargerSystemNames)) + for _, e := range chargerSystemNames { + out[e.name] = (v & uint16(e.bit)) != 0 + } + return out +} + 
+func (s *Service) publishCharger(v types.ChargerValue) { + fact := ChargerFact{ + VinMV: v.VIN_mV, + VsysMV: v.VSYS_mV, + IinMA: v.IIn_mA, + StateBits: v.State, + StatusBits: v.Status, + SystemBits: v.Sys, + State: decodeChargerState(v.State), + Status: decodeChargerStatus(v.Status), + System: decodeChargerSystem(v.Sys), + Seq: s.seqCharger.Add(1), + UptimeMs: s.uptimeMs(), + } + s.conn.Publish(s.conn.NewMessage(TopicCharger, fact, true)) +} + +// ChargerConfigFact — state/self/power/charger/config. Effective +// LTC4015 configuration. Strictly forbidden from carrying +// operating-state booleans (charger_enabled, ok_to_charge, etc.) — +// those live on state/self/power/charger. +type ChargerConfigFact struct { + Schema int `json:"schema"` + Source string `json:"source"` + Thresholds ChargerThresholds `json:"thresholds"` + AlertMaskBits uint16 `json:"alert_mask_bits"` + AlertMask ChargerAlertMask `json:"alert_mask"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +func (s *Service) publishChargerConfig() { + cfg := s.chargerCfg + source := cfg.Source + if source == "" { + // Defensive: a caller that constructed ChargerConfig without + // going through DefaultChargerConfig may have left this empty. + // Make the gap visible on the wire rather than misreporting. + source = "ltc4015-default" + } + fact := ChargerConfigFact{ + Schema: 1, + Source: source, + Thresholds: cfg.Thresholds, + AlertMaskBits: cfg.AlertMaskBits, + AlertMask: cfg.AlertMask, + Seq: s.seqChargerCfg.Add(1), + UptimeMs: s.uptimeMs(), + } + s.conn.Publish(s.conn.NewMessage(TopicChargerCfg, fact, true)) +} + +// EnvTempFact — state/self/environment/temperature. 
+type EnvTempFact struct { + DeciC int32 `json:"deci_c"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +func (s *Service) publishEnvTemp(v types.TemperatureValue) { + fact := EnvTempFact{ + DeciC: int32(v.DeciC), + Seq: s.seqEnvTemp.Add(1), + UptimeMs: s.uptimeMs(), + } + s.conn.Publish(s.conn.NewMessage(TopicEnvTemp, fact, true)) +} + +// EnvHumFact — state/self/environment/humidity. +type EnvHumFact struct { + RHx100 int32 `json:"rh_x100"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +func (s *Service) publishEnvHum(v types.HumidityValue) { + fact := EnvHumFact{ + RHx100: int32(v.RHx100), + Seq: s.seqEnvHum.Add(1), + UptimeMs: s.uptimeMs(), + } + s.conn.Publish(s.conn.NewMessage(TopicEnvHumidity, fact, true)) +} + +// RuntimeMemFact — state/self/runtime/memory. Sourced from +// runtime.MemStats.Alloc; sufficient for the retained-fact +// "memory pressure" signal Lua consumers expect. +type RuntimeMemFact struct { + AllocBytes uint64 `json:"alloc_bytes"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +func (s *Service) publishRuntimeMem() { + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + fact := RuntimeMemFact{ + AllocBytes: ms.Alloc, + Seq: s.seqRuntimeMem.Add(1), + UptimeMs: s.uptimeMs(), + } + s.conn.Publish(s.conn.NewMessage(TopicRuntimeMem, fact, true)) +} diff --git a/services/telemetry/telemetry_test.go b/services/telemetry/telemetry_test.go new file mode 100644 index 0000000..2c40985 --- /dev/null +++ b/services/telemetry/telemetry_test.go @@ -0,0 +1,410 @@ +package telemetry + +import ( + "context" + "testing" + "time" + + "devicecode-go/bus" + "devicecode-go/types" +) + +func newTestBus() *bus.Bus { return bus.NewBus(8, "+", "#") } + +// runService is the same kind of subscribe-then-start helper used in +// services/updater/updater_test.go: a fresh probe subscription on a bus +// connection guarantees we capture the first publish without racing +// the goroutine's Subscribe calls.
+func runService(t *testing.T, b *bus.Bus) (*bus.Connection, context.CancelFunc) { + t.Helper() + conn := b.NewConnection("telemetry") + svc := New(conn) + ctx, cancel := context.WithCancel(context.Background()) + go svc.Run(ctx) + // Run retains the charger config at startup; every other fact is + // emitted only in response to incoming HAL data. The short sleep is + // a settle delay so Run can register its subscriptions before tests publish. + time.Sleep(10 * time.Millisecond) + return conn, cancel +} + +func TestPublishesBatteryFact(t *testing.T) { + b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicBattery) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + hal.Publish(hal.NewMessage(halPwrAny, types.BatteryValue{ + PackMilliV: 12000, + PerCellMilliV: 3000, + IBatMilliA: -500, + TempMilliC: 24500, + BSR_uOhmPerCell: 1200, + }, true)) + + select { + case msg := <-sub.Channel(): + fact, ok := msg.Payload.(BatteryFact) + if !ok { + t.Fatalf("payload type = %T", msg.Payload) + } + if fact.PackMV != 12000 || fact.IBatMA != -500 || fact.BSRUOhmPerCell != 1200 { + t.Fatalf("battery fact wrong: %+v", fact) + } + if fact.Seq != 1 { + t.Fatalf("seq = %d, want 1", fact.Seq) + } + if fact.UptimeMs < 0 { + t.Fatalf("uptime_ms = %d, want >= 0", fact.UptimeMs) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for battery fact") + } +} + +func TestPublishesChargerWithDecodedBooleans(t *testing.T) { + b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicCharger) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{ + VIN_mV: 18000, + VSYS_mV: 12200, + IIn_mA: 500, + State: uint16(types.AbsorbCharge | types.CCCVCharge), + Status: uint16(types.IinLimitActive), + Sys: uint16(types.ChargerEnabled | types.OkToCharge), + }, true)) + + select 
{ + case msg := <-sub.Channel(): + fact, ok := msg.Payload.(ChargerFact) + if !ok { + t.Fatalf("payload type = %T", msg.Payload) + } + if fact.VinMV != 18000 || fact.VsysMV != 12200 || fact.IinMA != 500 { + t.Fatalf("analog values wrong: %+v", fact) + } + if fact.StateBits != uint16(types.AbsorbCharge|types.CCCVCharge) { + t.Fatalf("state_bits = 0x%x", fact.StateBits) + } + // Decoded booleans use the canonical names from + // docs/firmware-alignment-update.md §6.2 — these are the + // wire-frozen names the Lua side keys off. + if !fact.State["absorb_charge"] || !fact.State["cccv_charge"] { + t.Fatalf("decoded state booleans wrong: %+v", fact.State) + } + if fact.State["bat_short_fault"] || fact.State["bat_missing_fault"] { + t.Fatalf("unset state bits decoded as true: %+v", fact.State) + } + if !fact.Status["iin_limit_active"] { + t.Fatalf("status iin_limit_active not decoded: %+v", fact.Status) + } + if !fact.System["charger_enabled"] || !fact.System["ok_to_charge"] { + t.Fatalf("system booleans wrong: %+v", fact.System) + } + // All three maps must be exactly the spec sizes. 
+ if got := len(fact.State); got != 11 { + t.Fatalf("state map size = %d, want 11", got) + } + if got := len(fact.Status); got != 4 { + t.Fatalf("status map size = %d, want 4", got) + } + if got := len(fact.System); got != 12 { + t.Fatalf("system map size = %d, want 12", got) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for charger fact") + } +} + +func TestPublishesEnvironmentFacts(t *testing.T) { + b := newTestBus() + observer := b.NewConnection("observer") + tSub := observer.Subscribe(TopicEnvTemp) + defer observer.Unsubscribe(tSub) + hSub := observer.Subscribe(TopicEnvHumidity) + defer observer.Unsubscribe(hSub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + hal.Publish(hal.NewMessage(halEnvTemp, types.TemperatureValue{DeciC: 235}, true)) + hal.Publish(hal.NewMessage(halEnvHum, types.HumidityValue{RHx100: 4530}, true)) + + select { + case msg := <-tSub.Channel(): + fact, ok := msg.Payload.(EnvTempFact) + if !ok || fact.DeciC != 235 { + t.Fatalf("env temp fact = %+v ok=%v", msg.Payload, ok) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for env temp fact") + } + select { + case msg := <-hSub.Channel(): + fact, ok := msg.Payload.(EnvHumFact) + if !ok || fact.RHx100 != 4530 { + t.Fatalf("env hum fact = %+v ok=%v", msg.Payload, ok) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for env hum fact") + } +} + +func TestAllAlertKindsCount(t *testing.T) { + if got := len(AllAlertKinds); got != 14 { + t.Fatalf("AllAlertKinds has %d entries, want 14", got) + } + // Spec-frozen names — typo in the kind enum is a wire-break, so + // guard the canonical strings explicitly. 
+ want := []string{ + "vin_lo", "vin_hi", "bsr_high", + "bat_missing", "bat_short", "max_charge_time_fault", + "absorb", "equalize", "cccv", "precharge", + "iin_limited", "uvcl_active", "cc_phase", "cv_phase", + } + for i, k := range AllAlertKinds { + if string(k) != want[i] { + t.Fatalf("AllAlertKinds[%d] = %q, want %q", i, string(k), want[i]) + } + } +} + +func TestChargerAlertFSMEdgeOnly(t *testing.T) { + // Spec: "On bit-set transition for a kind, emit one normal event." + // Subsequent retains that keep the bit set must NOT re-emit. + b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicChargerAlert) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + // First publish primes the FSM (no alerts emitted on init). + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{}, true)) + time.Sleep(20 * time.Millisecond) + + // Bit goes from clear to set: one alert emitted. + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{ + Status: uint16(types.IinLimitActive), + }, true)) + + select { + case msg := <-sub.Channel(): + ev, ok := msg.Payload.(AlertEvent) + if !ok { + t.Fatalf("payload type = %T", msg.Payload) + } + if ev.Kind != AlertIinLimited { + t.Fatalf("kind = %q, want %q", ev.Kind, AlertIinLimited) + } + if ev.Source != "ltc4015" { + t.Fatalf("source = %q", ev.Source) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for first alert") + } + + // Second publish keeps the bit set — no new alert. + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{ + Status: uint16(types.IinLimitActive), + }, true)) + select { + case msg := <-sub.Channel(): + t.Fatalf("unexpected duplicate alert: %+v", msg.Payload) + case <-time.After(150 * time.Millisecond): + } + + // Bit clears, then sets again: one more alert. 
+ hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{}, true)) + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{ + Status: uint16(types.IinLimitActive), + }, true)) + + select { + case msg := <-sub.Channel(): + ev, _ := msg.Payload.(AlertEvent) + if ev.Kind != AlertIinLimited { + t.Fatalf("re-edge alert kind = %q", ev.Kind) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for re-edge alert") + } +} + +func TestPublishesChargerConfigAtStartup(t *testing.T) { + // W7 finish: state/self/power/charger/config retains at startup + // with the conservative defaults from DefaultChargerConfig(). + b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicChargerCfg) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + select { + case msg := <-sub.Channel(): + fact, ok := msg.Payload.(ChargerConfigFact) + if !ok { + t.Fatalf("payload type = %T", msg.Payload) + } + if fact.Schema != 1 || fact.Source != "ltc4015-default" { + t.Fatalf("schema/source wrong: %+v", fact) + } + if fact.Thresholds.VinLoMV == 0 || fact.Thresholds.VinHiMV == 0 || fact.Thresholds.BSRHighUohmPerCell == 0 { + t.Fatalf("default thresholds not populated: %+v", fact.Thresholds) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for charger config fact") + } +} + +func TestChargerAlertFSMVinLoEdge(t *testing.T) { + // W8 finish: vin_lo fires on ChargerValue.VIN_mV crossing below + // the configured threshold. Subsequent observations below the + // threshold do NOT re-fire. + b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicChargerAlert) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + // Prime the FSM with vin above threshold (default vin_lo = 10500). 
+ hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{VIN_mV: 12000}, true)) + time.Sleep(20 * time.Millisecond) + + // vin drops below threshold. + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{VIN_mV: 10000}, true)) + + select { + case msg := <-sub.Channel(): + ev, _ := msg.Payload.(AlertEvent) + if ev.Kind != AlertVinLo { + t.Fatalf("kind = %q, want vin_lo", ev.Kind) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for vin_lo alert") + } + + // Stays below — no re-emit. + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{VIN_mV: 9500}, true)) + select { + case msg := <-sub.Channel(): + t.Fatalf("unexpected duplicate vin_lo: %+v", msg.Payload) + case <-time.After(150 * time.Millisecond): + } +} + +func TestChargerAlertFSMVinHiEdge(t *testing.T) { + b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicChargerAlert) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + // Default vin_hi = 17000; prime below threshold. + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{VIN_mV: 12000}, true)) + time.Sleep(20 * time.Millisecond) + // Cross above. + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{VIN_mV: 18000}, true)) + + select { + case msg := <-sub.Channel(): + ev, _ := msg.Payload.(AlertEvent) + if ev.Kind != AlertVinHi { + t.Fatalf("kind = %q, want vin_hi", ev.Kind) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for vin_hi alert") + } +} + +func TestChargerAlertFSMBSRHighEdge(t *testing.T) { + // bsr_high observes BatteryValue.BSR_uOhmPerCell against the + // threshold from charger config (default 5000 uohm/cell). 
+ b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicChargerAlert) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + // Prime with healthy BSR (below threshold). + hal.Publish(hal.NewMessage(halPwrAny, types.BatteryValue{BSR_uOhmPerCell: 2000}, true)) + time.Sleep(20 * time.Millisecond) + // Crosses threshold. + hal.Publish(hal.NewMessage(halPwrAny, types.BatteryValue{BSR_uOhmPerCell: 6000}, true)) + + select { + case msg := <-sub.Channel(): + ev, _ := msg.Payload.(AlertEvent) + if ev.Kind != AlertBsrHigh { + t.Fatalf("kind = %q, want bsr_high", ev.Kind) + } + if ev.Severity != "warning" { + t.Fatalf("severity = %q, want warning", ev.Severity) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for bsr_high alert") + } +} + +func TestChargerAlertFSMMultipleBitsTransitionTogether(t *testing.T) { + // Two state bits flip in the same publish — both alerts should + // fire. The order is deterministic per the FSM's switch order. 
+ b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicChargerAlert) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{}, true)) + time.Sleep(20 * time.Millisecond) + + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{ + State: uint16(types.AbsorbCharge | types.CCCVCharge), + }, true)) + + gotKinds := make(map[AlertKind]bool) + deadline := time.After(2 * time.Second) + for len(gotKinds) < 2 { + select { + case msg := <-sub.Channel(): + ev, ok := msg.Payload.(AlertEvent) + if !ok { + continue + } + gotKinds[ev.Kind] = true + case <-deadline: + t.Fatalf("only got %v before deadline; want absorb + cccv", gotKinds) + } + } + if !gotKinds[AlertAbsorb] || !gotKinds[AlertCccv] { + t.Fatalf("expected absorb+cccv, got %v", gotKinds) + } +} From 67b24b29aa98770c2f4c6a1183da84742669eb4d Mon Sep 17 00:00:00 2001 From: cpunt Date: Wed, 27 May 2026 16:06:01 +0000 Subject: [PATCH 4/6] serial_raw: bound UART pump turns --- services/hal/devices/serial_raw/builder.go | 208 ++++++++++++++---- .../hal/devices/serial_raw/builder_test.go | 206 +++++++++++++++++ .../hal/internal/provider/rp2_resources.go | 14 ++ 3 files changed, 384 insertions(+), 44 deletions(-) create mode 100644 services/hal/devices/serial_raw/builder_test.go diff --git a/services/hal/devices/serial_raw/builder.go b/services/hal/devices/serial_raw/builder.go index f417760..9d7b482 100644 --- a/services/hal/devices/serial_raw/builder.go +++ b/services/hal/devices/serial_raw/builder.go @@ -2,6 +2,7 @@ package serial_raw import ( "context" + "runtime" "sync/atomic" "time" @@ -23,6 +24,11 @@ type Params struct { TXSize int // power of two; default 512 if zero in SessionOpen } +const ( + serialRawPumpRXBudget = 256 + serialRawPumpTXBudget = 256 +) + // ---- Device ---- type Device struct { @@ -52,9 +58,11 @@ type session struct { txRing *shmring.Ring 
// Reactor-owned observability. Single writer only. - rxRingFull uint32 - rxLogAt time.Time - rxLogHits uint32 + rxRingFull uint32 + rxLogAt time.Time + rxLogHits uint32 + rxPressureAt time.Time + rxPressureHits uint32 // Single worker (reactor) for the port. ctx context.Context @@ -62,6 +70,11 @@ type session struct { done chan struct{} } +type serialRXDiagnostics interface { + RXBuffered() int + RXBufferCap() int +} + // ---- Builder registration ---- func Builder() core.Builder { return builder{} } @@ -345,10 +358,57 @@ func (d *Device) logRingFullChange(s *session, force bool) { "[serial-raw]", "rx_ring_full", "uart", d.a.Name, "hits", strconvx.Utoa64(uint64(hits)), + "ring_avail", strconvx.Itoa(s.rxRing.Available()), + "ring_space", strconvx.Itoa(s.rxRing.Space()), + "ring_cap", strconvx.Itoa(s.rxRing.Cap()), ) s.rxLogHits = hits } +func (d *Device) logDriverPressure(s *session, force bool) { + const minInterval = 1 * time.Second + + diag, ok := d.port.(serialRXDiagnostics) + if !ok { + return + } + used := diag.RXBuffered() + capacity := diag.RXBufferCap() + if capacity <= 0 || used < 0 { + return + } + threshold := (capacity * 3) / 4 + if threshold < 1 { + threshold = 1 + } + if !force && used < threshold { + return + } + + hits := s.rxPressureHits + 1 + if !force { + now := time.Now() + if now.Sub(s.rxPressureAt) < minInterval { + return + } + s.rxPressureAt = now + } else { + s.rxPressureAt = time.Now() + } + s.rxPressureHits = hits + + println( + "[serial-raw]", "rx_driver_pressure", + "uart", d.a.Name, + "hits", strconvx.Utoa64(uint64(hits)), + "driver_used", strconvx.Itoa(used), + "driver_cap", strconvx.Itoa(capacity), + "ring_avail", strconvx.Itoa(s.rxRing.Available()), + "ring_space", strconvx.Itoa(s.rxRing.Space()), + "ring_cap", strconvx.Itoa(s.rxRing.Cap()), + ) +} + func (d *Device) reactor(s *session) { defer close(s.done) @@ -359,54 +419,22 @@ func (d *Device) reactor(s *session) { for { made := false - // UART RX -> rxRing (use spans; fill p1 
completely before p2) - for { - p1, p2 := rxR.WriteAcquire() - if len(p1) == 0 { - s.rxRingFull++ - break - } - n1 := u.TryRead(p1) - if n1 == 0 { - break - } - if n1 < len(p1) { - rxR.WriteCommit(n1) - made = true - continue - } - n2 := 0 - if len(p2) > 0 { - n2 = u.TryRead(p2) - } - rxR.WriteCommit(n1 + n2) + if d.pumpRX(s, u, rxR, serialRawPumpRXBudget) { made = true } - // txRing -> UART TX (use spans; drain p1 completely before p2) - for { - p1, p2 := txR.ReadAcquire() - if len(p1) == 0 { - break - } - n1 := u.TryWrite(p1) - if n1 == 0 { - break - } - if n1 < len(p1) { - txR.ReadRelease(n1) - made = true - continue - } - n2 := 0 - if len(p2) > 0 { - n2 = u.TryWrite(p2) - } - txR.ReadRelease(n1 + n2) + if d.pumpTX(u, txR, serialRawPumpTXBudget) { made = true } if made { + select { + case <-s.ctx.Done(): + d.logRingFullChange(s, true) + return + default: + } + runtime.Gosched() continue } @@ -424,6 +452,98 @@ func (d *Device) reactor(s *session) { } } +func (d *Device) pumpRX(s *session, u core.SerialPort, rxR *shmring.Ring, budget int) bool { + moved := 0 + + for moved < budget { + d.logDriverPressure(s, false) + p1, p2 := rxR.WriteAcquire() + if len(p1) == 0 { + s.rxRingFull++ + break + } + + remaining := budget - moved + p1 = limitSpan(p1, remaining) + n1 := u.TryRead(p1) + if n1 == 0 { + break + } + n := n1 + moved += n1 + if n1 < len(p1) { + rxR.WriteCommit(n) + break + } + + remaining = budget - moved + if remaining > 0 && len(p2) > 0 { + p2 = limitSpan(p2, remaining) + n2 := u.TryRead(p2) + n += n2 + moved += n2 + if n2 < len(p2) { + rxR.WriteCommit(n) + break + } + } + + rxR.WriteCommit(n) + } + + return moved > 0 +} + +func (d *Device) pumpTX(u core.SerialPort, txR *shmring.Ring, budget int) bool { + moved := 0 + + for moved < budget { + p1, p2 := txR.ReadAcquire() + if len(p1) == 0 { + break + } + + remaining := budget - moved + p1 = limitSpan(p1, remaining) + n1 := u.TryWrite(p1) + if n1 == 0 { + break + } + n := n1 + moved += n1 + if n1 < len(p1) { + 
txR.ReadRelease(n) + break + } + + remaining = budget - moved + if remaining > 0 && len(p2) > 0 { + p2 = limitSpan(p2, remaining) + n2 := u.TryWrite(p2) + n += n2 + moved += n2 + if n2 < len(p2) { + txR.ReadRelease(n) + break + } + } + + txR.ReadRelease(n) + } + + return moved > 0 +} + +func limitSpan(p []byte, max int) []byte { + if max <= 0 { + return p[:0] + } + if len(p) > max { + return p[:max] + } + return p +} + // ---- Helpers ---- func isPow2(n int) bool { return n > 0 && (n&(n-1)) == 0 } diff --git a/services/hal/devices/serial_raw/builder_test.go b/services/hal/devices/serial_raw/builder_test.go new file mode 100644 index 0000000..e824e35 --- /dev/null +++ b/services/hal/devices/serial_raw/builder_test.go @@ -0,0 +1,206 @@ +package serial_raw + +import ( + "context" + "sync" + "sync/atomic" + "testing" + "time" + + "devicecode-go/services/hal/internal/core" + "devicecode-go/types" +) + +type fakeSerialPort struct { + readable chan struct{} + writable chan struct{} + + continuousRX atomic.Bool + writeCalls atomic.Int32 + readCalls atomic.Int32 + maxReadLen atomic.Int32 + maxWriteLen atomic.Int32 + + mu sync.Mutex + written []byte +} + +func newFakeSerialPort() *fakeSerialPort { + p := &fakeSerialPort{ + readable: make(chan struct{}, 1), + writable: make(chan struct{}, 1), + } + p.signalReadable() + p.signalWritable() + return p +} + +func (p *fakeSerialPort) TryRead(dst []byte) int { + p.readCalls.Add(1) + recordMax(&p.maxReadLen, len(dst)) + if !p.continuousRX.Load() || len(dst) == 0 { + return 0 + } + for i := range dst { + dst[i] = 'r' + } + p.signalReadable() + return len(dst) +} + +func (p *fakeSerialPort) TryWrite(src []byte) int { + p.writeCalls.Add(1) + recordMax(&p.maxWriteLen, len(src)) + if len(src) == 0 { + return 0 + } + p.mu.Lock() + p.written = append(p.written, src...) 
+ p.mu.Unlock() + p.signalWritable() + return len(src) +} + +func (p *fakeSerialPort) Readable() <-chan struct{} { return p.readable } +func (p *fakeSerialPort) Writable() <-chan struct{} { return p.writable } +func (p *fakeSerialPort) Flush() error { return nil } + +func (p *fakeSerialPort) signalReadable() { + select { + case p.readable <- struct{}{}: + default: + } +} + +func (p *fakeSerialPort) signalWritable() { + select { + case p.writable <- struct{}{}: + default: + } +} + +func (p *fakeSerialPort) writtenBytes() []byte { + p.mu.Lock() + defer p.mu.Unlock() + out := make([]byte, len(p.written)) + copy(out, p.written) + return out +} + +func recordMax(max *atomic.Int32, n int) { + for { + cur := max.Load() + if int32(n) <= cur { + return + } + if max.CompareAndSwap(cur, int32(n)) { + return + } + } +} + +func newTestDevice(port *fakeSerialPort) *Device { + return &Device{ + id: "uart1_raw", + a: core.CapAddr{Domain: "io", Kind: types.KindSerial, Name: "uart1"}, + port: port, + } +} + +func drainRXUntil(ctx context.Context, s *session) { + var buf [128]byte + for { + if s.rxRing.TryReadInto(buf[:]) > 0 { + continue + } + select { + case <-ctx.Done(): + return + case <-s.rxRing.Readable(): + } + } +} + +func waitUntil(t *testing.T, timeout time.Duration, pred func() bool) { + t.Helper() + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if pred() { + return + } + time.Sleep(time.Millisecond) + } + t.Fatal("condition was not met before timeout") +} + +func TestReactorServicesTXWhileRXIsContinuous(t *testing.T) { + port := newFakeSerialPort() + port.continuousRX.Store(true) + dev := newTestDevice(port) + dev.startSession(512, 512) + + drainCtx, stopDrain := context.WithCancel(context.Background()) + defer stopDrain() + go drainRXUntil(drainCtx, dev.sess) + + payload := []byte("tx while rx is busy") + if n := dev.sess.txRing.TryWriteFrom(payload); n != len(payload) { + t.Fatalf("failed to seed tx ring: wrote %d/%d", n, len(payload)) + } + 
+ waitUntil(t, 100*time.Millisecond, func() bool { + return port.writeCalls.Load() > 0 + }) + + if got := string(port.writtenBytes()); got != string(payload) { + t.Fatalf("written payload mismatch: got %q want %q", got, payload) + } + if max := port.maxReadLen.Load(); max > serialRawPumpRXBudget { + t.Fatalf("TryRead span exceeded budget: got %d want <= %d", max, serialRawPumpRXBudget) + } + if max := port.maxWriteLen.Load(); max > serialRawPumpTXBudget { + t.Fatalf("TryWrite span exceeded budget: got %d want <= %d", max, serialRawPumpTXBudget) + } + + dev.stopSession() +} + +func TestStopSessionReturnsUnderContinuousRX(t *testing.T) { + port := newFakeSerialPort() + port.continuousRX.Store(true) + dev := newTestDevice(port) + dev.startSession(512, 512) + + drainCtx, stopDrain := context.WithCancel(context.Background()) + defer stopDrain() + go drainRXUntil(drainCtx, dev.sess) + + done := make(chan struct{}) + go func() { + dev.stopSession() + close(done) + }() + + select { + case <-done: + case <-time.After(100 * time.Millisecond): + t.Fatal("stopSession did not return under continuous RX") + } +} + +func TestStopSessionReturnsWhenIdle(t *testing.T) { + dev := newTestDevice(newFakeSerialPort()) + dev.startSession(512, 512) + + done := make(chan struct{}) + go func() { + dev.stopSession() + close(done) + }() + + select { + case <-done: + case <-time.After(100 * time.Millisecond): + t.Fatal("stopSession did not return while idle") + } +} diff --git a/services/hal/internal/provider/rp2_resources.go b/services/hal/internal/provider/rp2_resources.go index 28e24e6..f047444 100644 --- a/services/hal/internal/provider/rp2_resources.go +++ b/services/hal/internal/provider/rp2_resources.go @@ -711,6 +711,20 @@ func (p *rp2SerialPort) TryRead(b []byte) int { return p.u.TryRead(b) } func (p *rp2SerialPort) TryWrite(b []byte) int { return p.u.TryWrite(b) } func (p *rp2SerialPort) Flush() error { return p.u.Flush() } +func (p *rp2SerialPort) RXBuffered() int { + if p.u == nil 
|| p.u.Buffer == nil { + return -1 + } + return int(p.u.Buffer.Used()) +} + +func (p *rp2SerialPort) RXBufferCap() int { + if p.u == nil || p.u.Buffer == nil { + return -1 + } + return int(p.u.Buffer.Size()) +} + func (p *rp2SerialPort) SetBaudRate(br uint32) error { p.u.SetBaudRate(br); return nil } // Parity strings: "none","even","odd" From 88b80940044784b2f56cc693c02d87ded678a9b7 Mon Sep 17 00:00:00 2001 From: cpunt Date: Wed, 27 May 2026 16:09:27 +0000 Subject: [PATCH 5/6] fabric: clarify RP2350 raw streaming sink --- services/fabric/transfer_sink_rp2350.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/services/fabric/transfer_sink_rp2350.go b/services/fabric/transfer_sink_rp2350.go index 7b8fdd3..567392a 100644 --- a/services/fabric/transfer_sink_rp2350.go +++ b/services/fabric/transfer_sink_rp2350.go @@ -55,8 +55,7 @@ func (s *streamedStageSink) Abort(reason string) error { return nil } -// Bytes returns nil because the TinyGo RP2350 default path verifies the signed -// container while streaming and writes only the authenticated payload into the -// inactive slot. fabric still calls updater/main staging; the updater consumes -// the verified staged descriptor instead of an in-RAM artefact. +// Bytes returns nil because the TinyGo RP2350 default path streams directly +// into the inactive slot. fabric still calls updater/main staging; the updater +// consumes the pre-staged descriptor instead of an in-RAM artefact. 
func (s *streamedStageSink) Bytes() []byte { return nil } From ae5188a7509f60a5aadde80ab16423a53de0f558 Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 2 Jun 2026 16:43:12 +0000 Subject: [PATCH 6/6] ota: harden fabric update flow --- main.go | 12 +- services/fabric/fabric_test.go | 760 +++++++++++++++++++++-- services/fabric/protocol.go | 27 +- services/fabric/remap.go | 18 + services/fabric/session.go | 392 +++++++++--- services/fabric/transfer.go | 164 +++-- services/fabric/transfer_sink_buffer.go | 57 +- services/fabric/transfer_sink_rp2350.go | 27 +- services/fabric/transfer_sink_stub.go | 2 +- services/fabric/transfer_test.go | 777 +++++++++++++++++++++++- services/fabric/writer.go | 2 +- services/reactor/qa_reactor.go | 26 +- services/reactor/reactor.go | 72 ++- services/reactor/reactor_test.go | 71 +++ services/updater/boot_id.go | 7 +- services/updater/facts.go | 8 + services/updater/prestage_host.go | 25 +- services/updater/prestage_tinygo.go | 26 +- services/updater/receiver.go | 107 +++- services/updater/rpc.go | 31 +- services/updater/stream_lease.go | 240 ++++++++ services/updater/types.go | 50 +- services/updater/updater.go | 269 ++++++-- services/updater/updater_test.go | 766 +++++++++++++++++++++-- 24 files changed, 3505 insertions(+), 431 deletions(-) create mode 100644 services/updater/stream_lease.go diff --git a/main.go b/main.go index 978f3c8..182def0 100644 --- a/main.go +++ b/main.go @@ -10,6 +10,7 @@ import ( "devicecode-go/services/updater" "devicecode-go/types" "devicecode-go/utilities" + "pico2-a-b/abupdate" ) // HAL @@ -35,6 +36,11 @@ func main() { time.Sleep(3 * time.Second) log.SetStart(time.Now()) + bootBuyRC := abupdate.CheckAndBuy() + if bootBuyRC != 0 { + log.Println("[main] abupdate CheckAndBuy rc =", bootBuyRC) + } + ctx := context.Background() log.Println("[main] bootstrapping bus …") @@ -58,8 +64,8 @@ func main() { } } - // boot_id (master R3 / fabric-update W6): generate AFTER HAL ready - // and BEFORE the reactor opens 
fabric. RAM-only — never persisted. + // boot_id: generate AFTER HAL ready and BEFORE the reactor opens + // fabric. RAM-only — never persisted. bootID := updater.GenerateBootID() log.Println("[main] boot_id =", bootID) @@ -68,7 +74,7 @@ func main() { reactor.FirmwareImageID = FirmwareImageID // Reactor - r := reactor.NewReactor(b, uiConn) + r := reactor.NewReactorWithOptions(b, uiConn, reactor.Options{BootBuyRC: bootBuyRC}) r.Run(ctx) } diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 0aa15a1..ece5583 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -79,7 +79,7 @@ func bringUp(t *testing.T, cm5 Transport) protoHelloAck { func unlockExports(t *testing.T, cm5 Transport) { t.Helper() - sendMsg(t, cm5, protoPing{Type: "ping", TS: 77, SID: testCM5SID}) + sendMsg(t, cm5, protoPing{Type: "ping", SID: testCM5SID}) pong := readMsg[protoPong](t, cm5) if pong.Type != "pong" { t.Fatalf("expected pong, got %q", pong.Type) @@ -115,8 +115,8 @@ func TestCodecAllTypes(t *testing.T) { }{ {protoHello{Type: "hello"}, "hello"}, {protoHelloAck{Type: "hello_ack"}, "hello_ack"}, - {protoPing{Type: "ping", TS: 1}, "ping"}, - {protoPong{Type: "pong", TS: 2}, "pong"}, + {protoPing{Type: "ping"}, "ping"}, + {protoPong{Type: "pong"}, "pong"}, {protoPub{Type: "pub", Topic: []string{"a"}}, "pub"}, {protoUnretain{Type: "unretain", Topic: []string{"a"}}, "unretain"}, {protoCall{Type: "call", ID: "c1"}, "call"}, @@ -182,11 +182,11 @@ func TestTransportRoundTrip(t *testing.T) { t.Errorf("ReadLine: %v", err) return } - if string(line) != `{"type":"ping","ts":99}` { + if string(line) != `{"type":"ping","sid":"s1"}` { t.Errorf("got %q", line) } }() - sendMsg(t, a, protoPing{Type: "ping", TS: 99}) + sendMsg(t, a, protoPing{Type: "ping", SID: "s1"}) select { case <-done: case <-time.After(2 * time.Second): @@ -195,8 +195,8 @@ func TestTransportRoundTrip(t *testing.T) { } func TestOversizeLineRecovery(t *testing.T) { - big := 
`{"type":"ping","ts":0,"x":"` + strings.Repeat("x", maxLineLen+100) + `"}` - input := big + "\n" + `{"type":"ping","ts":3}` + "\n" + big := `{"type":"test","n":0,"x":"` + strings.Repeat("x", maxLineLen+100) + `"}` + input := big + "\n" + `{"type":"test","n":3}` + "\n" tr := newRWTransport(strings.NewReader(input), io.Discard) _, err := tr.ReadLine() if !errors.Is(err, ErrLineTooLong) { @@ -206,7 +206,7 @@ func TestOversizeLineRecovery(t *testing.T) { if err != nil { t.Fatalf("second ReadLine: %v", err) } - if string(line) != `{"type":"ping","ts":3}` { + if string(line) != `{"type":"test","n":3}` { t.Errorf("got %q", line) } } @@ -233,21 +233,21 @@ func TestShmringTransportRoundTrip(t *testing.T) { mcuTr := NewShmringTransport(rx, tx) defer mcuTr.Close() - rx.TryWriteFrom([]byte(`{"type":"ping","ts":42}` + "\n")) + rx.TryWriteFrom([]byte(`{"type":"test","n":42}` + "\n")) line, err := mcuTr.ReadLine() if err != nil { t.Fatalf("ReadLine: %v", err) } - if string(line) != `{"type":"ping","ts":42}` { + if string(line) != `{"type":"test","n":42}` { t.Errorf("got %q", line) } - if err := mcuTr.WriteLine([]byte(`{"type":"pong","ts":42}`)); err != nil { + if err := mcuTr.WriteLine([]byte(`{"type":"test","n":42}`)); err != nil { t.Fatalf("WriteLine: %v", err) } var out [128]byte n := tx.TryReadInto(out[:]) - if string(out[:n]) != `{"type":"pong","ts":42}`+"\n" { + if string(out[:n]) != `{"type":"test","n":42}`+"\n" { t.Errorf("tx got %q", out[:n]) } } @@ -256,13 +256,13 @@ func TestShmringTransportMultiLine(t *testing.T) { rx := shmring.New(256) tr := NewShmringTransport(rx, shmring.New(256)) defer tr.Close() - rx.TryWriteFrom([]byte(`{"type":"ping","ts":1}` + "\n" + `{"type":"ping","ts":2}` + "\n")) + rx.TryWriteFrom([]byte(`{"type":"test","n":1}` + "\n" + `{"type":"test","n":2}` + "\n")) line1, _ := tr.ReadLine() line2, _ := tr.ReadLine() - if string(line1) != `{"type":"ping","ts":1}` { + if string(line1) != `{"type":"test","n":1}` { t.Errorf("line1 = %q", line1) } - if 
string(line2) != `{"type":"ping","ts":2}` { + if string(line2) != `{"type":"test","n":2}` { t.Errorf("line2 = %q", line2) } } @@ -314,7 +314,7 @@ func TestShmringTransportWriteLineWrapsAcrossSegments(t *testing.T) { } func TestShmringTransportOversize(t *testing.T) { - // Ring must be larger than maxLineLen+100 + newline + the trailing ping + // Ring must be larger than maxLineLen+100 + newline + the trailing test // frame so the producer can deposit both lines without blocking. The rx // ring used to be 4096 when maxLineLen=2048, leaving comfortable // headroom; now that maxLineLen=4096, bump to 8192. @@ -327,7 +327,7 @@ func TestShmringTransportOversize(t *testing.T) { } rx.TryWriteFrom(big) rx.TryWriteFrom([]byte("\n")) - rx.TryWriteFrom([]byte(`{"type":"ping","ts":7}` + "\n")) + rx.TryWriteFrom([]byte(`{"type":"test","n":7}` + "\n")) _, err := tr.ReadLine() if !errors.Is(err, ErrLineTooLong) { t.Fatalf("expected ErrLineTooLong, got %v", err) @@ -336,7 +336,7 @@ func TestShmringTransportOversize(t *testing.T) { if err != nil { t.Fatalf("second ReadLine: %v", err) } - if string(line) != `{"type":"ping","ts":7}` { + if string(line) != `{"type":"test","n":7}` { t.Errorf("got %q", line) } } @@ -370,9 +370,9 @@ func TestHandshake(t *testing.T) { t.Errorf("bad ack: %+v", ack) } time.Sleep(50 * time.Millisecond) - sendMsg(t, cm5, protoPing{Type: "ping", TS: 99, SID: "s1"}) + sendMsg(t, cm5, protoPing{Type: "ping", SID: "s1"}) pong := readMsg[protoPong](t, cm5) - if pong.TS != 99 || pong.SID != ack.SID { + if pong.SID != ack.SID { t.Errorf("bad pong: %+v ack=%+v", pong, ack) } } @@ -390,13 +390,80 @@ func TestSessionReset(t *testing.T) { if ack.SID == "" || ack.Proto != protocolName { t.Errorf("bad hello_ack: %+v", ack) } - sendMsg(t, cm5, protoPing{Type: "ping", TS: 55, SID: "s2"}) + sendMsg(t, cm5, protoPing{Type: "ping", SID: "s2"}) pong := readMsg[protoPong](t, cm5) - if pong.TS != 55 || pong.SID != ack.SID { + if pong.SID != ack.SID { t.Errorf("bad pong: %+v 
ack=%+v", pong, ack) } } +func TestDuplicateSameSIDHelloRefreshesWithoutReset(t *testing.T) { + tr := &captureTransport{} + sink := &fakeTransferSink{} + s := session{ + tr: tr, + link: linkUp, + localSID: "mcu-sid-test", + nodeID: "mcu", + peerID: "bigbox-cm5", + peerSID: "s1", + peerNode: "bigbox-cm5", + incomingTransfer: &incomingTransfer{ + meta: transferMeta{ID: "xfer-1"}, + sink: sink, + }, + } + + s.onHello(&protoHello{Type: msgHello, Proto: protocolName, Node: "bigbox-cm5", SID: "s1"}) + + if len(tr.writes) != 1 { + t.Fatalf("hello_ack writes = %d, want 1", len(tr.writes)) + } + var ack protoHelloAck + if err := json.Unmarshal(tr.writes[0], &ack); err != nil { + t.Fatalf("hello_ack decode failed: %v", err) + } + if ack.Type != msgHelloAck || ack.SID != "mcu-sid-test" || ack.Node != "mcu" { + t.Fatalf("bad hello_ack: %+v", ack) + } + if s.incomingTransfer == nil || len(sink.abortReasons) != 0 { + t.Fatalf("same-SID hello reset transfer: incoming=%v aborts=%v", s.incomingTransfer != nil, sink.abortReasons) + } + if s.peerSID != "s1" || s.peerNode != "bigbox-cm5" { + t.Fatalf("peer identity changed incorrectly: sid=%q node=%q", s.peerSID, s.peerNode) + } +} + +func TestDuplicateSameSIDHelloAckRefreshesWithoutReset(t *testing.T) { + tr := &captureTransport{} + sink := &fakeTransferSink{} + s := session{ + tr: tr, + link: linkUp, + localSID: "mcu-sid-test", + nodeID: "mcu", + peerID: "bigbox-cm5", + peerSID: "s1", + peerNode: "bigbox-cm5", + incomingTransfer: &incomingTransfer{ + meta: transferMeta{ID: "xfer-1"}, + sink: sink, + }, + } + + s.onHelloAck(&protoHelloAck{Type: msgHelloAck, Proto: protocolName, Node: "bigbox-cm5", SID: "s1"}) + + if len(tr.writes) != 0 { + t.Fatalf("hello_ack refresh wrote %d frames, want 0", len(tr.writes)) + } + if s.incomingTransfer == nil || len(sink.abortReasons) != 0 { + t.Fatalf("same-SID hello_ack reset transfer: incoming=%v aborts=%v", s.incomingTransfer != nil, sink.abortReasons) + } + if s.peerSID != "s1" || s.peerNode != 
"bigbox-cm5" { + t.Fatalf("peer identity changed incorrectly: sid=%q node=%q", s.peerSID, s.peerNode) + } +} + func TestRejectsWrongNode(t *testing.T) { mcu, cm5 := pipePair() b := newBus() @@ -433,6 +500,20 @@ func TestRejectsWrongNode(t *testing.T) { } } +func TestRejectsWrongNodeHelloAck(t *testing.T) { + s := session{peerID: "bigbox-cm5"} + s.onHelloAck(&protoHelloAck{ + Type: msgHelloAck, + Proto: protocolName, + Node: "cm5-wrong", + SID: "s1", + }) + + if s.link == linkUp || s.peerSID != "" || s.peerNode != "" { + t.Fatalf("wrong-node hello_ack changed session: link=%v peer_sid=%q peer_node=%q", s.link, s.peerSID, s.peerNode) + } +} + func TestRejectsMissingNodeWhenPeerPinned(t *testing.T) { mcu, cm5 := pipePair() b := newBus() @@ -478,9 +559,9 @@ func TestPingPong(t *testing.T) { defer cancel() go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) ack := bringUp(t, cm5) - sendMsg(t, cm5, protoPing{Type: "ping", TS: 42, SID: "s1"}) + sendMsg(t, cm5, protoPing{Type: "ping", SID: "s1"}) pong := readMsg[protoPong](t, cm5) - if pong.TS != 42 || pong.SID != ack.SID { + if pong.SID != ack.SID { t.Errorf("bad pong: %+v ack=%+v", pong, ack) } } @@ -493,11 +574,11 @@ func TestEchoedPingIgnored(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) ack := bringUp(t, cm5) - sendMsg(t, cm5, protoPing{Type: "ping", TS: 41, SID: ack.SID}) - sendMsg(t, cm5, protoPing{Type: "ping", TS: 42, SID: testCM5SID}) + sendMsg(t, cm5, protoPing{Type: "ping", SID: ack.SID}) + sendMsg(t, cm5, protoPing{Type: "ping", SID: testCM5SID}) pong := readMsg[protoPong](t, cm5) - if pong.TS != 42 || pong.SID != ack.SID { + if pong.SID != ack.SID { t.Errorf("bad pong after echoed ping: %+v ack=%+v", pong, ack) } } @@ -511,10 +592,10 @@ func TestEchoedTransferControlIgnored(t *testing.T) { bringUp(t, cm5) sendMsg(t, cm5, protoXferNeed{Type: msgXferNeed, XferID: "echoed", Next: 0}) - sendMsg(t, cm5, protoPing{Type: 
"ping", TS: 42, SID: testCM5SID}) + sendMsg(t, cm5, protoPing{Type: "ping", SID: testCM5SID}) pong := readMsg[protoPong](t, cm5) - if pong.TS != 42 { + if pong.Type != msgPong { t.Errorf("bad pong after echoed transfer control: %+v", pong) } } @@ -558,19 +639,38 @@ func TestSessionPingsUnconditionally(t *testing.T) { } func TestReadyHeldUntilExportHoldoff(t *testing.T) { - // session_ctl.lua / rpc_bridge.lua: ready == established and rpc_ready, - // where rpc_ready edges true only after retained replay completes. // The Go side gates rpcReady on exportReadyAt elapsing post-handshake. mcu, cm5 := pipePair() b := newBus() observer := b.NewConnection("observer") sub := observer.Subscribe(bus.T("state", "fabric", "link", "mcu-uart0")) defer observer.Unsubscribe(sub) + publisher := b.NewConnection("publisher") + publisher.Publish(publisher.NewMessage( + bus.T("state", "self", "software"), + map[string]string{"image_id": "mcu-new", "boot_id": "boot-new"}, + true, + )) + publisher.Publish(publisher.NewMessage( + bus.T("state", "self", "updater"), + map[string]string{"state": "running"}, + true, + )) + publisher.Publish(publisher.NewMessage( + bus.T("state", "self", "health"), + map[string]string{"state": "ok"}, + true, + )) ctx, cancel := context.WithCancel(context.Background()) defer cancel() go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) bringUp(t, cm5) + go func() { + for i := 0; i < len(criticalExportTopics); i++ { + _, _ = cm5.ReadLine() + } + }() var sawNotReady, sawReady bool deadline := time.After(3 * time.Second) @@ -858,10 +958,10 @@ func TestUnknownTypeIgnored(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) bringUp(t, cm5) cm5.WriteLine([]byte(`{"type":"future_msg"}`)) - sendMsg(t, cm5, protoPing{Type: "ping", TS: 1}) + sendMsg(t, cm5, protoPing{Type: "ping", SID: testCM5SID}) pong := readMsg[protoPong](t, cm5) - if pong.TS != 1 { - t.Errorf("pong.TS = %d", pong.TS) + 
if pong.Type != msgPong { + t.Errorf("bad pong: %+v", pong) } } @@ -873,10 +973,10 @@ func TestMalformedJSONIgnored(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) bringUp(t, cm5) cm5.WriteLine([]byte("not json")) - sendMsg(t, cm5, protoPing{Type: "ping", TS: 2}) + sendMsg(t, cm5, protoPing{Type: "ping", SID: testCM5SID}) pong := readMsg[protoPong](t, cm5) - if pong.TS != 2 { - t.Errorf("pong.TS = %d", pong.TS) + if pong.Type != msgPong { + t.Errorf("bad pong: %+v", pong) } } @@ -904,12 +1004,33 @@ func TestLinkStatePublishedOnHandshake(t *testing.T) { observer := b.NewConnection("observer") sub := observer.Subscribe(bus.T("state", "fabric", "link", "mcu-uart0")) defer observer.Unsubscribe(sub) + publisher := b.NewConnection("publisher") + publisher.Publish(publisher.NewMessage( + bus.T("state", "self", "software"), + map[string]string{"image_id": "mcu-new", "boot_id": "boot-new"}, + true, + )) + publisher.Publish(publisher.NewMessage( + bus.T("state", "self", "updater"), + map[string]string{"state": "running"}, + true, + )) + publisher.Publish(publisher.NewMessage( + bus.T("state", "self", "health"), + map[string]string{"state": "ok"}, + true, + )) ctx, cancel := context.WithCancel(context.Background()) defer cancel() go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) ack := bringUp(t, cm5) + go func() { + for i := 0; i < len(criticalExportTopics); i++ { + _, _ = cm5.ReadLine() + } + }() var sawOpening bool deadline := time.After(2 * time.Second) @@ -1130,8 +1251,8 @@ func TestDrainExportsPausesDuringIncomingTransfer(t *testing.T) { defer s.teardownExports() pubConn.Publish(pubConn.NewMessage( - bus.T("state", "self", "runtime", "memory"), - map[string]int{"alloc_bytes": 241376}, + bus.T("state", "self", "software"), + map[string]string{"image_id": "mcu-new", "boot_id": "boot-new"}, true, )) s.drainExports() @@ -1173,6 +1294,11 @@ func TestDrainExportsPausesAfterPrepareCall(t 
*testing.T) { Topic: []string{"cap", "self", "updater", "main", "rpc", "prepare-update"}, }) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "software"), + map[string]string{"image_id": "mcu-new", "boot_id": "boot-new"}, + true, + )) pubConn.Publish(pubConn.NewMessage( bus.T("state", "self", "updater"), map[string]any{ @@ -1182,6 +1308,11 @@ func TestDrainExportsPausesAfterPrepareCall(t *testing.T) { }, true, )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "health"), + map[string]string{"state": "ok"}, + true, + )) s.drainExports() if len(tr.writes) != 0 { @@ -1197,18 +1328,485 @@ func TestDrainExportsPausesAfterPrepareCall(t *testing.T) { } } +func TestDrainExportsAllowsOnlyCriticalFactsDuringPostTransferQuiet(t *testing.T) { + b := bus.NewBus(16, "+", "#") + fabricConn := b.NewConnection("fabric") + pubConn := b.NewConnection("publisher") + tr := &captureTransport{} + s := session{ + conn: fabricConn, + tr: tr, + link: linkUp, + exportsEnabled: true, + exportReadyAt: time.Now().Add(-time.Second), + transferQuietUntil: time.Now().Add(time.Second), + transferQuietReason: "xfer_done", + } + + s.setupExports() + defer s.teardownExports() + + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "runtime", "memory"), + map[string]int{"alloc_bytes": 241376}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "software"), + map[string]string{"image_id": "mcu-new", "boot_id": "boot-new"}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "updater"), + map[string]string{"state": "rebooting"}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "health"), + map[string]string{"state": "ok"}, + true, + )) + + for i := 0; i < len(criticalExportTopics)+4; i++ { + s.drainExports() + } + if len(tr.writes) != len(criticalExportTopics) { + t.Fatalf("writes during post-transfer quiet = %d, want %d critical facts", + len(tr.writes), len(criticalExportTopics)) + } + want 
:= [][]string{ + {"state", "self", "software"}, + {"state", "self", "updater"}, + {"state", "self", "health"}, + } + for i, topic := range want { + pub := decodePubWrite(t, tr.writes[i]) + if !slicesEqual(pub.Topic, topic) { + t.Fatalf("write %d topic = %v, want %v", i, pub.Topic, topic) + } + } +} + +func TestDrainExportsPrioritizesCriticalRetainedFacts(t *testing.T) { + b := bus.NewBus(16, "+", "#") + fabricConn := b.NewConnection("fabric") + pubConn := b.NewConnection("publisher") + tr := &captureTransport{} + + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "runtime", "memory"), + map[string]int{"alloc_bytes": 241376}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "health"), + map[string]string{"state": "ok"}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "updater"), + map[string]string{"state": "running"}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "software"), + map[string]string{"image_id": "mcu-new", "boot_id": "boot-new"}, + true, + )) + + s := session{ + conn: fabricConn, + tr: tr, + link: linkUp, + exportsEnabled: true, + exportReadyAt: time.Now().Add(-time.Second), + } + s.setupExports() + defer s.teardownExports() + + for i := 0; i < 3; i++ { + s.drainExports() + } + if len(tr.writes) != 3 { + t.Fatalf("writes after critical drains = %d, want 3", len(tr.writes)) + } + + want := [][]string{ + {"state", "self", "software"}, + {"state", "self", "updater"}, + {"state", "self", "health"}, + } + for i, topic := range want { + pub := decodePubWrite(t, tr.writes[i]) + if !slicesEqual(pub.Topic, topic) { + t.Fatalf("write %d topic = %v, want %v", i, pub.Topic, topic) + } + if !pub.Retain { + t.Fatalf("write %d retain = false, want true", i) + } + } + + for i := 0; i < 8; i++ { + s.drainExports() + } + counts := map[string]int{} + for _, write := range tr.writes { + pub := decodePubWrite(t, write) + counts[wireTopicString(pub.Topic)]++ + } + for _, topic := 
range want { + key := wireTopicString(topic) + if counts[key] != 1 { + t.Fatalf("critical topic %s sent %d times, want exactly once", key, counts[key]) + } + } + if counts["state/self/runtime/memory"] != 1 { + t.Fatalf("telemetry topic sent %d times, want once", counts["state/self/runtime/memory"]) + } +} + +func TestDrainCriticalExportsCoalescesLatestRetainedFact(t *testing.T) { + b := bus.NewBus(16, "+", "#") + fabricConn := b.NewConnection("fabric") + pubConn := b.NewConnection("publisher") + tr := &captureTransport{} + + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "software"), + map[string]string{"image_id": "old", "boot_id": "boot-old"}, + true, + )) + + s := session{ + conn: fabricConn, + tr: tr, + link: linkUp, + exportsEnabled: true, + exportReadyAt: time.Now().Add(-time.Second), + } + s.setupExports() + defer s.teardownExports() + + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "software"), + map[string]string{"image_id": "new", "boot_id": "boot-new"}, + true, + )) + + s.drainExports() + if len(tr.writes) != 1 { + t.Fatalf("writes = %d, want 1", len(tr.writes)) + } + pub := decodePubWrite(t, tr.writes[0]) + if !slicesEqual(pub.Topic, []string{"state", "self", "software"}) { + t.Fatalf("topic = %v, want state/self/software", pub.Topic) + } + var payload map[string]string + if err := json.Unmarshal(pub.Payload, &payload); err != nil { + t.Fatalf("payload unmarshal: %v", err) + } + if payload["image_id"] != "new" || payload["boot_id"] != "boot-new" { + t.Fatalf("payload = %+v, want newest software fact", payload) + } +} + +func TestReadyWaitsForQueuedCriticalReplayAdmission(t *testing.T) { + b := bus.NewBus(16, "+", "#") + fabricConn := b.NewConnection("fabric") + pubConn := b.NewConnection("publisher") + watchConn := b.NewConnection("watch") + linkSub := watchConn.Subscribe(bus.T("state", "fabric", "link", defaultLinkID)) + defer watchConn.Unsubscribe(linkSub) + tr := &captureTransport{} + + pubConn.Publish(pubConn.NewMessage( 
+ bus.T("state", "self", "software"), + map[string]string{"image_id": "mcu-new", "boot_id": "boot-new"}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "updater"), + map[string]string{"state": "idle"}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "health"), + map[string]string{"state": "ok"}, + true, + )) + + s := session{ + linkID: defaultLinkID, + peerID: "mcu", + localSID: "mcu-sid", + peerSID: "cm5-sid", + conn: fabricConn, + tr: tr, + link: linkUp, + exportsEnabled: true, + exportReadyAt: time.Now().Add(-time.Second), + } + s.setupExports() + defer s.teardownExports() + + s.tickReady(time.Now()) + if s.rpcReady { + t.Fatal("rpcReady raised before critical replay drain") + } + if _, ok := readLinkState(linkSub); ok { + t.Fatal("link state published before critical replay drain") + } + + for i := 0; i < len(criticalExportTopics)-1; i++ { + s.drainExports() + s.tickReady(time.Now()) + if s.rpcReady { + t.Fatalf("rpcReady raised after %d critical writes, want still false", i+1) + } + if _, ok := readLinkState(linkSub); ok { + t.Fatalf("link state published after %d critical writes, want none", i+1) + } + } + + s.drainExports() + s.tickReady(time.Now()) + if !s.rpcReady { + t.Fatal("rpcReady did not raise after critical replay drain") + } + state, ok := readLinkState(linkSub) + if !ok { + t.Fatal("missing ready link state publish") + } + if !state.Ready || state.Status != statusReady { + t.Fatalf("link state = %+v, want ready", state) + } + if len(tr.writes) != len(criticalExportTopics) { + t.Fatalf("critical writes = %d, want %d", len(tr.writes), len(criticalExportTopics)) + } +} + +func TestReadyBlocksWhenCriticalReplayFactsAreAbsentAndSuppressesTelemetry(t *testing.T) { + b := bus.NewBus(16, "+", "#") + fabricConn := b.NewConnection("fabric") + pubConn := b.NewConnection("publisher") + tr := &captureTransport{} + + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "runtime", "memory"), + 
map[string]int{"alloc_bytes": 241376}, + true, + )) + + s := session{ + linkID: defaultLinkID, + peerID: "mcu", + localSID: "mcu-sid", + peerSID: "cm5-sid", + conn: fabricConn, + tr: tr, + link: linkUp, + exportsEnabled: true, + exportReadyAt: time.Now().Add(-time.Second), + } + s.setupExports() + defer s.teardownExports() + + s.tickReady(time.Now()) + if s.rpcReady { + t.Fatal("rpcReady raised before critical replay drain") + } + for i := 0; i < 3; i++ { + s.drainExports() + s.tickReady(time.Now()) + } + if s.rpcReady { + t.Fatal("rpcReady raised after absent critical replay facts") + } + if len(tr.writes) != 0 { + t.Fatalf("writes = %d, want no telemetry while critical replay is absent", len(tr.writes)) + } +} + +func TestLateCriticalExportsDrainBeforeWildcardTelemetryAndReady(t *testing.T) { + b := bus.NewBus(16, "+", "#") + fabricConn := b.NewConnection("fabric") + pubConn := b.NewConnection("publisher") + tr := &captureTransport{} + + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "runtime", "memory"), + map[string]int{"alloc_bytes": 241376}, + true, + )) + + s := session{ + linkID: defaultLinkID, + peerID: "mcu", + localSID: "mcu-sid", + peerSID: "cm5-sid", + conn: fabricConn, + tr: tr, + link: linkUp, + exportsEnabled: true, + exportReadyAt: time.Now().Add(-time.Second), + } + s.setupExports() + defer s.teardownExports() + + s.drainExports() + s.tickReady(time.Now()) + if s.rpcReady { + t.Fatal("rpcReady raised before initial critical replay facts") + } + if len(tr.writes) != 0 { + t.Fatalf("initial writes = %d, want no telemetry before critical replay", len(tr.writes)) + } + start := len(tr.writes) + + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "runtime", "cpu"), + map[string]int{"load_pct": 42}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "health"), + map[string]string{"state": "ok"}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "updater"), + 
map[string]string{"state": "running"}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "software"), + map[string]string{"image_id": "mcu-new", "boot_id": "boot-new"}, + true, + )) + + wantCritical := [][]string{ + {"state", "self", "software"}, + {"state", "self", "updater"}, + {"state", "self", "health"}, + } + for i, topic := range wantCritical { + s.drainExports() + pub := decodePubWrite(t, tr.writes[start+i]) + if !slicesEqual(pub.Topic, topic) { + t.Fatalf("post-ready write %d topic = %v, want %v", i, pub.Topic, topic) + } + s.tickReady(time.Now()) + if i < len(wantCritical)-1 && s.rpcReady { + t.Fatalf("rpcReady raised after %d critical writes, want still false", i+1) + } + } + if !s.rpcReady { + t.Fatal("rpcReady did not raise after all critical facts were exported") + } + + for i := 0; i < 8; i++ { + s.drainExports() + } + counts := map[string]int{} + for _, write := range tr.writes[start:] { + pub := decodePubWrite(t, write) + counts[wireTopicString(pub.Topic)]++ + } + for _, topic := range wantCritical { + key := wireTopicString(topic) + if counts[key] != 1 { + t.Fatalf("post-ready critical topic %s sent %d times, want exactly once", key, counts[key]) + } + } + if counts["state/self/runtime/cpu"] != 1 { + t.Fatalf("post-ready telemetry sent %d times, want once", counts["state/self/runtime/cpu"]) + } + + start = len(tr.writes) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "runtime", "temperature"), + map[string]int{"deci_c": 421}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "health"), + map[string]string{"state": "ok", "reason": "ready-edge"}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "updater"), + map[string]string{"state": "idle"}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "software"), + map[string]string{"image_id": "mcu-newer", "boot_id": "boot-newer"}, + true, + )) + for i, topic := range wantCritical { + 
s.drainExports() + pub := decodePubWrite(t, tr.writes[start+i]) + if !slicesEqual(pub.Topic, topic) { + t.Fatalf("post-ready write %d topic = %v, want %v", i, pub.Topic, topic) + } + } + for i := 0; i < 8; i++ { + s.drainExports() + } + counts = map[string]int{} + for _, write := range tr.writes[start:] { + pub := decodePubWrite(t, write) + counts[wireTopicString(pub.Topic)]++ + } + for _, topic := range wantCritical { + key := wireTopicString(topic) + if counts[key] != 1 { + t.Fatalf("post-ready critical topic %s sent %d times, want exactly once", key, counts[key]) + } + } + if counts["state/self/runtime/temperature"] != 1 { + t.Fatalf("post-ready telemetry sent %d times, want once", counts["state/self/runtime/temperature"]) + } +} + +func decodePubWrite(t *testing.T, line []byte) protoPub { + t.Helper() + var pub protoPub + if err := json.Unmarshal(line, &pub); err != nil { + t.Fatalf("Unmarshal pub %q: %v", line, err) + } + if pub.Type != msgPub { + t.Fatalf("frame type = %q, want %q", pub.Type, msgPub) + } + return pub +} + +func readLinkState(sub *bus.Subscription) (linkStatePayload, bool) { + select { + case msg, ok := <-sub.Channel(): + if !ok || msg == nil { + return linkStatePayload{}, false + } + state, ok := msg.Payload.(linkStatePayload) + return state, ok + default: + return linkStatePayload{}, false + } +} + func TestPongAllowedDuringIncomingTransfer(t *testing.T) { tr := &captureTransport{} s := session{ tr: tr, link: linkUp, localSID: "mcu-sid-test", + peerSID: "cm5-sid", incomingTransfer: &incomingTransfer{ meta: transferMeta{ID: "xfer-1"}, }, } - s.onPing(&protoPing{Type: msgPing, TS: 42, SID: "cm5-sid"}) + s.onPing(&protoPing{Type: msgPing, SID: "cm5-sid"}) if len(tr.writes) != 1 { t.Fatalf("pong writes during transfer = %d, want 1", len(tr.writes)) @@ -1217,25 +1815,99 @@ func TestPongAllowedDuringIncomingTransfer(t *testing.T) { if err := json.Unmarshal(tr.writes[0], &pong); err != nil { t.Fatalf("pong decode failed: %v", err) } - if pong.Type 
!= msgPong || pong.SID != "mcu-sid-test" || pong.TS != 42 { + if pong.Type != msgPong || pong.SID != "mcu-sid-test" { + t.Fatalf("bad pong: %+v", pong) + } +} + +func TestPongAllowedDuringPrepareQuietForEstablishedPeer(t *testing.T) { + tr := &captureTransport{} + s := session{ + tr: tr, + link: linkUp, + localSID: "mcu-sid-test", + peerSID: "cm5-sid", + transferQuietUntil: time.Now().Add(time.Second), + transferQuietReason: "prepare_call_rx", + } + + s.onPing(&protoPing{Type: msgPing, SID: "cm5-sid"}) + + if len(tr.writes) != 1 { + t.Fatalf("pong writes during prepare quiet = %d, want 1", len(tr.writes)) + } + var pong protoPong + if err := json.Unmarshal(tr.writes[0], &pong); err != nil { + t.Fatalf("pong decode failed: %v", err) + } + if pong.Type != msgPong || pong.SID != "mcu-sid-test" { t.Fatalf("bad pong: %+v", pong) } } -func TestPongSuppressedDuringPrepareQuiet(t *testing.T) { +func TestPongRejectsWrongSIDDuringPrepareQuiet(t *testing.T) { + tr := &captureTransport{} + s := session{ + tr: tr, + link: linkUp, + localSID: "mcu-sid-test", + peerSID: "cm5-sid", + transferQuietUntil: time.Now().Add(time.Second), + transferQuietReason: "prepare_call_rx", + } + + s.onPing(&protoPing{Type: msgPing, SID: "other-sid"}) + + if len(tr.writes) != 0 { + t.Fatalf("pong writes for wrong sid = %d, want 0", len(tr.writes)) + } +} + +func TestWrongSIDPingPongDoNotRefreshLiveness(t *testing.T) { + tr := &captureTransport{} + oldRx := time.Now().Add(-time.Hour) + s := session{ + tr: tr, + link: linkUp, + localSID: "mcu-sid-test", + peerSID: "cm5-sid", + lastRxAt: oldRx, + } + + s.dispatch(marshal(protoPing{Type: msgPing, SID: "other-sid"})) + if !s.lastRxAt.Equal(oldRx) { + t.Fatalf("wrong-sid ping refreshed liveness: got %v want %v", s.lastRxAt, oldRx) + } + if len(tr.writes) != 0 { + t.Fatalf("pong writes for wrong sid = %d, want 0", len(tr.writes)) + } + + s.dispatch(marshal(protoPong{Type: msgPong, SID: "other-sid"})) + if !s.lastRxAt.Equal(oldRx) { + t.Fatalf("wrong-sid 
pong refreshed liveness: got %v want %v", s.lastRxAt, oldRx) + } + + s.dispatch(marshal(protoPong{Type: msgPong, SID: "cm5-sid"})) + if !s.lastRxAt.After(oldRx) { + t.Fatalf("current peer pong did not refresh liveness: got %v old %v", s.lastRxAt, oldRx) + } +} + +func TestPongRejectsSelfSIDDuringPrepareQuiet(t *testing.T) { tr := &captureTransport{} s := session{ tr: tr, link: linkUp, localSID: "mcu-sid-test", + peerSID: "mcu-sid-test", transferQuietUntil: time.Now().Add(time.Second), transferQuietReason: "prepare_call_rx", } - s.onPing(&protoPing{Type: msgPing, TS: 42, SID: "cm5-sid"}) + s.onPing(&protoPing{Type: msgPing, SID: "mcu-sid-test"}) if len(tr.writes) != 0 { - t.Fatalf("pong writes during prepare quiet = %d, want 0", len(tr.writes)) + t.Fatalf("pong writes for self sid = %d, want 0", len(tr.writes)) } } diff --git a/services/fabric/protocol.go b/services/fabric/protocol.go index a5d3bca..cf2b5a5 100644 --- a/services/fabric/protocol.go +++ b/services/fabric/protocol.go @@ -52,13 +52,11 @@ type protoHelloAck struct { type protoPing struct { Type string `json:"type"` - TS int64 `json:"ts"` SID string `json:"sid,omitempty"` } type protoPong struct { Type string `json:"type"` - TS int64 `json:"ts"` SID string `json:"sid,omitempty"` } @@ -177,6 +175,14 @@ func marshal(v any) []byte { // Returns "" if the line isn't a JSON object, the top-level "type" key // is missing, or its value isn't a string. 
func protoType(line []byte) string { + return protoTopString(line, "type") +} + +func protoXferID(line []byte) string { + return protoTopString(line, "xfer_id") +} + +func protoTopString(line []byte, field string) string { n := len(line) i := skipJSONSpace(line, 0) if i >= n || line[i] != '{' { @@ -213,10 +219,7 @@ func protoType(line []byte) string { if i >= n { return "" } - isType := keyEnd-1-keyStart == 4 && - line[keyStart] == 't' && line[keyStart+1] == 'y' && - line[keyStart+2] == 'p' && line[keyStart+3] == 'e' - if isType { + if jsonKeyEquals(line[keyStart:keyEnd-1], field) { if line[i] != '"' { return "" } @@ -234,6 +237,18 @@ func protoType(line []byte) string { } } +func jsonKeyEquals(key []byte, field string) bool { + if len(key) != len(field) { + return false + } + for i := 0; i < len(field); i++ { + if key[i] != field[i] { + return false + } + } + return true +} + func skipJSONSpace(line []byte, i int) int { for i < len(line) { switch line[i] { diff --git a/services/fabric/remap.go b/services/fabric/remap.go index c6993c3..7a3b4db 100644 --- a/services/fabric/remap.go +++ b/services/fabric/remap.go @@ -44,6 +44,12 @@ var ( wireUpdaterCommit = []string{"cap", "self", "updater", "main", "rpc", "commit-update"} ) +var criticalExportTopics = []bus.Topic{ + bus.T("state", "self", "software"), + bus.T("state", "self", "updater"), + bus.T("state", "self", "health"), +} + // cap/self/updater/main/rpc/{prepare-update,commit-update} land here from // the wire and are routed to local rpc/updater/{prepare,commit} where the // updater service binds. 
The updater package re-uses the same local topic @@ -96,6 +102,18 @@ func exportPatterns() []bus.Topic { return exportPatternsFor(exportPublishRules) } +func isCriticalExportTopic(t bus.Topic) bool { + if t == nil { + return false + } + for _, want := range criticalExportTopics { + if topicEquals(t, want) { + return true + } + } + return false +} + func exportCallTopic(t bus.Topic) []string { return busExport(t, exportCallRules) } diff --git a/services/fabric/session.go b/services/fabric/session.go index 3ad29b5..a75bb08 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -1,9 +1,11 @@ package fabric import ( + "bytes" "context" "encoding/json" "errors" + "io" "strings" "time" @@ -50,15 +52,18 @@ const ( exportTickInterval = 50 * time.Millisecond errPayloadMarshal = "payload_marshal_failed" - // The USB/UART path used during OTA echoes MCU-originated JSONL back into - // the MCU receiver. If exported retained state is in flight while CM5 starts - // an OTA transfer, the echoed line can contain CM5's xfer_begin spliced into - // the middle of the state pub. Hold exports quiet from prepare until either - // xfer_begin arrives or this window expires. + // Temporary transport recovery policy: the USB/UART path used during OTA + // can echo MCU-originated JSONL back into the MCU receiver. If exported + // retained state is in flight while CM5 starts an OTA transfer, the echoed + // line can contain CM5's xfer_begin spliced into the middle of the state pub. + // Hold exports quiet from prepare until either xfer_begin arrives or this + // window expires. Revisit after CM5 update-admission hardening so this does + // not become OTA semantics. transferPrepareQuiet = 10 * time.Second - // Keep telemetry/state exports quiet long enough for the host to send the - // follow-up updater commit call after xfer_done. On echo-prone UART links, - // retained export backlog can otherwise splice into the commit JSONL frame. 
+ // Temporary transport recovery policy: keep telemetry/state exports quiet + // long enough for the host to send the follow-up updater commit call after + // xfer_done. On echo-prone UART links, retained export backlog can otherwise + // splice into the commit JSONL frame. transferCompleteQuiet = 10 * time.Second ) @@ -141,21 +146,24 @@ type session struct { exportReadyAt time.Time exportsEnabled bool - exportSubs []*bus.Subscription - exportCallSubs []*bus.Subscription - inboundCalls []*inboundCall - outboundCalls []*outboundCall - nextOutboundID uint64 - nextPingAt time.Time - txControl txLane - txRPC txLane - txBulk txLane - importedRetained []bus.Topic // local topics currently retained on the bus due to wire imports - rpcReady bool // bridge replay complete; gates linkStatePayload.Ready - incomingTransfer *incomingTransfer - transferQuietUntil time.Time - transferQuietReason string - beginTransfer func(transferMeta) (transferSink, error) + criticalExportSubs []*bus.Subscription + criticalExportReplayPending []bool + exportSubs []*bus.Subscription + exportCallSubs []*bus.Subscription + inboundCalls []*inboundCall + outboundCalls []*outboundCall + nextOutboundID uint64 + nextPingAt time.Time + txControl txLane + txRPC txLane + txBulk txLane + importedRetained []bus.Topic // local topics currently retained on the bus due to wire imports + rpcReady bool // bridge replay complete; gates linkStatePayload.Ready + incomingTransfer *incomingTransfer + completedTransfers []completedTransfer + transferQuietUntil time.Time + transferQuietReason string + beginTransfer func(transferMeta) (transferSink, error) } func (s *session) log(msg string) { @@ -237,8 +245,11 @@ func (s *session) run(ctx context.Context) { s.handleLinkDown(reasonTransportDown, res.err.Error()) return } + beforeRx := s.lastRxAt s.dispatch(res.line) - resetTimer(stale, s.cfg.LivenessTimeout) + if s.lastRxAt.After(beforeRx) { + resetTimer(stale, s.cfg.LivenessTimeout) + } case <-exportTick.C: now := 
time.Now() @@ -345,6 +356,7 @@ func (s *session) handleLinkDown(reason, err string) { s.teardownOutbound(pendingReason) s.teardownImportedRetained() s.abortTransfer(pendingReason) + s.clearCompletedTransfers() s.publishLinkState(reason, err) if err != "" { s.logKV("link down", "err", err) @@ -374,6 +386,7 @@ func (s *session) promoteLink(reason string) { s.teardownInbound() s.teardownOutbound(reason) s.teardownImportedRetained() + s.clearCompletedTransfers() } s.link = linkUp s.rpcReady = false @@ -413,8 +426,8 @@ func (s *session) untrackImportedRetain(t bus.Topic) { } // tickReady promotes rpcReady once the post-handshake export holdoff has -// elapsed, mirroring rpc_bridge.lua's emit_rpc_ready(true) after retained -// replay. Re-publishes link state so consumers observe the ready edge. +// elapsed and the initial exact critical retained replay has been drained. +// Re-publishes link state so consumers observe the ready edge. func (s *session) tickReady(now time.Time) { if s.link != linkUp || s.rpcReady { return @@ -422,6 +435,9 @@ func (s *session) tickReady(now time.Time) { if s.exportReadyAt.IsZero() || now.Before(s.exportReadyAt) { return } + if !s.criticalExportReplayDrained() { + return + } s.rpcReady = true s.publishLinkState("", "") } @@ -434,14 +450,13 @@ func (s *session) dispatch(line []byte) { s.logMalformed(line, nil) return } - s.markRx() switch t { case msgHello: - typedDispatch(s, line, s.onHello) + typedDispatch(s, t, line, s.onHello) return case msgHelloAck: - typedDispatch(s, line, s.onHelloAck) + typedDispatch(s, t, line, s.onHelloAck) return } @@ -451,25 +466,25 @@ func (s *session) dispatch(line []byte) { switch t { case msgPing: - typedDispatch(s, line, s.onPing) + typedDispatch(s, t, line, s.onPing) case msgPong: - typedDispatch(s, line, s.onPong) + typedDispatch(s, t, line, s.onPong) case msgPub: - typedDispatch(s, line, s.onPub) + typedDispatch(s, t, line, s.onPub) case msgUnretain: - typedDispatch(s, line, s.onUnretain) + 
typedDispatch(s, t, line, s.onUnretain) case msgCall: - typedDispatch(s, line, s.onCall) + typedDispatch(s, t, line, s.onCall) case msgReply: - typedDispatch(s, line, s.onReply) + typedDispatch(s, t, line, s.onReply) case msgXferBegin: - typedDispatch(s, line, s.onTransferBegin) + typedDispatch(s, t, line, s.onTransferBegin) case msgXferChunk: - typedDispatch(s, line, s.onTransferChunk) + typedDispatch(s, t, line, s.onTransferChunk) case msgXferCommit: - typedDispatch(s, line, s.onTransferCommit) + typedDispatch(s, t, line, s.onTransferCommit) case msgXferAbort: - typedDispatch(s, line, s.onTransferAbort) + typedDispatch(s, t, line, s.onTransferAbort) case msgXferReady, msgXferNeed, msgXferDone: s.logKV("echoed transfer control ignored", "type", t) default: @@ -477,15 +492,47 @@ func (s *session) dispatch(line []byte) { } } -func typedDispatch[T any](s *session, line []byte, handler func(*T)) { +func typedDispatch[T any](s *session, msgType string, line []byte, handler func(*T)) { var msg T - if err := json.Unmarshal(line, &msg); err != nil { + dec := json.NewDecoder(bytes.NewReader(line)) + dec.DisallowUnknownFields() + if err := dec.Decode(&msg); err != nil { s.logMalformed(line, err) + s.retryMalformedTransferFrame(msgType, line) + return + } + var extra any + if err := dec.Decode(&extra); err != io.EOF { + if err == nil { + err = errors.New("trailing_json") + } + s.logMalformed(line, err) + s.retryMalformedTransferFrame(msgType, line) return } handler(&msg) } +func (s *session) retryMalformedTransferFrame(msgType string, line []byte) { + if msgType != msgXferChunk { + return + } + cur := s.incomingTransfer + if cur == nil { + return + } + id := protoXferID(line) + if id == "" { + s.logKV("malformed xfer_chunk dropped", "why", "missing_xfer_id") + return + } + if id != cur.meta.ID { + s.logKV("malformed xfer_chunk dropped", "id", id) + return + } + s.retryCorruptTransferFrame("bad_message") +} + func (s *session) requireLinkUp(t string) bool { if s.link != 
linkUp { s.logKV("dropped before handshake", "type", t) @@ -511,20 +558,9 @@ func (s *session) logMalformed(line []byte, err error) { ) } - // If a transfer is in flight, the dropped frame was very likely a - // corrupted xfer_chunk. Without an explicit signal CM5 keeps - // streaming chunks past the gap and the receiver silently drops - // them as out-of-order; the transfer eventually fails on the - // phase timeout. Re-request the next expected byte so CM5 - // retransmits from the gap. Cheap if it wasn't actually a chunk - // (the sender just gets one stale need frame and ignores it once - // it has caught up). - if cur := s.incomingTransfer; cur != nil { - s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) - // Refresh the idle-chunk deadline so a stream of malformed frames can - // recover instead of tripping phase_timeout mid-retry. - cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) - } + // Transfer retry signaling is handled by typedDispatch only for + // malformed active xfer_chunk frames. Other malformed frames are logged + // and dropped without consuming the transfer corruption budget. } // notePeerIdentity records the remote peer's node, SID, and protocol name. 
@@ -590,6 +626,18 @@ func wireTopicString(topic []string) string { return strings.Join(topic, "/") } +func validWireTopic(topic []string) bool { + if len(topic) == 0 { + return false + } + for _, part := range topic { + if part == "" { + return false + } + } + return true +} + func (s *session) extendTransferQuiet(reason string, d time.Duration) { now := time.Now() until := now.Add(d) @@ -613,6 +661,15 @@ func (s *session) transferQuiet(now time.Time) (bool, string) { return false, "" } +func quietAllowsCriticalExports(reason string) bool { + switch reason { + case "xfer_commit_target", "xfer_target_rejected", "xfer_done": + return true + default: + return false + } +} + func (s *session) onHello(msg *protoHello) { if msg.Proto != protocolName { s.log("hello dropped: unsupported proto") @@ -626,6 +683,7 @@ func (s *session) onHello(msg *protoHello) { s.log("hello dropped: wrong node") return } + s.markRx() reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) s.logKV("hello rx", "peer_sid", msg.SID) @@ -638,7 +696,9 @@ func (s *session) onHello(msg *protoHello) { return } s.log("hello_ack tx") - s.promoteLink(reason) + if s.link != linkUp || reason != "" { + s.promoteLink(reason) + } } func (s *session) onHelloAck(msg *protoHelloAck) { @@ -654,21 +714,29 @@ func (s *session) onHelloAck(msg *protoHelloAck) { s.log("hello_ack dropped: missing identity") return } + if s.peerID != "" && msg.Node != s.peerID { + s.log("hello_ack dropped: wrong node") + return + } + s.markRx() reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) s.logKV("hello_ack rx", "peer_sid", msg.SID) - s.promoteLink(reason) + if s.link != linkUp || reason != "" { + s.promoteLink(reason) + } } func (s *session) onPing(msg *protoPing) { - if s.isSelfControlFrame("", msg.SID) { - s.log("echoed ping ignored") + if s.link != linkUp || msg.SID != s.peerSID { return } - if !s.transferQuietUntil.IsZero() && time.Now().Before(s.transferQuietUntil) { + if s.isSelfControlFrame("", msg.SID) { + 
s.log("echoed ping ignored") return } + s.markRx() s.logKV("ping rx", "peer_sid", msg.SID) - if !s.sendControl(marshal(protoPong{Type: msgPong, TS: msg.TS, SID: s.localSID})) { + if !s.sendControl(marshal(protoPong{Type: msgPong, SID: s.localSID})) { return } s.log("pong tx") @@ -691,21 +759,29 @@ func (s *session) tickPing(now time.Time) { if s.nextPingAt.IsZero() || now.Before(s.nextPingAt) { return } - if !s.sendControl(marshal(protoPing{Type: msgPing, TS: now.UnixMilli(), SID: s.localSID})) { + if !s.sendControl(marshal(protoPing{Type: msgPing, SID: s.localSID})) { return } s.nextPingAt = now.Add(s.cfg.PingInterval) } func (s *session) onPong(msg *protoPong) { + if s.link != linkUp || msg.SID != s.peerSID { + return + } if s.isSelfControlFrame("", msg.SID) { s.log("echoed pong ignored") return } + s.markRx() s.lastPongAt = s.lastRxAt } func (s *session) onPub(msg *protoPub) { + if !validWireTopic(msg.Topic) { + s.log("incoming pub dropped: bad_topic") + return + } localTopic := importPublishTopic(msg.Topic) if localTopic == nil { if hasWirePrefix(msg.Topic, []string{"state"}) { @@ -716,6 +792,7 @@ func (s *session) onPub(msg *protoPub) { return } + s.markRx() s.conn.Publish(s.conn.NewMessage(localTopic, msg.Payload, msg.Retain)) if msg.Retain { s.trackImportedRetain(localTopic) @@ -728,16 +805,37 @@ func (s *session) onPub(msg *protoPub) { } func (s *session) onUnretain(msg *protoUnretain) { + if !validWireTopic(msg.Topic) { + s.log("incoming unretain dropped: bad_topic") + return + } localTopic := importPublishTopic(msg.Topic) if localTopic == nil { s.log("incoming unretain dropped: no_route") return } + s.markRx() s.conn.Publish(s.conn.NewMessage(localTopic, nil, true)) s.untrackImportedRetain(localTopic) } func (s *session) onCall(msg *protoCall) { + if msg.ID == "" { + s.log("incoming call dropped: missing_id") + return + } + if !validWireTopic(msg.Topic) { + s.log("incoming call dropped: bad_topic") + s.sendRPC(marshal(protoReply{Type: msgReply, Corr: 
msg.ID, OK: false, Err: "bad_topic"})) + return + } + for _, call := range s.inboundCalls { + if call.id == msg.ID { + s.logKV("incoming call dropped", "err", "duplicate_call_id") + s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: "duplicate_call_id"})) + return + } + } if len(s.inboundCalls) >= s.cfg.MaxInboundHelpers { s.log("incoming call dropped: busy") s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: reasonBusy})) @@ -751,6 +849,7 @@ func (s *session) onCall(msg *protoCall) { return } + s.markRx() isTransferPrepare := wireTopicEquals(msg.Topic, wireUpdaterPrepare) if isTransferPrepare { s.extendTransferQuiet("prepare_call_rx", transferPrepareQuiet) @@ -773,10 +872,15 @@ func (s *session) onCall(msg *protoCall) { } func (s *session) onReply(msg *protoReply) { + if msg.Corr == "" { + s.log("reply dropped: missing_id") + return + } for i, call := range s.outboundCalls { if call.id != msg.Corr { continue } + s.markRx() s.outboundCalls = append(s.outboundCalls[:i], s.outboundCalls[i+1:]...) 
if !call.req.CanReply() { return @@ -842,6 +946,10 @@ func (s *session) setupExports() { if s.conn == nil { return } + for _, p := range criticalExportTopics { + s.criticalExportSubs = append(s.criticalExportSubs, s.conn.Subscribe(p)) + s.criticalExportReplayPending = append(s.criticalExportReplayPending, true) + } for _, p := range exportPatterns() { s.exportSubs = append(s.exportSubs, s.conn.Subscribe(p)) } @@ -851,6 +959,11 @@ func (s *session) setupExports() { } func (s *session) teardownExports() { + for _, sub := range s.criticalExportSubs { + s.conn.Unsubscribe(sub) + } + s.criticalExportSubs = nil + s.criticalExportReplayPending = nil for _, sub := range s.exportSubs { s.conn.Unsubscribe(sub) } @@ -880,6 +993,100 @@ func (s *session) teardownOutbound(reason string) { s.outboundCalls = nil } +func (s *session) sendExportMessage(m *bus.Message) (bool, bool) { + if m == nil { + return false, true + } + wire := exportTopic(m.Topic) + if wire == nil { + return false, true + } + if m.Retained && m.Payload == nil { + if !s.sendRPC(marshal(protoUnretain{ + Type: msgUnretain, + Topic: wire, + })) { + return false, false + } + return true, true + } + payload, err := marshalPayload(m.Payload) + if err != nil { + s.logKV("export payload dropped", "err", err.Error()) + return false, true + } + if fabricTraceEnabled { + println( + "[fabric]", "sid", s.localSID, + "export pub tx", + "topic", wireTopicString(wire), + "retain", m.Retained, + "payload_len", strconvx.Itoa(len(payload)), + ) + } + if !s.sendRPC(marshal(protoPub{ + Type: msgPub, + Topic: wire, + Payload: payload, + Retain: m.Retained, + })) { + return false, false + } + return true, true +} + +func latestSubscriptionMessage(sub *bus.Subscription) *bus.Message { + var latest *bus.Message + for { + select { + case m, ok := <-sub.Channel(): + if !ok { + return latest + } + if m != nil { + latest = m + } + default: + return latest + } + } +} + +func (s *session) criticalExportReplayDrained() bool { + for _, pending 
:= range s.criticalExportReplayPending { + if pending { + return false + } + } + return true +} + +func (s *session) drainCriticalExports(total *int) bool { + for i, sub := range s.criticalExportSubs { + if *total >= exportMaxPerTick { + return true + } + m := latestSubscriptionMessage(sub) + if m == nil { + if i < len(s.criticalExportReplayPending) && s.criticalExportReplayPending[i] { + return true + } + continue + } + sent, ok := s.sendExportMessage(m) + if !ok { + return false + } + if sent && i < len(s.criticalExportReplayPending) && s.criticalExportReplayPending[i] { + s.criticalExportReplayPending[i] = false + } + if sent { + (*total)++ + } + } + return true +} + // drainExports does a non-blocking read of each export subscription // and writes any messages to the wire. Called from the main loop. func (s *session) drainExports() { @@ -887,19 +1094,29 @@ func (s *session) drainExports() { return } now := time.Now() - if quiet, _ := s.transferQuiet(now); quiet { - // Avoid colliding telemetry/state exports with prepare/xfer traffic on - // echo-prone links. Queued retained state can be exported after the OTA - // control/data path has gone quiet. - return - } + quiet, quietReason := s.transferQuiet(now) if !s.exportsEnabled { return } if !s.exportReadyAt.IsZero() && now.Before(s.exportReadyAt) { return } + if quiet && !quietAllowsCriticalExports(quietReason) { + // Avoid colliding telemetry/state exports with prepare/xfer traffic on + // echo-prone links. Post-transfer quiet allows critical facts below so + // state=rebooting can reach CM5 before the reboot arm. 
+ return + } total := 0 + if !s.drainCriticalExports(&total) { + return + } + if quiet { + return + } + if !s.criticalExportReplayDrained() { + return + } for _, sub := range s.exportSubs { for { if total >= exportMaxPerTick { @@ -910,43 +1127,16 @@ func (s *session) drainExports() { if !ok || m == nil { goto nextSub } - wire := exportTopic(m.Topic) - if wire == nil { + if len(s.criticalExportSubs) > 0 && isCriticalExportTopic(m.Topic) { continue } - if m.Retained && m.Payload == nil { - if !s.sendRPC(marshal(protoUnretain{ - Type: msgUnretain, - Topic: wire, - })) { - return - } - total++ - continue - } - payload, err := marshalPayload(m.Payload) - if err != nil { - s.logKV("export payload dropped", "err", err.Error()) - continue - } - if fabricTraceEnabled { - println( - "[fabric]", "sid", s.localSID, - "export pub tx", - "topic", wireTopicString(wire), - "retain", m.Retained, - "payload_len", strconvx.Itoa(len(payload)), - ) - } - if !s.sendRPC(marshal(protoPub{ - Type: msgPub, - Topic: wire, - Payload: payload, - Retain: m.Retained, - })) { + sent, ok := s.sendExportMessage(m) + if !ok { return } - total++ + if sent { + total++ + } default: goto nextSub } diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index b90aea3..a9001f1 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -14,6 +14,8 @@ import ( const transferTargetUpdaterMain = "updater/main" const transferIdleRetryLimit = 3 +const transferCorruptRetryLimit = 3 +const completedTransferCacheLimit = 4 // transferMeta captures xfer_begin contents. The transfer target is explicit // on the wire; firmware update uses target="updater/main". meta remains opaque @@ -33,6 +35,7 @@ type transferMeta struct { type transferInfo struct { BytesWritten uint32 SlotXIPAddr uint32 + Generation uint64 } // transferSink is the firmware-side write target for an incoming transfer. 
@@ -54,18 +57,32 @@ type transferSink interface { } type incomingTransfer struct { - meta transferMeta - sink transferSink - bytesWritten uint32 - chunksSeen uint32 - hasher *xxhash.Hasher - idleRetries uint8 + meta transferMeta + sink transferSink + bytesWritten uint32 + chunksSeen uint32 + hasher *xxhash.Hasher + idleRetries uint8 + corruptRetryOffset uint32 + corruptRetriesAtOffset uint8 // deadline is the idle-chunk watchdog: bumped on every accepted chunk // and on initial xfer_begin. checkTransferTimeout fires if now > deadline. // Mirrors transfer_mgr.lua: `active.deadline = runtime.now() + phase_timeout`. deadline time.Time } +type completedTransfer struct { + meta transferMeta +} + +func sameTransferTuple(a, b transferMeta) bool { + return a.ID == b.ID && + a.Target == b.Target && + a.Size == b.Size && + a.DigestAlg == b.DigestAlg && + a.Digest == b.Digest +} + func lowerHex(s string) string { return strings.ToLower(strings.TrimSpace(s)) } @@ -174,6 +191,54 @@ func (s *session) checkTransferTimeout(now time.Time) { s.sendTransferAbort(id, "timeout") } +func (s *session) retryCorruptTransferFrame(reason string) bool { + cur := s.incomingTransfer + if cur == nil { + return false + } + if cur.corruptRetryOffset != cur.bytesWritten { + cur.corruptRetryOffset = cur.bytesWritten + cur.corruptRetriesAtOffset = 0 + } + if cur.corruptRetriesAtOffset >= transferCorruptRetryLimit { + id := cur.meta.ID + s.abortTransfer(reason) + s.sendTransferAbort(id, reason) + return false + } + cur.corruptRetriesAtOffset++ + s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) + cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) + return true +} + +func (s *session) completedTransferFor(id string) (transferMeta, bool) { + for _, rec := range s.completedTransfers { + if rec.meta.ID == id { + return rec.meta, true + } + } + return transferMeta{}, false +} + +func (s *session) recordCompletedTransfer(meta transferMeta) { + for i, rec := range s.completedTransfers { + if rec.meta.ID == 
meta.ID { + s.completedTransfers = append(s.completedTransfers[:i], s.completedTransfers[i+1:]...) + break + } + } + s.completedTransfers = append(s.completedTransfers, completedTransfer{meta: meta}) + if len(s.completedTransfers) > completedTransferCacheLimit { + copy(s.completedTransfers, s.completedTransfers[len(s.completedTransfers)-completedTransferCacheLimit:]) + s.completedTransfers = s.completedTransfers[:completedTransferCacheLimit] + } +} + +func (s *session) clearCompletedTransfers() { + s.completedTransfers = nil +} + func validateTransferBegin(msg *protoXferBegin) (transferMeta, string) { if msg.XferID == "" { return transferMeta{}, "xfer_begin.xfer_id" @@ -214,21 +279,31 @@ func (s *session) onTransferBegin(msg *protoXferBegin) { s.logKV("xfer_begin dropped", "err", errStr) return } + s.markRx() + now := time.Now() if s.incomingTransfer != nil { cur := s.incomingTransfer - if cur.meta.ID == meta.ID && - cur.meta.Size == meta.Size && - cur.meta.Target == meta.Target && - cur.meta.DigestAlg == meta.DigestAlg && - cur.meta.Digest == meta.Digest { + if sameTransferTuple(cur.meta, meta) { s.logKV("xfer_begin duplicate", "id", meta.ID) if s.sendTransferReady(meta.ID) { s.sendTransferNeed(meta.ID, cur.bytesWritten) } - cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) + cur.deadline = now.Add(s.cfg.PhaseTimeout) + return + } + reason := "busy" + if cur.meta.ID == meta.ID { + reason = "conflicting_transfer" + } + s.sendTransferAbort(meta.ID, reason) + return + } + if done, ok := s.completedTransferFor(meta.ID); ok { + if sameTransferTuple(done, meta) { + s.sendTransferDone(meta.ID) return } - s.sendTransferAbort(meta.ID, "busy") + s.sendTransferAbort(meta.ID, "conflicting_transfer") return } beginFn := s.beginTransfer @@ -244,7 +319,7 @@ func (s *session) onTransferBegin(msg *protoXferBegin) { meta: meta, sink: sink, hasher: xxhash.New(0), - deadline: time.Now().Add(s.cfg.PhaseTimeout), + deadline: now.Add(s.cfg.PhaseTimeout), } if 
s.sendTransferReady(meta.ID) { s.sendTransferNeed(meta.ID, 0) @@ -259,19 +334,20 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { } id := cur.meta.ID if msg.Offset < cur.bytesWritten { + s.markRx() s.sendTransferNeed(id, cur.bytesWritten) cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) return } if msg.Offset > cur.bytesWritten { + s.markRx() s.sendTransferNeed(id, cur.bytesWritten) return } raw, errStr := decodeChunkData(msg.Data) if errStr != "" { s.logKV("xfer_chunk decode retry", "err", errStr) - s.sendTransferNeed(id, cur.bytesWritten) - cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) + s.retryCorruptTransferFrame(errStr) return } if len(raw) == 0 { @@ -292,32 +368,15 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { // byte offset instead of clearing the transfer. want, ok := canonicalXXHash32Hex(msg.ChunkDigest) if !ok { - reason := "invalid_chunk_digest" - if msg.ChunkDigest == "" { - reason = "missing_chunk_digest" - } - println( - "[fabric-xfer]", "abort_tx", - "id", id, - "reason", reason, - "offset", u32s(msg.Offset), - "digest_len", strconvx.Itoa(len(msg.ChunkDigest)), - "digest", msg.ChunkDigest, - "data_len", strconvx.Itoa(len(msg.Data)), - ) - s.abortTransfer(reason) - s.sendTransferAbort(id, reason) + s.retryCorruptTransferFrame("bad_message") return } got := xxhashHex(xxhash.Sum32(raw, 0)) if got != want { - s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) - // Recovery counts as progress — bump the deadline so a burst - // of digest-mismatched chunks doesn't trip the idle watchdog - // mid-recovery. 
- cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) + s.retryCorruptTransferFrame("chunk_digest_mismatch") return } + s.markRx() if err := cur.sink.WriteChunk(msg.Offset, raw); err != nil { reason := err.Error() s.logKV("transfer write failed", "err", reason) @@ -329,6 +388,8 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { cur.bytesWritten += uint32(len(raw)) cur.chunksSeen++ cur.idleRetries = 0 + cur.corruptRetryOffset = cur.bytesWritten + cur.corruptRetriesAtOffset = 0 cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) raw = nil // Keep transfer memory bounded on TinyGo. The receiver allocates while @@ -368,7 +429,8 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { s.sendTransferAbort(id, "digest_mismatch") return } - _, err := cur.sink.Commit() + s.markRx() + info, err := cur.sink.Commit() if err != nil { s.logKV("transfer commit failed", "err", err.Error()) reason := err.Error() @@ -382,35 +444,37 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { s.clearTransfer() bytesPayload := sink.Bytes() - ok, reason := s.invokeTransferTarget(meta, id, bytesPayload) + ok, reason := s.invokeTransferTarget(meta, id, info, bytesPayload) if !ok { s.extendTransferQuiet("xfer_target_rejected", transferCompleteQuiet) s.sendTransferAbort(id, reason) return } s.extendTransferQuiet("xfer_done", transferCompleteQuiet) + s.recordCompletedTransfer(meta) s.sendTransferDone(id) } -const targetCallTimeout = 5 * time.Second +var targetCallTimeout = 5 * time.Second // invokeTransferTarget calls the local updater staging RPC named by // xfer_begin.target. The wire no longer carries raw/member receiver topics; // target="updater/main" maps to an internal bus RPC owned by the updater // service. The reply gates whether fabric sends xfer_done or xfer_abort. 
-func (s *session) invokeTransferTarget(meta transferMeta, xferID string, artefact []byte) (bool, string) { +func (s *session) invokeTransferTarget(meta transferMeta, xferID string, info transferInfo, artefact []byte) (bool, string) { if meta.Target != transferTargetUpdaterMain { return false, "unsupported_target" } payload := updater.StagePayload{ - LinkID: s.linkID, - XferID: xferID, - Target: meta.Target, - Size: meta.Size, - DigestAlg: meta.DigestAlg, - Digest: meta.Digest, - Meta: meta.Meta, - Artefact: artefact, + LinkID: s.linkID, + XferID: xferID, + Generation: info.Generation, + Target: meta.Target, + Size: meta.Size, + DigestAlg: meta.DigestAlg, + Digest: meta.Digest, + Meta: meta.Meta, + Artefact: artefact, } msg := s.conn.NewMessage(updater.TopicStageRPC, payload, false) replySub := s.conn.Request(msg) @@ -419,14 +483,17 @@ func (s *session) invokeTransferTarget(meta transferMeta, xferID string, artefac select { case rep, ok := <-replySub.Channel(): if !ok || rep == nil { + updater.CancelStreamedStage(xferID, info.Generation, "stage_no_reply") return false, "stage_no_reply" } ok, reason := decodeStageReply(rep.Payload) if !ok { + updater.CancelStreamedStage(xferID, info.Generation, reason) return false, reason } return true, "" case <-time.After(targetCallTimeout): + updater.CancelStreamedStage(xferID, info.Generation, "stage_timeout") return false, "stage_timeout" } } @@ -495,6 +562,7 @@ func (s *session) onTransferAbort(msg *protoXferAbort) { if reason == "" { reason = "remote_abort" } + s.markRx() s.abortTransfer(reason) } diff --git a/services/fabric/transfer_sink_buffer.go b/services/fabric/transfer_sink_buffer.go index 9cf3832..c012c66 100644 --- a/services/fabric/transfer_sink_buffer.go +++ b/services/fabric/transfer_sink_buffer.go @@ -1,33 +1,42 @@ package fabric -import "errors" +import ( + "errors" -// bufferSink is the default transferSink for the fabric-update branch: -// it buffers the verified-by-wire (xxHash32) artefact in RAM and exposes 
-// the bytes via Bytes() so onTransferCommit can hand them to the -// updater/main staging RPC. The updater is responsible for signed-image -// verification and staging. + "devicecode-go/services/updater" +) + +// bufferSink is the default in-memory transferSink: it buffers the +// verified-by-wire (xxHash32) artefact in RAM and exposes the bytes via +// Bytes() so onTransferCommit can hand them to the updater/main staging +// RPC. The updater is responsible for signed-image verification and staging. // // Size cap is deliberately conservative: the smoke tests target small -// artefacts and large firmware images need a streaming-into-flash -// sink, which is fabric-security's job. Hitting the cap aborts the -// transfer cleanly via WriteChunk -> ErrArtefactTooLarge. +// artefacts and large firmware images need a streaming-into-flash sink. +// Hitting the cap aborts the transfer cleanly via WriteChunk -> +// ErrArtefactTooLarge. const maxArtefactBytes = 64 * 1024 var ErrArtefactTooLarge = errors.New("artefact_too_large") type bufferSink struct { - meta transferMeta - buf []byte - closed bool - committed bool + meta transferMeta + generation uint64 + buf []byte + closed bool + committed bool } -func newBufferSink(meta transferMeta) *bufferSink { - return &bufferSink{ - meta: meta, - buf: make([]byte, 0, sizeHint(meta.Size)), +func newBufferSink(meta transferMeta) (*bufferSink, error) { + generation, err := updater.BeginStreamedStage(meta.ID, meta.Size) + if err != nil { + return nil, err } + return &bufferSink{ + meta: meta, + generation: generation, + buf: make([]byte, 0, sizeHint(meta.Size)), + }, nil } func sizeHint(announced uint32) int { @@ -55,18 +64,24 @@ func (s *bufferSink) Commit() (transferInfo, error) { if s.closed { return transferInfo{}, errors.New("sink_closed") } + if s.generation != 0 { + if err := updater.CommitBufferedStage(s.meta.ID, s.generation); err != nil { + return transferInfo{}, err + } + } s.committed = true - return 
transferInfo{BytesWritten: uint32(len(s.buf))}, nil + return transferInfo{BytesWritten: uint32(len(s.buf)), Generation: s.generation}, nil } // Apply is a no-op for the buffer sink — the staged-image apply // (slot switch + reboot) belongs to the updater's commit RPC, not to -// fabric's transfer state machine. fabric-security wires the real -// apply path through `cap/self/updater/main/rpc/commit-update`. +// fabric's transfer state machine. func (s *bufferSink) Apply() error { return nil } func (s *bufferSink) Abort(reason string) error { - _ = reason + if s.generation != 0 { + updater.AbortStreamedStage(s.meta.ID, s.generation, reason) + } s.buf = nil s.closed = true return nil diff --git a/services/fabric/transfer_sink_rp2350.go b/services/fabric/transfer_sink_rp2350.go index 567392a..56ba8de 100644 --- a/services/fabric/transfer_sink_rp2350.go +++ b/services/fabric/transfer_sink_rp2350.go @@ -9,15 +9,18 @@ import ( ) type streamedStageSink struct { - accepted uint32 - closed bool + xferID string + generation uint64 + accepted uint32 + closed bool } func beginTransfer(meta transferMeta) (transferSink, error) { - if err := updater.BeginStreamedStage(meta.Size); err != nil { + generation, err := updater.BeginStreamedStage(meta.ID, meta.Size) + if err != nil { return nil, err } - return &streamedStageSink{}, nil + return &streamedStageSink{xferID: meta.ID, generation: generation}, nil } func (s *streamedStageSink) WriteChunk(off uint32, data []byte) error { @@ -27,7 +30,7 @@ func (s *streamedStageSink) WriteChunk(off uint32, data []byte) error { if s.accepted != off { return errors.New("unexpected_offset") } - if err := updater.WriteStreamedStage(data); err != nil { + if err := updater.WriteStreamedStage(s.xferID, s.generation, data); err != nil { return err } s.accepted += uint32(len(data)) @@ -38,24 +41,24 @@ func (s *streamedStageSink) Commit() (transferInfo, error) { if s.closed { return transferInfo{}, errors.New("sink_closed") } - written, err := 
updater.CommitStreamedStage() + written, err := updater.CommitStreamedStage(s.xferID, s.generation) if err != nil { return transferInfo{}, err } s.closed = true - return transferInfo{BytesWritten: written}, nil + return transferInfo{BytesWritten: written, Generation: s.generation}, nil } func (s *streamedStageSink) Apply() error { return nil } func (s *streamedStageSink) Abort(reason string) error { - _ = reason - updater.AbortStreamedStage() + updater.AbortStreamedStage(s.xferID, s.generation, reason) s.closed = true return nil } -// Bytes returns nil because the TinyGo RP2350 default path streams directly -// into the inactive slot. fabric still calls updater/main staging; the updater -// consumes the pre-staged descriptor instead of an in-RAM artefact. +// Bytes returns nil because the TinyGo RP2350 default path verifies the signed +// container while streaming and writes only the authenticated payload into the +// inactive slot. fabric still calls updater/main staging; the updater consumes +// the verified staged descriptor instead of an in-RAM artefact. func (s *streamedStageSink) Bytes() []byte { return nil } diff --git a/services/fabric/transfer_sink_stub.go b/services/fabric/transfer_sink_stub.go index f07a074..9554cff 100644 --- a/services/fabric/transfer_sink_stub.go +++ b/services/fabric/transfer_sink_stub.go @@ -7,5 +7,5 @@ package fabric func beginTransfer(meta transferMeta) (transferSink, error) { - return newBufferSink(meta), nil + return newBufferSink(meta) } diff --git a/services/fabric/transfer_test.go b/services/fabric/transfer_test.go index d3e9bcf..3eae5f7 100644 --- a/services/fabric/transfer_test.go +++ b/services/fabric/transfer_test.go @@ -4,6 +4,8 @@ import ( "context" "encoding/base64" "encoding/json" + "errors" + "io" "strings" "testing" "time" @@ -56,6 +58,28 @@ func (s *fakeTransferSink) Abort(reason string) error { // of the transferred bytes — it tracks per-chunk writes instead. 
func (s *fakeTransferSink) Bytes() []byte { return nil } +type blockingVerifier struct { + entered chan struct{} + release chan struct{} + manifest updater.Manifest +} + +func (v *blockingVerifier) Verify(r io.Reader, sink updater.SlotSink) (updater.Manifest, error) { + select { + case <-v.entered: + default: + close(v.entered) + } + <-v.release + if _, err := io.Copy(sink, r); err != nil { + return updater.Manifest{}, err + } + if err := sink.Commit(); err != nil { + return updater.Manifest{}, err + } + return v.manifest, nil +} + func runSessionWithSink(ctx context.Context, tr Transport, conn *bus.Connection, sink *fakeTransferSink) { s := session{ linkID: defaultLinkID, @@ -136,6 +160,88 @@ func installStageResponder(t *testing.T, b *bus.Bus, reply updater.StageReply) < return got } +func runUpdaterForFabricTest(t *testing.T, b *bus.Bus, opts updater.Options) (context.CancelFunc, *updater.Service) { + t.Helper() + if opts.Conn == nil { + opts.Conn = b.NewConnection("updater") + } + if opts.Identity.Version == "" { + opts.Identity = updater.Identity{Version: "0.0.0-test", Build: "build-test", ImageID: "img-test"} + } + probeConn := b.NewConnection("updater-probe") + probe := probeConn.Subscribe(updater.TopicSoftwareFact) + defer probeConn.Unsubscribe(probe) + svc := updater.New(opts) + ctx, cancel := context.WithCancel(context.Background()) + go svc.Run(ctx) + select { + case msg := <-probe.Channel(): + if msg == nil { + cancel() + t.Fatal("nil initial updater software fact") + } + case <-time.After(2 * time.Second): + cancel() + t.Fatal("timeout waiting for updater service start") + } + return cancel, svc +} + +func prepareUpdaterForFabricTest(t *testing.T, conn *bus.Connection) { + t.Helper() + msg := conn.NewMessage(updater.TopicPrepareRPC, updater.PrepareRequest{Target: updater.PrepareTargetMCU}, false) + sub := conn.Request(msg) + defer conn.Unsubscribe(sub) + select { + case rep := <-sub.Channel(): + if rep == nil { + t.Fatal("nil prepare reply") + } + 
reply, ok := rep.Payload.(updater.PrepareReply) + if !ok || !reply.Ready { + t.Fatalf("prepare reply = %#v, want ready", rep.Payload) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for prepare reply") + } +} + +func waitUpdaterFactForFabricTest(t *testing.T, sub *bus.Subscription, want func(updater.UpdaterFact) bool) updater.UpdaterFact { + t.Helper() + deadline := time.After(2 * time.Second) + for { + select { + case msg := <-sub.Channel(): + if msg == nil { + continue + } + fact, ok := msg.Payload.(updater.UpdaterFact) + if ok && (want == nil || want(fact)) { + return fact + } + case <-deadline: + t.Fatal("timeout waiting for updater fact") + } + } +} + +func requestUpdaterForFabricTest(t *testing.T, conn *bus.Connection, topic bus.Topic, payload any) any { + t.Helper() + msg := conn.NewMessage(topic, payload, false) + sub := conn.Request(msg) + defer conn.Unsubscribe(sub) + select { + case rep := <-sub.Channel(): + if rep == nil { + t.Fatal("nil updater reply") + } + return rep.Payload + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for updater reply") + } + return nil +} + func readTransferReady(t *testing.T, tr Transport, id string, next uint32) { t.Helper() ready := readMsg[protoXferReady](t, tr) @@ -148,6 +254,178 @@ func readTransferReady(t *testing.T, tr Transport, id string, next uint32) { } } +func readTransferNeed(t *testing.T, tr Transport, id string, next uint32) { + t.Helper() + need := readMsg[protoXferNeed](t, tr) + if need.Type != msgXferNeed || need.XferID != id || need.Next != next { + t.Fatalf("bad xfer_need: %+v, want id=%s next=%d", need, id, next) + } +} + +func readTransferAbort(t *testing.T, tr Transport, id, reason string) { + t.Helper() + abort := readMsg[protoXferAbort](t, tr) + if abort.Type != msgXferAbort || abort.XferID != id || abort.Err != reason { + t.Fatalf("bad xfer_abort: %+v, want id=%s err=%s", abort, id, reason) + } +} + +func writeRawLine(t *testing.T, tr Transport, line string) { + 
t.Helper() + if err := tr.WriteLine([]byte(line)); err != nil { + t.Fatalf("WriteLine: %v", err) + } +} + +func TestTransferBeginWithoutPrepareAbortsNoReady(t *testing.T) { + b := newBus() + cancelUpdater, _ := runUpdaterForFabricTest(t, b, updater.Options{}) + defer cancelUpdater() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-no-prepare", payload, nil)) + abort := readMsg[protoXferAbort](t, cm5) + if abort.Type != msgXferAbort || abort.XferID != "xfer-no-prepare" || abort.Err != "stage_not_prepared" { + t.Fatalf("xfer_begin without prepare frame = %+v, want stage_not_prepared abort", abort) + } +} + +func TestPreparedTransferBeginSendsReadyThenNeedZero(t *testing.T) { + b := newBus() + cancelUpdater, _ := runUpdaterForFabricTest(t, b, updater.Options{}) + defer cancelUpdater() + caller := b.NewConnection("caller") + observer := b.NewConnection("observer") + upSub := observer.Subscribe(updater.TopicUpdaterFact) + defer observer.Unsubscribe(upSub) + prepareUpdaterForFabricTest(t, caller) + + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-prepared", payload, nil)) + readTransferReady(t, cm5, "xfer-prepared", 0) + waitUpdaterFactForFabricTest(t, upSub, func(f updater.UpdaterFact) bool { + return f.State == updater.StateReceiving + }) +} + +func TestInvalidTransferBeginRejectsNoActiveTransfer(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + beginCount := 0 + s := session{ + linkID: defaultLinkID, + nodeID: "mcu", + peerID: "bigbox-cm5", + 
localSID: "mcu-sid-test", + tr: mcu, + conn: b.NewConnection("fabric"), + beginTransfer: func(meta transferMeta) (transferSink, error) { + beginCount++ + return &fakeTransferSink{}, nil + }, + } + go s.run(ctx) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, protoXferBegin{ + Type: msgXferBegin, + XferID: "xfer-invalid", + Target: "other/target", + Size: uint32(len(payload)), + DigestAlg: updater.DigestAlgXXHash32, + Digest: xxhashStr(payload), + }) + readTransferAbort(t, cm5, "xfer-invalid", "bad_message: unsupported_target") + if beginCount != 0 { + t.Fatalf("beginTransfer called %d times for invalid begin, want 0", beginCount) + } +} + +func TestTransferAbortCancelsUpdaterLease(t *testing.T) { + b := newBus() + cancelUpdater, _ := runUpdaterForFabricTest(t, b, updater.Options{}) + defer cancelUpdater() + caller := b.NewConnection("caller") + observer := b.NewConnection("observer") + upSub := observer.Subscribe(updater.TopicUpdaterFact) + defer observer.Unsubscribe(upSub) + prepareUpdaterForFabricTest(t, caller) + + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-abort-cancel", payload, nil)) + readTransferReady(t, cm5, "xfer-abort-cancel", 0) + sendMsg(t, cm5, protoXferAbort{Type: msgXferAbort, XferID: "xfer-abort-cancel", Err: "host_abort"}) + + fact := waitUpdaterFactForFabricTest(t, upSub, func(f updater.UpdaterFact) bool { + return f.State == updater.StateFailed + }) + if fact.LastError == nil || *fact.LastError != "host_abort" { + t.Fatalf("updater last_error = %v, want host_abort", fact.LastError) + } +} + +func TestTransferTargetRejectCancelsLeaseAndPreventsCommit(t *testing.T) { + b := newBus() + memMD := updater.NewMemoryMetadata() + cancelUpdater, _ := runUpdaterForFabricTest(t, b, updater.Options{ + Verifier: 
updater.StubVerifier(), + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancelUpdater() + caller := b.NewConnection("caller") + prepareUpdaterForFabricTest(t, caller) + + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-target-reject", payload, nil)) + readTransferReady(t, cm5, "xfer-target-reject", 0) + sendMsg(t, cm5, xferChunk("xfer-target-reject", 0, payload)) + _ = readMsg[protoXferNeed](t, cm5) + sendMsg(t, cm5, xferCommit("xfer-target-reject", payload)) + + abort := readMsg[protoXferAbort](t, cm5) + if abort.XferID != "xfer-target-reject" || !strings.Contains(abort.Err, "verifier_stub") { + t.Fatalf("xfer_abort = %+v, want verifier_stub rejection", abort) + } + if _, ok := memMD.StagedDescriptor(); ok { + t.Fatal("stage rejection left a staged descriptor") + } + + replyPayload := requestUpdaterForFabricTest(t, caller, updater.TopicCommitRPC, updater.CommitRequest{}) + reply, ok := replyPayload.(updater.Reply) + if !ok || reply.OK || reply.Error != updater.ErrNothingStaged { + t.Fatalf("commit after rejected transfer = %#v, want nothing_staged", replyPayload) + } +} + func TestTransferBeginPreservesMeta(t *testing.T) { // xfer_begin's meta is opaque to fabric-protocol but must be preserved // for updater/main staging diagnostics. 
@@ -270,6 +548,27 @@ func TestTransferReceiveSuccess(t *testing.T) { } } +func TestTransferAcceptedChunkAdvancesNeed(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-chunk-diag", payload, nil)) + readTransferReady(t, cm5, "xfer-chunk-diag", 0) + + sendMsg(t, cm5, xferChunk("xfer-chunk-diag", 0, payload)) + need := readMsg[protoXferNeed](t, cm5) + if need.Next != uint32(len(payload)) { + t.Fatalf("xfer_need.next = %d, want %d", need.Next, len(payload)) + } +} + func TestTransferChunkFutureOffsetRequestsCurrentAndCompletes(t *testing.T) { b := newBus() cm5, mcu := pipePair() @@ -410,7 +709,7 @@ func TestTransferChunkDecodeFailureRequestsSameOffset(t *testing.T) { } } -func TestTransferChunkMissingDigestAborts(t *testing.T) { +func TestTransferChunkMissingDigestRetriesThenAborts(t *testing.T) { b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -424,22 +723,64 @@ func TestTransferChunkMissingDigestAborts(t *testing.T) { sendMsg(t, cm5, xferBegin("xfer-missing-digest", payload, nil)) readTransferReady(t, cm5, "xfer-missing-digest", 0) + for i := 0; i < transferCorruptRetryLimit; i++ { + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-missing-digest", + Offset: 0, + Data: rawURL(payload), + }) + readTransferNeed(t, cm5, "xfer-missing-digest", 0) + } sendMsg(t, cm5, protoXferChunk{ Type: msgXferChunk, XferID: "xfer-missing-digest", Offset: 0, Data: rawURL(payload), }) - - abort := readMsg[protoXferAbort](t, cm5) - if abort.Err != "missing_chunk_digest" { - t.Fatalf("bad xfer_abort: %+v", abort) - } + readTransferAbort(t, cm5, "xfer-missing-digest", "bad_message") if len(sink.abortReasons) == 0 { t.Fatal("expected sink.Abort on missing chunk 
digest") } } +func TestTransferChunkInvalidBase64RetriesThenAborts(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-bad-b64", payload, nil)) + readTransferReady(t, cm5, "xfer-bad-b64", 0) + + for i := 0; i < transferCorruptRetryLimit; i++ { + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-bad-b64", + Offset: 0, + Data: "!!!not-base64!!!", + ChunkDigest: xxhashStr(payload), + }) + readTransferNeed(t, cm5, "xfer-bad-b64", 0) + } + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-bad-b64", + Offset: 0, + Data: "!!!not-base64!!!", + ChunkDigest: xxhashStr(payload), + }) + readTransferAbort(t, cm5, "xfer-bad-b64", "invalid_chunk_encoding") + if len(sink.abortReasons) == 0 || sink.abortReasons[0] != "invalid_chunk_encoding" { + t.Fatalf("sink.Abort reasons = %v, want invalid_chunk_encoding", sink.abortReasons) + } +} + func TestTransferChunkDigestMismatchRequestsSameOffset(t *testing.T) { b := newBus() cm5, mcu := pipePair() @@ -476,6 +817,264 @@ func TestTransferChunkDigestMismatchRequestsSameOffset(t *testing.T) { } } +func TestTransferChunkWriteErrorAborts(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{writeErr: errors.New("write_boom")} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-write-error", payload, nil)) + readTransferReady(t, cm5, "xfer-write-error", 0) + + sendMsg(t, cm5, xferChunk("xfer-write-error", 0, payload)) + readTransferAbort(t, cm5, "xfer-write-error", "write_boom") +} + +func TestTransferChunkDigestMismatchRetriesThenAborts(t 
*testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-bad-digest-budget", payload, nil)) + readTransferReady(t, cm5, "xfer-bad-digest-budget", 0) + + for i := 0; i < transferCorruptRetryLimit; i++ { + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-bad-digest-budget", + Offset: 0, + Data: rawURL(payload), + ChunkDigest: "00000000", + }) + readTransferNeed(t, cm5, "xfer-bad-digest-budget", 0) + } + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-bad-digest-budget", + Offset: 0, + Data: rawURL(payload), + ChunkDigest: "00000000", + }) + readTransferAbort(t, cm5, "xfer-bad-digest-budget", "chunk_digest_mismatch") +} + +func TestTransferMalformedCurrentChunkJSONRetriesThenAborts(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + id := "xfer-malformed-json" + sendMsg(t, cm5, xferBegin(id, payload, nil)) + readTransferReady(t, cm5, id, 0) + + line := `{"type":"xfer_chunk","xfer_id":"` + id + `","offset":0,"data":"` + rawURL(payload) + `","chunk_digest":"` + xxhashStr(payload) + `","extra":true}` + for i := 0; i < transferCorruptRetryLimit; i++ { + writeRawLine(t, cm5, line) + readTransferNeed(t, cm5, id, 0) + } + writeRawLine(t, cm5, line) + readTransferAbort(t, cm5, id, "bad_message") +} + +func TestTransferMalformedWrongXferIDDoesNotChargeActiveTransfer(t *testing.T) { + payload := []byte("abcd") + activeID := "xfer-active-malformed" + cases := []struct { + name string + line string + }{ + { + name: "wrong_id", + line: 
`{"type":"xfer_chunk","xfer_id":"xfer-other","offset":0,"data":"` + rawURL(payload) + `","chunk_digest":"` + xxhashStr(payload) + `","extra":true}`, + }, + { + name: "missing_id", + line: `{"type":"xfer_chunk","offset":0,"data":"` + rawURL(payload) + `","chunk_digest":"` + xxhashStr(payload) + `","extra":true}`, + }, + { + name: "unreadable_id", + line: `{"type":"xfer_chunk","xfer_id":{"bad":true},"offset":0,"data":"` + rawURL(payload) + `","chunk_digest":"` + xxhashStr(payload) + `","extra":true}`, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + sink := &fakeTransferSink{} + tr := &captureTransport{} + s := &session{ + link: linkUp, + tr: tr, + incomingTransfer: &incomingTransfer{ + meta: transferMeta{ID: activeID}, + sink: sink, + deadline: time.Now().Add(time.Second), + }, + } + + for i := 0; i < transferCorruptRetryLimit+1; i++ { + s.dispatch([]byte(tc.line)) + } + if len(tr.writes) != 0 { + t.Fatalf("malformed non-current xfer_chunk emitted %d frames, want none", len(tr.writes)) + } + if s.incomingTransfer == nil { + t.Fatal("malformed non-current xfer_chunk cleared active transfer") + } + if got := s.incomingTransfer.corruptRetriesAtOffset; got != 0 { + t.Fatalf("corrupt retries at offset = %d, want 0", got) + } + if len(sink.abortReasons) != 0 { + t.Fatalf("sink aborted for non-current malformed chunk: %v", sink.abortReasons) + } + }) + } +} + +func TestTransferCorruptRetryBudgetResetsAfterAcceptedProgress(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + installStageResponder(t, b, updater.StageReply{OK: true, Stage: "staged"}) + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + id := "xfer-retry-reset" + payload := []byte("abcdef") + first := []byte("abc") + second := []byte("def") + sendMsg(t, cm5, xferBegin(id, payload, nil)) + readTransferReady(t, cm5, id, 0) + + for i := 
0; i < transferCorruptRetryLimit-1; i++ { + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: id, + Offset: 0, + Data: rawURL(first), + ChunkDigest: "00000000", + }) + readTransferNeed(t, cm5, id, 0) + } + sendMsg(t, cm5, xferChunk(id, 0, first)) + readTransferNeed(t, cm5, id, uint32(len(first))) + + for i := 0; i < transferCorruptRetryLimit; i++ { + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: id, + Offset: uint32(len(first)), + Data: rawURL(second), + ChunkDigest: "00000000", + }) + readTransferNeed(t, cm5, id, uint32(len(first))) + } + sendMsg(t, cm5, xferChunk(id, uint32(len(first)), second)) + readTransferNeed(t, cm5, id, uint32(len(payload))) + sendMsg(t, cm5, xferCommit(id, payload)) + done := readMsg[protoXferDone](t, cm5) + if done.Type != msgXferDone || done.XferID != id { + t.Fatalf("bad xfer_done: %+v", done) + } +} + +func TestTransferFutureOffsetDoesNotResetCorruptRetryBudget(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + id := "xfer-future-no-reset" + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin(id, payload, nil)) + readTransferReady(t, cm5, id, 0) + + for i := 0; i < transferCorruptRetryLimit; i++ { + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: id, + Offset: 0, + Data: rawURL(payload), + ChunkDigest: "00000000", + }) + readTransferNeed(t, cm5, id, 0) + } + sendMsg(t, cm5, xferChunk(id, 99, payload)) + readTransferNeed(t, cm5, id, 0) + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: id, + Offset: 0, + Data: rawURL(payload), + ChunkDigest: "00000000", + }) + readTransferAbort(t, cm5, id, "chunk_digest_mismatch") +} + +func TestTransferStaleOffsetDoesNotResetCorruptRetryBudget(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := 
context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + id := "xfer-stale-no-reset" + payload := []byte("abcdef") + first := []byte("abc") + second := []byte("def") + sendMsg(t, cm5, xferBegin(id, payload, nil)) + readTransferReady(t, cm5, id, 0) + sendMsg(t, cm5, xferChunk(id, 0, first)) + readTransferNeed(t, cm5, id, uint32(len(first))) + + for i := 0; i < transferCorruptRetryLimit; i++ { + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: id, + Offset: uint32(len(first)), + Data: rawURL(second), + ChunkDigest: "00000000", + }) + readTransferNeed(t, cm5, id, uint32(len(first))) + } + sendMsg(t, cm5, xferChunk(id, 0, first)) + readTransferNeed(t, cm5, id, uint32(len(first))) + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: id, + Offset: uint32(len(first)), + Data: rawURL(second), + ChunkDigest: "00000000", + }) + readTransferAbort(t, cm5, id, "chunk_digest_mismatch") +} + func TestTransferChunkSizeOverflowAborts(t *testing.T) { b := newBus() cm5, mcu := pipePair() @@ -572,7 +1171,7 @@ func TestTransferTargetInvokedAfterCommit(t *testing.T) { gotPayload := installStageResponder(t, b, updater.StageReply{OK: true, Stage: "staged"}) - sink := &bufferingSinkAdapter{bufferSink: newBufferSink(transferMeta{Size: 4})} + sink := &bufferingSinkAdapter{bufferSink: &bufferSink{meta: transferMeta{Size: 4}, buf: make([]byte, 0, 4)}} s := session{ linkID: defaultLinkID, nodeID: "mcu", @@ -623,6 +1222,111 @@ func TestTransferTargetInvokedAfterCommit(t *testing.T) { } } +func TestCompletedTransferDuplicateBeginSameTupleReplaysDone(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + stageCalls := installStageResponder(t, b, updater.StageReply{OK: true, Stage: "staged"}) + sink := &fakeTransferSink{} + beginCount := 0 + s := session{ + linkID: 
defaultLinkID, + nodeID: "mcu", + peerID: "bigbox-cm5", + localSID: "mcu-sid-test", + tr: mcu, + conn: b.NewConnection("fabric"), + beginTransfer: func(meta transferMeta) (transferSink, error) { + beginCount++ + return sink, nil + }, + } + go s.run(ctx) + bringUp(t, cm5) + + id := "xfer-completed-replay" + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin(id, payload, nil)) + readTransferReady(t, cm5, id, 0) + sendMsg(t, cm5, xferChunk(id, 0, payload)) + readTransferNeed(t, cm5, id, uint32(len(payload))) + sendMsg(t, cm5, xferCommit(id, payload)) + select { + case p := <-stageCalls: + if p.XferID != id { + t.Fatalf("stage xfer_id = %q, want %q", p.XferID, id) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for first stage call") + } + done := readMsg[protoXferDone](t, cm5) + if done.Type != msgXferDone || done.XferID != id { + t.Fatalf("bad xfer_done: %+v", done) + } + if beginCount != 1 { + t.Fatalf("beginTransfer calls after first completion = %d, want 1", beginCount) + } + + sendMsg(t, cm5, xferBegin(id, payload, nil)) + done = readMsg[protoXferDone](t, cm5) + if done.Type != msgXferDone || done.XferID != id { + t.Fatalf("duplicate begin response = %+v, want xfer_done", done) + } + if beginCount != 1 { + t.Fatalf("duplicate completed begin reopened sink: beginCount=%d", beginCount) + } + select { + case p := <-stageCalls: + t.Fatalf("duplicate completed begin restaged transfer: %+v", p) + default: + } +} + +func TestCompletedTransferDuplicateBeginConflictingTupleAborts(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + installStageResponder(t, b, updater.StageReply{OK: true, Stage: "staged"}) + sink := &fakeTransferSink{} + beginCount := 0 + s := session{ + linkID: defaultLinkID, + nodeID: "mcu", + peerID: "bigbox-cm5", + localSID: "mcu-sid-test", + tr: mcu, + conn: b.NewConnection("fabric"), + beginTransfer: func(meta transferMeta) (transferSink, error) 
{ + beginCount++ + return sink, nil + }, + } + go s.run(ctx) + bringUp(t, cm5) + + id := "xfer-completed-conflict" + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin(id, payload, nil)) + readTransferReady(t, cm5, id, 0) + sendMsg(t, cm5, xferChunk(id, 0, payload)) + readTransferNeed(t, cm5, id, uint32(len(payload))) + sendMsg(t, cm5, xferCommit(id, payload)) + done := readMsg[protoXferDone](t, cm5) + if done.Type != msgXferDone || done.XferID != id { + t.Fatalf("bad xfer_done: %+v", done) + } + + sendMsg(t, cm5, xferBegin(id, []byte("abcde"), nil)) + readTransferAbort(t, cm5, id, "conflicting_transfer") + if beginCount != 1 { + t.Fatalf("conflicting completed begin reopened sink: beginCount=%d", beginCount) + } +} + func TestTransferTargetRejectAbortsTransfer(t *testing.T) { // updater/main stage replies {ok=false, err=...}. fabric must send // xfer_abort with the stage reason rather than xfer_done. @@ -633,7 +1337,7 @@ func TestTransferTargetRejectAbortsTransfer(t *testing.T) { _ = installStageResponder(t, b, updater.StageReply{OK: false, Err: "manifest_check_failed"}) - sink := &bufferingSinkAdapter{bufferSink: newBufferSink(transferMeta{Size: 4})} + sink := &bufferingSinkAdapter{bufferSink: &bufferSink{meta: transferMeta{Size: 4}, buf: make([]byte, 0, 4)}} s := session{ linkID: defaultLinkID, nodeID: "mcu", @@ -665,6 +1369,63 @@ func TestTransferTargetRejectAbortsTransfer(t *testing.T) { } } +func TestTransferTargetStageTimeoutCancelsLeaseAndPreventsLateStagePersist(t *testing.T) { + b := newBus() + memMD := updater.NewMemoryMetadata() + verif := &blockingVerifier{ + entered: make(chan struct{}), + release: make(chan struct{}), + manifest: updater.Manifest{ + Version: "9.9.9", + BuildID: "build-9.9.9", + ImageID: "mcu-dev-9.9.9", + PayloadSHA256: strings.Repeat("a", 64), + PayloadLength: 4, + }, + } + cancelUpdater, _ := runUpdaterForFabricTest(t, b, updater.Options{ + Verifier: verif, + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancelUpdater() 
+ caller := b.NewConnection("caller") + prepareUpdaterForFabricTest(t, caller) + + oldTimeout := targetCallTimeout + targetCallTimeout = 20 * time.Millisecond + defer func() { targetCallTimeout = oldTimeout }() + + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) + bringUp(t, cm5) + + id := "xfer-stage-timeout" + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin(id, payload, nil)) + readTransferReady(t, cm5, id, 0) + sendMsg(t, cm5, xferChunk(id, 0, payload)) + readTransferNeed(t, cm5, id, uint32(len(payload))) + sendMsg(t, cm5, xferCommit(id, payload)) + select { + case <-verif.entered: + case <-time.After(2 * time.Second): + t.Fatal("verifier did not start before stage timeout") + } + readTransferAbort(t, cm5, id, "stage_timeout") + if _, ok := memMD.StagedDescriptor(); ok { + t.Fatal("stage timeout persisted descriptor before verifier returned") + } + + close(verif.release) + time.Sleep(50 * time.Millisecond) + if _, ok := memMD.StagedDescriptor(); ok { + t.Fatal("late verifier completion after stage timeout persisted descriptor") + } +} + func TestTransferIdleChunkWatchdog(t *testing.T) { // transfer_mgr.lua refreshes active.deadline = now + phase_timeout on // each accepted chunk and aborts with reason="timeout" if the deadline diff --git a/services/fabric/writer.go b/services/fabric/writer.go index 4ca94df..37286d5 100644 --- a/services/fabric/writer.go +++ b/services/fabric/writer.go @@ -10,7 +10,7 @@ import "errors" // Lane assignment for outbound MCU frames mirrors protocol.lua's // FRAME_CLASS map. The MCU never originates xfer_chunk so the bulk lane // is currently unused on the MCU side; it is wired in for symmetry and -// for future fabric-update telemetry that may want to route bulk frames. +// for future MCU-originated bulk-frame users. 
type lane uint8 diff --git a/services/reactor/qa_reactor.go b/services/reactor/qa_reactor.go index c615063..5b4770c 100644 --- a/services/reactor/qa_reactor.go +++ b/services/reactor/qa_reactor.go @@ -7,9 +7,9 @@ import ( "runtime" "time" - "devicecode-go/utilities" "devicecode-go/bus" "devicecode-go/types" + "devicecode-go/utilities" "devicecode-go/x/shmring" "devicecode-go/x/strconvx" ) @@ -165,16 +165,26 @@ type Reactor struct { // telemetry drop counters (bytes) droppedUART0Bytes int + bootBuyRC int32 +} + +type Options struct { + BootBuyRC int32 } func NewReactor(b *bus.Bus, uiConn *bus.Connection) *Reactor { + return NewReactorWithOptions(b, uiConn, Options{}) +} + +func NewReactorWithOptions(b *bus.Bus, uiConn *bus.Connection, opts Options) *Reactor { return &Reactor{ - bus: b, - uiConn: uiConn, - levelUp: true, - state: stateOff, - now: time.Now(), - ledTick: 0, + bus: b, + uiConn: uiConn, + levelUp: true, + state: stateOff, + now: time.Now(), + bootBuyRC: opts.BootBuyRC, + ledTick: 0, } } @@ -436,7 +446,7 @@ func (r *Reactor) emitMemSnapshot() { } func (r *Reactor) Run(ctx context.Context) { -// Subscriptions (env + power) + // Subscriptions (env + power) log.Println("[main] subscribing env + power …") tempSub := r.uiConn.Subscribe(tTempValue) tempDieSub := r.uiConn.Subscribe(tDieTempValue) diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index 257eaa5..733c160 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -53,6 +53,42 @@ func waitFabricDone(done <-chan struct{}, timeout time.Duration) bool { } } +func waitForUpdaterCriticalFacts(ctx context.Context, conn *bus.Connection) bool { + if conn == nil { + return false + } + swSub := conn.Subscribe(updater.TopicSoftwareFact) + defer conn.Unsubscribe(swSub) + upSub := conn.Subscribe(updater.TopicUpdaterFact) + defer conn.Unsubscribe(upSub) + healthSub := conn.Subscribe(updater.TopicHealthFact) + defer conn.Unsubscribe(healthSub) + + softwareReady := false + 
updaterReady := false + healthReady := false + + for !(softwareReady && updaterReady && healthReady) { + select { + case <-ctx.Done(): + return false + case msg, ok := <-swSub.Channel(): + if ok && msg != nil && msg.Payload != nil { + softwareReady = true + } + case msg, ok := <-upSub.Channel(): + if ok && msg != nil && msg.Payload != nil { + updaterReady = true + } + case msg, ok := <-healthSub.Channel(): + if ok && msg != nil && msg.Payload != nil { + healthReady = true + } + } + } + return true +} + // ----------------------------------------------------------------------------- // Thresholds & timing // ----------------------------------------------------------------------------- @@ -197,20 +233,30 @@ type Reactor struct { ledTick int // throttles breathe commands // misc - now time.Time + now time.Time + bootBuyRC int32 // updater service handle used by the post-hello_ack republish hook. updater *updater.Service } +type Options struct { + BootBuyRC int32 +} + func NewReactor(b *bus.Bus, uiConn *bus.Connection) *Reactor { + return NewReactorWithOptions(b, uiConn, Options{}) +} + +func NewReactorWithOptions(b *bus.Bus, uiConn *bus.Connection, opts Options) *Reactor { return &Reactor{ - bus: b, - uiConn: uiConn, - levelUp: true, - state: stateOff, - now: time.Now(), - ledTick: 0, + bus: b, + uiConn: uiConn, + levelUp: true, + state: stateOff, + now: time.Now(), + bootBuyRC: opts.BootBuyRC, + ledTick: 0, } } @@ -445,13 +491,17 @@ func (r *Reactor) Run(ctx context.Context) { updaterConn := r.bus.NewConnection("updater") identity := firmwareIdentity() updaterSvc := updater.New(updater.Options{ - Conn: updaterConn, - Verifier: updater.PassthroughVerifier(identity), - Applier: updater.ProductionApplier(), - Identity: identity, + Conn: updaterConn, + Verifier: updater.PassthroughVerifier(identity), + Applier: updater.ProductionApplier(), + Identity: identity, + BootBuyRC: r.bootBuyRC, }) go updaterSvc.Run(ctx) r.updater = updaterSvc + if 
!waitForUpdaterCriticalFacts(ctx, r.bus.NewConnection("updater-ready")) { + return + } // Telemetry service: subscribes to HAL value topics and republishes // at state/self/* with integer engineering units; runs the charger diff --git a/services/reactor/reactor_test.go b/services/reactor/reactor_test.go index 5a9a1a3..34a7389 100644 --- a/services/reactor/reactor_test.go +++ b/services/reactor/reactor_test.go @@ -3,8 +3,12 @@ package reactor import ( + "context" "testing" "time" + + "devicecode-go/bus" + "devicecode-go/services/updater" ) func TestWaitFabricDoneNil(t *testing.T) { @@ -33,3 +37,70 @@ func TestWaitFabricDoneTimeout(t *testing.T) { t.Fatalf("timeout wait took too long: %s", elapsed) } } + +func TestNewReactorDefaultsBootBuyRCZero(t *testing.T) { + r := NewReactor(nil, nil) + if r.bootBuyRC != 0 { + t.Fatalf("bootBuyRC = %d, want 0", r.bootBuyRC) + } +} + +func TestNewReactorWithOptionsStoresBootBuyRC(t *testing.T) { + r := NewReactorWithOptions(nil, nil, Options{BootBuyRC: -42}) + if r.bootBuyRC != -42 { + t.Fatalf("bootBuyRC = %d, want -42", r.bootBuyRC) + } +} + +func TestWaitForUpdaterCriticalFactsRequiresAllThreeFacts(t *testing.T) { + b := bus.NewBus(16, "+", "#") + waitConn := b.NewConnection("wait") + pubConn := b.NewConnection("pub") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + done := make(chan bool, 1) + go func() { + done <- waitForUpdaterCriticalFacts(ctx, waitConn) + }() + + pubConn.Publish(pubConn.NewMessage( + updater.TopicSoftwareFact, + updater.SoftwareFact{ImageID: "img", Version: "1.0", BootID: "boot"}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + updater.TopicUpdaterFact, + updater.UpdaterFact{State: updater.StateRunning}, + true, + )) + select { + case got := <-done: + t.Fatalf("wait returned %t before health fact", got) + case <-time.After(20 * time.Millisecond): + } + + pubConn.Publish(pubConn.NewMessage( + updater.TopicHealthFact, + updater.HealthFact{State: "ok"}, + true, + )) + select 
{ + case got := <-done: + if !got { + t.Fatal("wait returned false after all critical facts") + } + case <-time.After(time.Second): + t.Fatal("timeout waiting for critical facts") + } +} + +func TestWaitForUpdaterCriticalFactsStopsOnContextCancel(t *testing.T) { + b := bus.NewBus(16, "+", "#") + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + if waitForUpdaterCriticalFacts(ctx, b.NewConnection("wait")) { + t.Fatal("wait returned true after context cancellation") + } +} diff --git a/services/updater/boot_id.go b/services/updater/boot_id.go index 295ed80..7bfc24e 100644 --- a/services/updater/boot_id.go +++ b/services/updater/boot_id.go @@ -7,7 +7,7 @@ import ( "time" ) -// boot_id contract per master plan R3 / docs/firmware-alignment-update.md §W6: +// boot_id contract: // - Opaque 16-character lower-hex marker that must change on every // successful boot. // - Generated from 8 bytes of crypto/rand AFTER HAL init succeeds and @@ -15,12 +15,11 @@ import ( // publish on hello_ack. // - Held in RAM only. Not persisted to flash. Not added to the // abupdate metadata block (the regression guard test in -// fabric-update tests checks that abupdate metadata never grows a -// boot_id field). +// tests check that abupdate metadata never grows a boot_id field). // // The fallback path on rand failure is documented inline; this branch // drops to a process-startup counter rather than panicking, with a -// clear log so the failure-mode test suite (master R3) can assert it. +// clear log so failure-mode tests can assert it. 
var ( cachedBootID atomic.Pointer[string] diff --git a/services/updater/facts.go b/services/updater/facts.go index d7c9536..a004e79 100644 --- a/services/updater/facts.go +++ b/services/updater/facts.go @@ -23,6 +23,13 @@ func strPtrOrNil(v string) *string { return &v } +func int32PtrOrNil(v int32) *int32 { + if v == 0 { + return nil + } + return &v +} + // PublishUpdater emits the retained state/self/updater fact with the // canonical {state, last_error, pending_version} shape. Called on // every state transition (via transitionTo) and as part of the post- @@ -36,6 +43,7 @@ func (s *Service) PublishUpdater() { PendingImageID: strPtrOrNil(s.pendingImageID), StagedImageID: strPtrOrNil(s.stagedImageID), JobID: strPtrOrNil(s.jobID), + BootBuyRC: int32PtrOrNil(s.bootBuyRC), } s.mu.Unlock() s.conn.Publish(s.conn.NewMessage(TopicUpdaterFact, fact, true)) diff --git a/services/updater/prestage_host.go b/services/updater/prestage_host.go index c78c83a..531764d 100644 --- a/services/updater/prestage_host.go +++ b/services/updater/prestage_host.go @@ -2,11 +2,34 @@ package updater +import "errors" + type streamedStage struct { + Version string + BuildID string + ImageID string Length uint32 PayloadSHA256 string } -func consumeStreamedStage() (streamedStage, bool) { +func startStreamedStage(size uint32) error { + _ = size + return nil +} + +func writeStreamedStage(data []byte) error { + _ = data + return errors.New("streamed_stage_not_supported") +} + +func commitStreamedStage() (streamedStage, error) { + return streamedStage{}, errors.New("streamed_stage_not_supported") +} + +func abortStreamedStage() {} + +func consumeStreamedStageResult() (streamedStage, bool) { return streamedStage{}, false } + +func discardStreamedStageResult() {} diff --git a/services/updater/prestage_tinygo.go b/services/updater/prestage_tinygo.go index bef919d..4cdca0d 100644 --- a/services/updater/prestage_tinygo.go +++ b/services/updater/prestage_tinygo.go @@ -25,10 +25,7 @@ var ( streamedStageLen 
uint32 ) -// BeginStreamedStage prepares the inactive slot for a raw incoming transfer. -// The caller must subsequently call WriteStreamedStage and CommitStreamedStage -// or AbortStreamedStage. -func BeginStreamedStage(size uint32) error { +func startStreamedStage(size uint32) error { // A fresh prepare invalidates any prior stage, and retrying an update in // the same boot must not inherit abupdate's previous writing/complete // state. Recreate the updater before resolving the inactive slot. @@ -49,7 +46,7 @@ func BeginStreamedStage(size uint32) error { return nil } -func WriteStreamedStage(data []byte) error { +func writeStreamedStage(data []byte) error { if len(data) == 0 { return errors.New("empty_chunk") } @@ -65,30 +62,33 @@ func WriteStreamedStage(data []byte) error { return nil } -func CommitStreamedStage() (uint32, error) { +func commitStreamedStage() (streamedStage, error) { u, err := ensureUpdaterInited() if err != nil { - return 0, err + return streamedStage{}, err } if rc := u.FlushFinal(); rc != 0 { - return 0, errFromRC("flush_final", rc) + return streamedStage{}, errFromRC("flush_final", rc) } streamedStageDesc = streamedStage{ Length: streamedStageLen, PayloadSHA256: hex.EncodeToString(streamedStageHash.Sum(nil)), } streamedStageOK = true - return u.BytesWritten(), nil + if written := u.BytesWritten(); written != streamedStageDesc.Length { + streamedStageDesc.Length = written + } + return streamedStageDesc, nil } -func AbortStreamedStage() { +func abortStreamedStage() { streamedStageDesc = streamedStage{} streamedStageOK = false streamedStageLen = 0 streamedStageHash.Reset() } -func consumeStreamedStage() (streamedStage, bool) { +func consumeStreamedStageResult() (streamedStage, bool) { if !streamedStageOK { return streamedStage{}, false } @@ -99,3 +99,7 @@ func consumeStreamedStage() (streamedStage, bool) { streamedStageHash.Reset() return out, true } + +func discardStreamedStageResult() { + abortStreamedStage() +} diff --git 
a/services/updater/receiver.go b/services/updater/receiver.go index 51c33e8..f4fde09 100644 --- a/services/updater/receiver.go +++ b/services/updater/receiver.go @@ -2,6 +2,7 @@ package updater import ( "bytes" + "errors" "devicecode-go/bus" ) @@ -34,17 +35,33 @@ func (s *Service) handleStage(msg *bus.Message) { s.reply(msg, StageReply{OK: false, Err: "unsupported_digest_alg"}) return } - s.transitionTo(StateReceiving, "", "") + if err := s.checkStreamedStageLease(payload.XferID, payload.Generation, true); err != nil { + s.reply(msg, StageReply{OK: false, Err: err.Error()}) + return + } if len(payload.Artefact) == 0 { - staged, ok := consumeStreamedStage() + staged, ok := consumeStreamedStageResult() if !ok { - s.clearStagedImage() - s.transitionTo(StateFailed, "artefact_missing", "") + s.failStage(payload, "artefact_missing") s.reply(msg, StageReply{OK: false, Err: "artefact_missing"}) return } + if err := s.checkStreamedStageLease(payload.XferID, payload.Generation, true); err != nil { + s.failLateStage(payload, err) + s.reply(msg, StageReply{OK: false, Err: err.Error()}) + return + } stageIdentity, _ := identityFromStageMeta(s.identity, payload.Meta) + if staged.Version != "" { + stageIdentity.Version = staged.Version + } + if staged.BuildID != "" { + stageIdentity.Build = staged.BuildID + } + if staged.ImageID != "" { + stageIdentity.ImageID = staged.ImageID + } desc := StagedDescriptor{ Version: stageIdentity.Version, BuildID: stageIdentity.Build, @@ -54,12 +71,21 @@ func (s *Service) handleStage(msg *bus.Message) { PayloadSHA256: staged.PayloadSHA256, } if err := s.metadataWrite.WriteStagedDescriptor(desc); err != nil { - _ = s.metadataWrite.ClearStagedDescriptor() - s.clearStagedImage() - s.transitionTo(StateFailed, "metadata_write_failed:"+err.Error(), "") + s.failStage(payload, "metadata_write_failed:"+err.Error()) s.reply(msg, StageReply{OK: false, Err: "metadata_write_failed"}) return } + if err := s.checkStreamedStageLease(payload.XferID, 
payload.Generation, true); err != nil { + s.failLateStage(payload, err) + s.reply(msg, StageReply{OK: false, Err: err.Error()}) + return + } + if !s.releaseStreamedStageLease(payload.XferID, payload.Generation) { + err := errors.New("stage_cancelled") + s.failLateStage(payload, err) + s.reply(msg, StageReply{OK: false, Err: err.Error()}) + return + } s.setStagedImage(desc.ImageID, desc.Version) s.transitionTo(StateStaged, "", desc.Version) s.reply(msg, StageReply{OK: true, Stage: "staged"}) @@ -68,33 +94,27 @@ func (s *Service) handleStage(msg *bus.Message) { sink, err := newSlotSink(uint32(len(payload.Artefact))) if err != nil { - s.clearStagedImage() - s.transitionTo(StateFailed, "sink_init_failed:"+err.Error(), "") + s.failStage(payload, "sink_init_failed:"+err.Error()) s.reply(msg, StageReply{OK: false, Err: "sink_init_failed"}) return } + if err := s.checkStreamedStageLease(payload.XferID, payload.Generation, true); err != nil { + _ = sink.Abort() + s.failLateStage(payload, err) + s.reply(msg, StageReply{OK: false, Err: err.Error()}) + return + } manifest, err := s.verifier.Verify(bytes.NewReader(payload.Artefact), sink) if err != nil { // Verifier rejected the artefact. Clear any prior descriptor so a // following commit cannot apply stale firmware from an older stage. - _ = s.metadataWrite.ClearStagedDescriptor() - s.clearStagedImage() - s.transitionTo(StateFailed, err.Error(), "") + s.failStage(payload, err.Error()) s.reply(msg, StageReply{OK: false, Err: err.Error()}) return } - - // On verifier success the sink holds the verified payload bytes. - // Persist the staged descriptor via the abupdate metadata writer - // (W11) so the next prepare/commit RPC and the next boot's - // software fact see payload_sha256 + descriptor. The fabric-update - // branch ships an in-memory writer; fabric-security replaces it - // with a flash-backed implementation that survives reboots. 
- if err := sink.Commit(); err != nil { - _ = s.metadataWrite.ClearStagedDescriptor() - s.clearStagedImage() - s.transitionTo(StateFailed, "sink_commit_failed:"+err.Error(), "") - s.reply(msg, StageReply{OK: false, Err: "sink_commit_failed"}) + if err := s.checkStreamedStageLease(payload.XferID, payload.Generation, true); err != nil { + s.failLateStage(payload, err) + s.reply(msg, StageReply{OK: false, Err: err.Error()}) return } desc := StagedDescriptor{ @@ -102,17 +122,26 @@ func (s *Service) handleStage(msg *bus.Message) { BuildID: manifest.BuildID, ImageID: manifest.ImageID, Length: manifest.PayloadLength, - Slot: 0, // slot-pick comes from abupdate when fabric-security wires it + Slot: 0, // slot-pick comes from abupdate when hardware apply is wired PayloadSHA256: manifest.PayloadSHA256, } if err := s.metadataWrite.WriteStagedDescriptor(desc); err != nil { - _ = s.metadataWrite.ClearStagedDescriptor() - s.clearStagedImage() - s.transitionTo(StateFailed, "metadata_write_failed:"+err.Error(), "") + s.failStage(payload, "metadata_write_failed:"+err.Error()) s.reply(msg, StageReply{OK: false, Err: "metadata_write_failed"}) return } + if err := s.checkStreamedStageLease(payload.XferID, payload.Generation, true); err != nil { + s.failLateStage(payload, err) + s.reply(msg, StageReply{OK: false, Err: err.Error()}) + return + } + if !s.releaseStreamedStageLease(payload.XferID, payload.Generation) { + err := errors.New("stage_cancelled") + s.failLateStage(payload, err) + s.reply(msg, StageReply{OK: false, Err: err.Error()}) + return + } s.setStagedImage(desc.ImageID, manifest.Version) s.transitionTo(StateStaged, "", manifest.Version) // Do not republish the software fact here: PayloadSHA256 describes the @@ -120,6 +149,28 @@ func (s *Service) handleStage(msg *bus.Message) { s.reply(msg, StageReply{OK: true, Stage: "staged"}) } +func (s *Service) failStage(payload StagePayload, reason string) { + _ = s.metadataWrite.ClearStagedDescriptor() + s.clearStagedImage() + if 
payload.Generation != 0 { + s.cancelStreamedStageLease(payload.XferID, payload.Generation, reason) + } + s.transitionTo(StateFailed, reason, "") +} + +func (s *Service) failLateStage(payload StagePayload, err error) { + reason := "stage_cancelled" + if err != nil { + reason = err.Error() + } + _ = s.metadataWrite.ClearStagedDescriptor() + s.clearStagedImage() + if payload.Generation != 0 { + s.cancelStreamedStageLease(payload.XferID, payload.Generation, reason) + } + s.transitionTo(StateFailed, reason, "") +} + type stageMetadata struct { Version string `json:"version,omitempty"` Build string `json:"build,omitempty"` diff --git a/services/updater/rpc.go b/services/updater/rpc.go index b721eb3..a9a589c 100644 --- a/services/updater/rpc.go +++ b/services/updater/rpc.go @@ -18,13 +18,25 @@ func (s *Service) handlePrepare(msg *bus.Message) { } s.mu.Lock() - if s.preparing || s.state == StateCommitting || s.state == StateRebooting { + if s.preparing || + s.streamLeaseActive || + s.state == StateReceiving || + s.state == StateCommitting || + s.state == StateRebooting { s.mu.Unlock() s.reply(msg, Reply{OK: false, Error: ErrBusy}) return } s.preparing = true s.mu.Unlock() + prepareActive := true + finishPrepare := func() { + if prepareActive { + s.markPrepareDone() + prepareActive = false + } + } + defer finishPrepare() s.setJobContext(req.JobID, req.ExpectedImageID) s.transitionTo(StatePreparing, "", "") @@ -34,13 +46,19 @@ func (s *Service) handlePrepare(msg *bus.Message) { // which would be a real safety bug since the user-intent on // prepare(B) is "I want to stage B, throw away A". 
if err := s.metadataWrite.ClearStagedDescriptor(); err != nil { - s.markPrepareDone() - s.reply(msg, Reply{OK: false, Error: "metadata_clear_failed:" + err.Error()}) + errMsg := "metadata_clear_failed:" + err.Error() + s.transitionTo(StateFailed, errMsg, "") + finishPrepare() + s.reply(msg, Reply{OK: false, Error: errMsg}) return } + s.mu.Lock() + s.openStageGenerationLocked() + s.mu.Unlock() + s.transitionTo(StateReady, "", "") - s.markPrepareDone() + finishPrepare() s.reply(msg, PrepareReply{ Ready: true, Target: TargetUpdaterMain, @@ -62,8 +80,13 @@ func (s *Service) handleCommit(msg *bus.Message) { s.mu.Lock() stagedInState := s.state == StateStaged pendingImageID := s.pendingImageID + streamActive := s.streamLeaseActive s.mu.Unlock() + if streamActive { + s.reply(msg, Reply{OK: false, Error: ErrBusy}) + return + } if !present || !stagedInState { s.reply(msg, Reply{OK: false, Error: ErrNothingStaged}) return diff --git a/services/updater/stream_lease.go b/services/updater/stream_lease.go new file mode 100644 index 0000000..1926330 --- /dev/null +++ b/services/updater/stream_lease.go @@ -0,0 +1,240 @@ +package updater + +import ( + "errors" + "sync" +) + +var ( + activeServiceMu sync.Mutex + activeService *Service +) + +func registerActiveService(s *Service) func() { + activeServiceMu.Lock() + activeService = s + activeServiceMu.Unlock() + return func() { + activeServiceMu.Lock() + if activeService == s { + activeService = nil + } + activeServiceMu.Unlock() + } +} + +func currentService() *Service { + activeServiceMu.Lock() + defer activeServiceMu.Unlock() + return activeService +} + +// BeginStreamedStage acquires the updater-owned staging lease opened by the +// last successful prepare-update call. Fabric calls this from xfer_begin before +// any sink mutates flash or buffers transfer state. 
+func BeginStreamedStage(xferID string, size uint32) (uint64, error) { + s := currentService() + if s == nil { + return 0, errors.New("updater_not_running") + } + gen, err := s.beginStreamedStageLease(xferID) + if err != nil { + return 0, err + } + if err := startStreamedStage(size); err != nil { + s.cancelStreamedStageLease(xferID, gen, err.Error()) + return 0, err + } + if err := s.markStreamedStageReceiving(xferID, gen); err != nil { + abortStreamedStage() + s.cancelStreamedStageLease(xferID, gen, err.Error()) + return 0, err + } + return gen, nil +} + +func WriteStreamedStage(xferID string, generation uint64, data []byte) error { + s := currentService() + if s == nil { + return errors.New("updater_not_running") + } + if err := s.checkStreamedStageLease(xferID, generation, false); err != nil { + return err + } + return writeStreamedStage(data) +} + +func CommitStreamedStage(xferID string, generation uint64) (uint32, error) { + s := currentService() + if s == nil { + return 0, errors.New("updater_not_running") + } + if err := s.checkStreamedStageLease(xferID, generation, false); err != nil { + return 0, err + } + staged, err := commitStreamedStage() + if err != nil { + s.cancelStreamedStageLease(xferID, generation, err.Error()) + return 0, err + } + if err := s.markStreamedStageCommitted(xferID, generation); err != nil { + abortStreamedStage() + return 0, err + } + return staged.Length, nil +} + +func CommitBufferedStage(xferID string, generation uint64) error { + s := currentService() + if s == nil { + return errors.New("updater_not_running") + } + return s.markStreamedStageCommitted(xferID, generation) +} + +func AbortStreamedStage(xferID string, generation uint64, reason string) { + abortStreamedStage() + if s := currentService(); s != nil { + s.cancelStreamedStageLease(xferID, generation, reason) + } +} + +func CancelStreamedStage(xferID string, generation uint64, reason string) { + AbortStreamedStage(xferID, generation, reason) +} + +func (s *Service) 
openStageGenerationLocked() uint64 { + s.stageGeneration++ + if s.stageGeneration == 0 { + s.stageGeneration = 1 + } + s.streamLeaseActive = false + s.streamXferID = "" + s.streamCancelled = false + s.streamCommitted = false + discardStreamedStageResult() + return s.stageGeneration +} + +func (s *Service) beginStreamedStageLease(xferID string) (uint64, error) { + if xferID == "" { + return 0, errors.New("bad_message:xfer_id") + } + s.mu.Lock() + defer s.mu.Unlock() + if s.preparing || + s.state == StatePreparing || + s.state == StateCommitting || + s.state == StateRebooting || + s.state == StateReceiving || + s.streamLeaseActive { + return 0, errors.New(ErrBusy) + } + if s.state != StateReady || s.stageGeneration == 0 { + return 0, errors.New("stage_not_prepared") + } + s.streamLeaseActive = true + s.streamXferID = xferID + s.streamCancelled = false + s.streamCommitted = false + return s.stageGeneration, nil +} + +func (s *Service) markStreamedStageReceiving(xferID string, generation uint64) error { + s.mu.Lock() + if generation == 0 || generation != s.stageGeneration || xferID == "" || xferID != s.streamXferID { + s.mu.Unlock() + return errors.New("stage_generation_mismatch") + } + if s.streamCancelled { + s.mu.Unlock() + return errors.New("stage_cancelled") + } + if !s.streamLeaseActive { + s.mu.Unlock() + return errors.New("stage_not_active") + } + s.state = StateReceiving + s.lastError = "" + s.mu.Unlock() + s.PublishUpdater() + return nil +} + +func (s *Service) checkStreamedStageLease(xferID string, generation uint64, requireCommitted bool) error { + s.mu.Lock() + defer s.mu.Unlock() + if generation == 0 || generation != s.stageGeneration || xferID == "" || xferID != s.streamXferID { + return errors.New("stage_generation_mismatch") + } + if s.streamCancelled { + return errors.New("stage_cancelled") + } + if !s.streamLeaseActive { + return errors.New("stage_not_active") + } + if requireCommitted && !s.streamCommitted { + return 
errors.New("stage_not_committed") + } + return nil +} + +func (s *Service) markStreamedStageCommitted(xferID string, generation uint64) error { + s.mu.Lock() + defer s.mu.Unlock() + if generation == 0 || generation != s.stageGeneration || xferID == "" || xferID != s.streamXferID { + return errors.New("stage_generation_mismatch") + } + if s.streamCancelled { + return errors.New("stage_cancelled") + } + if !s.streamLeaseActive { + return errors.New("stage_not_active") + } + s.streamCommitted = true + return nil +} + +func (s *Service) cancelStreamedStageLease(xferID string, generation uint64, reason string) bool { + s.mu.Lock() + matches := generation != 0 && + generation == s.stageGeneration && + s.streamLeaseActive && + (xferID == "" || xferID == s.streamXferID) + if matches { + s.streamCancelled = true + s.streamLeaseActive = false + s.streamCommitted = false + s.streamXferID = "" + s.stagedImageID = "" + s.pendingVersion = "" + if s.state == StateReady || s.state == StateReceiving || s.state == StateStaged { + s.state = StateFailed + } + if reason != "" { + s.lastError = reason + } + } + s.mu.Unlock() + if matches { + _ = s.metadataWrite.ClearStagedDescriptor() + s.PublishUpdater() + } + return matches +} + +func (s *Service) releaseStreamedStageLease(xferID string, generation uint64) bool { + s.mu.Lock() + matches := generation != 0 && + generation == s.stageGeneration && + xferID == s.streamXferID && + s.streamLeaseActive && + !s.streamCancelled + if matches { + s.streamLeaseActive = false + s.streamCommitted = false + s.streamXferID = "" + } + s.mu.Unlock() + return matches +} diff --git a/services/updater/types.go b/services/updater/types.go index 7fc1231..095d8ea 100644 --- a/services/updater/types.go +++ b/services/updater/types.go @@ -29,10 +29,13 @@ func (s State) Allowed() bool { } const ( - PrepareTargetMCU = "mcu" - TargetUpdaterMain = "updater/main" - DigestAlgXXHash32 = "xxhash32" - DefaultMaxChunkSize uint32 = 2048 + PrepareTargetMCU = "mcu" + 
TargetUpdaterMain = "updater/main" + DigestAlgXXHash32 = "xxhash32" + // DefaultMaxChunkSize is the safe RP2350 Fabric OTA limit currently + // advertised by prepare-update. It is a target pacing limit, not a + // Fabric protocol maximum. + DefaultMaxChunkSize uint32 = 512 ) // PrepareRequest mirrors the current prepare-update payload. @@ -73,22 +76,21 @@ type Reply struct { // Refusal error strings — the Lua side compares against these. const ( - ErrBusy = "busy" - ErrNothingStaged = "nothing_staged" - ErrTargetMismatch = "target_mismatch" + ErrBusy = "busy" + ErrNothingStaged = "nothing_staged" + ErrTargetMismatch = "target_mismatch" + ErrABUpdateBuyFailed = "abupdate_buy_failed" // ErrApplyUnavailable is returned when the commit RPC sees a valid // staged descriptor but no Applier is wired to actually trigger - // the slot-switch + reboot. fabric-update ships with a refusing - // Applier so we never lie to the CM5 about apply success on a - // branch where the apply path doesn't exist; fabric-security - // supplies a real Applier and the refusal goes away. + // the slot-switch + reboot. Refusing by default means we never lie + // to the CM5 about apply success when the hardware apply path is not + // wired. ErrApplyUnavailable = "apply_unavailable" ) -// SoftwareFact is the retained payload at state/self/software per -// docs/firmware-alignment-update.md §"Identity facts". `boot_id` is -// generated per boot (W6, RAM-only); `payload_sha256` is bare 64-char -// lower-hex sourced from the abupdate metadata block. +// SoftwareFact is the retained payload at state/self/software. +// `boot_id` is generated per boot and kept in RAM only; `payload_sha256` +// is bare 64-char lower-hex sourced from the abupdate metadata block. 
type SoftwareFact struct { Version string `json:"version"` BuildID string `json:"build_id"` @@ -107,6 +109,7 @@ type UpdaterFact struct { PendingImageID *string `json:"pending_image_id"` StagedImageID *string `json:"staged_image_id"` JobID *string `json:"job_id"` + BootBuyRC *int32 `json:"boot_buy_rc,omitempty"` } // HealthFact is the retained payload at state/self/health. Lua extracts @@ -134,14 +137,15 @@ type StagedDescriptor struct { // the older meta.receiver/raw-member receive path; the CM5 supplies only // target="updater/main" on the wire. type StagePayload struct { - LinkID string `json:"link_id"` - XferID string `json:"xfer_id"` - Target string `json:"target"` - Size uint32 `json:"size"` - DigestAlg string `json:"digest_alg"` - Digest string `json:"digest"` - Meta any `json:"meta,omitempty"` - Artefact []byte `json:"artefact,omitempty"` + LinkID string `json:"link_id"` + XferID string `json:"xfer_id"` + Generation uint64 `json:"generation,omitempty"` + Target string `json:"target"` + Size uint32 `json:"size"` + DigestAlg string `json:"digest_alg"` + Digest string `json:"digest"` + Meta any `json:"meta,omitempty"` + Artefact []byte `json:"artefact,omitempty"` } type StageReply struct { diff --git a/services/updater/updater.go b/services/updater/updater.go index b5dc335..66726dd 100644 --- a/services/updater/updater.go +++ b/services/updater/updater.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "sync" + "time" "devicecode-go/bus" ) @@ -22,10 +23,10 @@ var ( TopicHealthFact = bus.T("state", "self", "health") // TopicFabricLink is the wildcard the updater watches to drive the - // post-hello_ack republish (W10). The fabric session retains a + // post-hello_ack republish. 
The fabric session retains a // payload at state/fabric/link/ on every link-state edge; - // we pick out Ready-true transitions and call Republish() so the - // CM5 sees fresh state/self/* facts on every newly established + // we pick out Ready-true transitions and call PublishCriticalFacts() + // so the CM5 sees fresh state/self/* facts on every newly established // session, warm or cold. TopicFabricLink = bus.T("state", "fabric", "link", "+") ) @@ -40,9 +41,8 @@ type Identity struct { // MetadataReader is the read side of the abupdate metadata block — the // updater pulls payload_sha256 and the staged descriptor (if any) from -// here at boot. The fabric-update branch only requires reads from this -// interface; the matching MetadataWriter handles staging-side -// persistence in W11. +// here at boot. The matching MetadataWriter handles staging-side +// persistence. type MetadataReader interface { PayloadSHA256() string StagedDescriptor() (StagedDescriptor, bool) @@ -51,10 +51,8 @@ type MetadataReader interface { // MetadataWriter is the write side: updater/main staging hands a verified // StagedDescriptor + payload_sha256 here so the next boot's // MetadataReader observes them. A default in-memory implementation is -// supplied (NewMemoryMetadata) for the fabric-update branch; the -// pico2-a-b/abupdate flash-backed implementation lands later (it -// touches the metadata sector at offset 0x000FF000 — see master -// plan §abupdate metadata block). +// supplied for host tests and non-persistent builds; flash-backed +// implementations may persist this in the abupdate metadata block. type MetadataWriter interface { WriteStagedDescriptor(d StagedDescriptor) error ClearStagedDescriptor() error @@ -88,9 +86,9 @@ type MemoryMetadata struct { func NewMemoryMetadata() *MemoryMetadata { return &MemoryMetadata{} } // SetRunningPayloadSHA records the hash of the currently-running -// image. 
fabric-security wires this from the active slot's flash -// metadata at boot; tests can call it directly. Bare 64-char -// lower-hex per the spec. +// image. Hardware builds can source this from the active slot's flash +// metadata at boot; tests can call it directly. Bare 64-char lower-hex +// per the spec. func (m *MemoryMetadata) SetRunningPayloadSHA(sha string) { m.mu.Lock() defer m.mu.Unlock() @@ -159,23 +157,62 @@ type Service struct { stagedImageID string jobID string preparing bool + bootBuyRC int32 + + stageGeneration uint64 + streamLeaseActive bool + streamXferID string + streamCancelled bool + streamCommitted bool applyResults chan applyRebootResult + criticalRepublish CriticalRepublishConfig + // Logger seam — left as a small helper so tests can plug in. nil in // tests means stderr-style println. logf func(string, ...any) } +// CriticalRepublishConfig controls level-triggered publication of the MCU +// critical facts while a Fabric peer remains ready. Zero values use the +// hardware-safe defaults. +type CriticalRepublishConfig struct { + BurstInterval time.Duration + BurstDuration time.Duration + SteadyInterval time.Duration +} + +const ( + defaultCriticalRepublishBurstInterval = time.Second + defaultCriticalRepublishBurstDuration = 10 * time.Second + defaultCriticalRepublishSteadyInterval = 15 * time.Second +) + +func normalizeCriticalRepublishConfig(cfg CriticalRepublishConfig) CriticalRepublishConfig { + if cfg.BurstInterval <= 0 { + cfg.BurstInterval = defaultCriticalRepublishBurstInterval + } + if cfg.BurstDuration <= 0 { + cfg.BurstDuration = defaultCriticalRepublishBurstDuration + } + if cfg.SteadyInterval <= 0 { + cfg.SteadyInterval = defaultCriticalRepublishSteadyInterval + } + return cfg +} + // Options bundle the constructor parameters so Service can grow new // dependencies without churning callers. 
type Options struct { - Conn *bus.Connection - Verifier Verifier - Applier Applier - Identity Identity - Metadata MetadataReader - MetadataWrite MetadataWriter + Conn *bus.Connection + Verifier Verifier + Applier Applier + Identity Identity + Metadata MetadataReader + MetadataWrite MetadataWriter + BootBuyRC int32 + CriticalRepublish CriticalRepublishConfig } // New builds a Service. Verifier defaults to the rejecting StubVerifier @@ -206,7 +243,7 @@ func New(opts Options) *Service { // Reader-only: writes from staging become no-ops. mw = noopMetadataWriter{} } - return &Service{ + s := &Service{ conn: opts.Conn, verifier: v, applier: a, @@ -214,8 +251,17 @@ func New(opts Options) *Service { metadata: mr, metadataWrite: mw, state: StateRunning, + bootBuyRC: opts.BootBuyRC, applyResults: make(chan applyRebootResult, 1), + criticalRepublish: normalizeCriticalRepublishConfig( + opts.CriticalRepublish, + ), + } + if opts.BootBuyRC != 0 { + s.state = StateFailed + s.lastError = ErrABUpdateBuyFailed } + return s } // noopMetadataWriter is the writer-side fallback when the caller @@ -231,8 +277,11 @@ func (noopMetadataWriter) ClearStagedDescriptor() error { // Run binds the RPC + staging topics, publishes the initial fact // surface, and watches the fabric link-state retain for ready-true -// edges (W10). Blocks until ctx is cancelled. +// edges. Blocks until ctx is cancelled. func (s *Service) Run(ctx context.Context) { + unregister := registerActiveService(s) + defer unregister() + prepareSub := s.conn.Subscribe(TopicPrepareRPC) defer s.conn.Unsubscribe(prepareSub) @@ -247,13 +296,71 @@ func (s *Service) Run(ctx context.Context) { // Initial fact publish: tells the CM5 we're alive and reports // build identity + the freshly generated boot_id. - s.PublishSoftware() - s.PublishUpdater() - s.PublishHealth("ok", "") - - // Track per-link ready state so we only republish on the - // !Ready -> Ready edge, not on every retain churn. 
- prevReady := map[string]bool{} + s.PublishCriticalFacts() + + // Track per-link ready/session identity so a CM5 SID change republishes + // critical retained facts even if the link does not emit a clean false edge. + linkState := map[string]linkObservation{} + var criticalTimer *time.Timer + var criticalTimerC <-chan time.Time + var burstUntil time.Time + stopCriticalTimer := func() { + if criticalTimer == nil { + return + } + if !criticalTimer.Stop() { + select { + case <-criticalTimer.C: + default: + } + } + criticalTimerC = nil + } + defer stopCriticalTimer() + armCriticalTimer := func(delay time.Duration) { + if delay <= 0 { + delay = s.criticalRepublish.SteadyInterval + } + if criticalTimer == nil { + criticalTimer = time.NewTimer(delay) + } else { + if !criticalTimer.Stop() { + select { + case <-criticalTimer.C: + default: + } + } + criticalTimer.Reset(delay) + } + criticalTimerC = criticalTimer.C + } + anyReadyLink := func() bool { + for _, obs := range linkState { + if obs.Ready { + return true + } + } + return false + } + startCriticalCadence := func(now time.Time) { + burstUntil = now.Add(s.criticalRepublish.BurstDuration) + armCriticalTimer(s.criticalRepublish.BurstInterval) + } + runCriticalCadence := func(now time.Time) { + if !anyReadyLink() { + burstUntil = time.Time{} + stopCriticalTimer() + return + } + if !burstUntil.IsZero() && now.Before(burstUntil) { + s.PublishCriticalFacts() + armCriticalTimer(s.criticalRepublish.BurstInterval) + return + } + burstUntil = time.Time{} + s.PublishCriticalFacts() + armCriticalTimer(s.criticalRepublish.SteadyInterval) + } for { select { @@ -276,30 +383,39 @@ func (s *Service) Run(ctx context.Context) { s.handleStage(msg) case result := <-s.applyResults: s.failRebootIfCurrent(result.desc, result.err) + case now := <-criticalTimerC: + runCriticalCadence(now) case msg, ok := <-linkSub.Channel(): if !ok || msg == nil { continue } - linkID, ready := decodeLinkState(msg) + linkID, obs := decodeLinkState(msg) if 
linkID == "" { continue } - was := prevReady[linkID] - if ready && !was { - // W10: post-hello_ack republish. Mirrors the spec line - // "republished after every successful boot AND on every - // newly established session (hello_ack), warm or cold". - s.Republish() + prev, hadPrev := linkState[linkID] + if reason := republishReason(prev, obs, hadPrev); reason != "" { + // Post-hello_ack republish. Mirrors the contract that state + // facts are republished after every successful boot and on + // every newly established session, warm or cold. + s.logRepublish(reason, linkID, obs) + s.PublishCriticalFacts() + linkState[linkID] = obs + startCriticalCadence(time.Now()) + continue + } + linkState[linkID] = obs + if !obs.Ready && !anyReadyLink() { + burstUntil = time.Time{} + stopCriticalTimer() } - prevReady[linkID] = ready } } } -// Republish re-emits all retained `state/self/*` facts. Wired up to -// fabric's session lifecycle so every new hello_ack triggers a fresh -// retain — required by the spec for warm-and-cold session resumes. -func (s *Service) Republish() { +// PublishCriticalFacts re-emits the retained state/self facts that CM5 update +// reconcile treats as mandatory for the MCU component. 
+func (s *Service) PublishCriticalFacts() { s.PublishSoftware() s.PublishUpdater() s.PublishHealth("ok", "") @@ -312,6 +428,7 @@ func (s *Service) transitionTo(next State, lastError, pendingVersion string) Sta s.mu.Lock() prev := s.state s.state = next + s.bootBuyRC = 0 if lastError != "" || (next != StateFailed && next != StateRollbackDetected) { s.lastError = lastError } @@ -321,10 +438,24 @@ func (s *Service) transitionTo(next State, lastError, pendingVersion string) Sta s.pendingVersion = "" } s.mu.Unlock() - s.PublishUpdater() + s.PublishCriticalFacts() return prev } +func (s *Service) logRepublish(reason, linkID string, obs linkObservation) { + if s.logf != nil { + s.logf("updater republish reason=%s link=%s peer_sid=%s local_sid=%s", reason, linkID, obs.PeerSID, obs.LocalSID) + return + } + println( + "updater", "republish", + "reason="+reason, + "link="+linkID, + "peer_sid="+obs.PeerSID, + "local_sid="+obs.LocalSID, + ) +} + func (s *Service) failRebootIfCurrent(desc StagedDescriptor, err error) bool { if err == nil { return false @@ -393,46 +524,76 @@ func (s *Service) reply(msg *bus.Message, payload any) { s.conn.Reply(msg, payload, false) } -// decodeLinkState extracts the link_id and ready flag from a +type linkObservation struct { + Ready bool + PeerSID string + LocalSID string +} + +func republishReason(prev, cur linkObservation, hadPrev bool) string { + if !cur.Ready { + return "" + } + if !hadPrev || !prev.Ready { + return "ready_edge" + } + if prev.PeerSID != cur.PeerSID { + return "peer_sid_changed" + } + if prev.LocalSID != cur.LocalSID { + return "local_sid_changed" + } + return "" +} + +// decodeLinkState extracts the link_id plus ready/session identity from a // state/fabric/link/ retain. Tolerates both the typed payload // shape published by services/fabric/session.go and a generic -// map[string]any (in-process test harnesses). Returns ("", false) +// map[string]any (in-process test harnesses). 
Returns ("", zero) // for any payload it can't make sense of — the caller treats that // as "no edge". -func decodeLinkState(msg *bus.Message) (string, bool) { +func decodeLinkState(msg *bus.Message) (string, linkObservation) { + var obs linkObservation if msg == nil { - return "", false + return "", obs } // Pull link_id from the topic tail (state/fabric/link/). t := msg.Topic if t == nil || t.Len() < 4 { - return "", false + return "", obs } last := t.At(t.Len() - 1) linkID, _ := last.(string) if linkID == "" { - return "", false + return "", obs } switch p := msg.Payload.(type) { case nil: - return linkID, false + return linkID, obs case map[string]any: - ready, _ := p["ready"].(bool) - return linkID, ready + obs.Ready, _ = p["ready"].(bool) + obs.PeerSID, _ = p["peer_sid"].(string) + obs.LocalSID, _ = p["local_sid"].(string) + return linkID, obs } // Fall back to JSON probe for the typed-struct payload that // fabric publishes via its linkStatePayload type. b, err := json.Marshal(msg.Payload) if err != nil { - return linkID, false + return linkID, obs } var probe struct { - Ready bool `json:"ready"` + Ready bool `json:"ready"` + PeerSID string `json:"peer_sid"` + LocalSID string `json:"local_sid"` } if err := json.Unmarshal(b, &probe); err != nil { - return linkID, false + return linkID, obs } - return linkID, probe.Ready + obs.Ready = probe.Ready + obs.PeerSID = probe.PeerSID + obs.LocalSID = probe.LocalSID + return linkID, obs } // jsonDecode is a small helper that tolerates both already-typed diff --git a/services/updater/updater_test.go b/services/updater/updater_test.go index 7bfadb5..d0bccf2 100644 --- a/services/updater/updater_test.go +++ b/services/updater/updater_test.go @@ -30,6 +30,11 @@ func (f *fakeVerifierAccept) Verify(r io.Reader, sink SlotSink) (Manifest, error } else { _, _ = io.Copy(sink, r) } + if sink != nil { + if err := sink.Commit(); err != nil { + return Manifest{}, err + } + } return f.manifest, nil } @@ -52,11 +57,23 @@ type 
fakeMetadata struct { func (f *fakeMetadata) PayloadSHA256() string { return f.sha } func (f *fakeMetadata) StagedDescriptor() (StagedDescriptor, bool) { return f.staged, f.has } +type failingClearMetadata struct { + *MemoryMetadata + err error +} + +func (f *failingClearMetadata) ClearStagedDescriptor() error { + if f.err != nil { + return f.err + } + return f.MemoryMetadata.ClearStagedDescriptor() +} + // fakeApplier always succeeds — used by tests that need the commit RPC // to drive the state machine through committing/rebooting without // actually rebooting (production wiring uses RefusingApplier so the -// commit RPC returns apply_unavailable until fabric-security supplies -// the real abupdate-backed implementation). +// commit RPC returns apply_unavailable until the real abupdate-backed +// implementation is supplied). // // canCalls and rebootCalls are kept separate so tests can verify the commit // ordering: CanApply first, publish rebooting + reply accepted, then ArmReboot. @@ -102,7 +119,7 @@ func (f *fakeApplier) rebootCall(i int) StagedDescriptor { return f.rebootCalls[i] } -// ---- boot_id (W6) --------------------------------------------------- +// ---- boot_id -------------------------------------------------------- func TestBootIDIs16HexChars(t *testing.T) { resetBootIDForTest() @@ -128,8 +145,8 @@ func TestBootIDIsCachedAcrossCalls(t *testing.T) { func TestBootIDChangesAfterReset(t *testing.T) { // resetBootIDForTest mimics a successful boot. 10 successive boots - // must all produce unique values (master R3 failure-mode list: - // "RNG-never-seeded / from-constant" guard). + // must all produce unique values ("RNG-never-seeded / from-constant" + // guard). 
seen := make(map[string]struct{}) for i := 0; i < 10; i++ { resetBootIDForTest() @@ -153,7 +170,7 @@ func TestBootIDIsNotAllZero(t *testing.T) { } } -// ---- state machine + RPC handlers (W4) ------------------------------ +// ---- state machine + RPC handlers ---------------------------------- func waitForFact[T any](t *testing.T, sub *bus.Subscription, want func(T) bool) T { t.Helper() @@ -177,6 +194,45 @@ func waitForFact[T any](t *testing.T, sub *bus.Subscription, want func(T) bool) } } +func waitForCriticalFactSet(t *testing.T, swSub, upSub, hSub *bus.Subscription) (SoftwareFact, UpdaterFact, HealthFact) { + t.Helper() + sw := waitForFact[SoftwareFact](t, swSub, nil) + up := waitForFact[UpdaterFact](t, upSub, nil) + h := waitForFact[HealthFact](t, hSub, nil) + return sw, up, h +} + +func drainSubscription(sub *bus.Subscription) { + for { + select { + case <-sub.Channel(): + default: + return + } + } +} + +func publishTestLinkState(conn *bus.Connection, ready bool, peerSID, localSID string) { + conn.Publish(conn.NewMessage( + bus.T("state", "fabric", "link", "mcu-uart0"), + map[string]any{ + "ready": ready, + "established": ready, + "peer_sid": peerSID, + "local_sid": localSID, + }, + true, + )) +} + +func testCriticalRepublishConfig() CriticalRepublishConfig { + return CriticalRepublishConfig{ + BurstInterval: 20 * time.Millisecond, + BurstDuration: 45 * time.Millisecond, + SteadyInterval: 30 * time.Millisecond, + } +} + func strValue(p *string) string { if p == nil { return "" @@ -184,6 +240,13 @@ func strValue(p *string) string { return *p } +func int32Value(p *int32) int32 { + if p == nil { + return 0 + } + return *p +} + func testStagePayload(id string, artefact []byte) StagePayload { return StagePayload{ LinkID: "mcu-uart0", @@ -196,6 +259,41 @@ func testStagePayload(id string, artefact []byte) StagePayload { } } +func preparedStagePayload(t *testing.T, caller *bus.Connection, svc *Service, id string, artefact []byte) StagePayload { + t.Helper() + req := 
caller.NewMessage(TopicPrepareRPC, PrepareRequest{Target: PrepareTargetMCU}, false) + sub := caller.Request(req) + defer caller.Unsubscribe(sub) + select { + case msg := <-sub.Channel(): + if msg == nil { + t.Fatal("nil prepare reply") + } + switch reply := msg.Payload.(type) { + case PrepareReply: + if !reply.Ready { + t.Fatalf("prepare reply not ready: %+v", reply) + } + case Reply: + t.Fatalf("prepare failed: %+v", reply) + default: + t.Fatalf("prepare payload type = %T", msg.Payload) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for prepare reply") + } + generation, err := svc.beginStreamedStageLease(id) + if err != nil { + t.Fatalf("begin stage lease: %v", err) + } + if err := svc.markStreamedStageCommitted(id, generation); err != nil { + t.Fatalf("commit stage lease: %v", err) + } + payload := testStagePayload(id, artefact) + payload.Generation = generation + return payload +} + func runService(t *testing.T, b *bus.Bus, opts Options) (*Service, context.CancelFunc) { t.Helper() resetBootIDForTest() @@ -263,6 +361,83 @@ func TestPublishesInitialFactsOnRun(t *testing.T) { } } +func TestBootBuyFailurePublishesInitialUpdaterFailure(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + observer := b.NewConnection("observer") + upSub := observer.Subscribe(TopicUpdaterFact) + defer observer.Unsubscribe(upSub) + + _, cancel := runService(t, b, Options{ + Conn: conn, + BootBuyRC: -42, + }) + defer cancel() + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateFailed }) + if strValue(up.LastError) != ErrABUpdateBuyFailed { + t.Fatalf("last_error = %q, want %q", strValue(up.LastError), ErrABUpdateBuyFailed) + } + if int32Value(up.BootBuyRC) != -42 { + t.Fatalf("boot_buy_rc = %d, want -42", int32Value(up.BootBuyRC)) + } +} + +func TestBootBuyFailureRepublishPreservesFields(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + observer := b.NewConnection("observer") + 
upSub := observer.Subscribe(TopicUpdaterFact) + defer observer.Unsubscribe(upSub) + + _, cancel := runService(t, b, Options{ + Conn: conn, + BootBuyRC: -7, + }) + defer cancel() + + _ = waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateFailed }) + + publisher := b.NewConnection("fabric-test") + publisher.Publish(publisher.NewMessage( + bus.T("state", "fabric", "link", "mcu-uart0"), + map[string]any{"ready": true, "established": true, "peer_sid": "cm5-a", "local_sid": "mcu-a"}, + true, + )) + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { + return f.State == StateFailed && int32Value(f.BootBuyRC) == -7 + }) + if strValue(up.LastError) != ErrABUpdateBuyFailed { + t.Fatalf("last_error = %q, want %q", strValue(up.LastError), ErrABUpdateBuyFailed) + } +} + +func TestBootBuyRCClearsOnExplicitUpdaterTransition(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + observer := b.NewConnection("observer") + upSub := observer.Subscribe(TopicUpdaterFact) + defer observer.Unsubscribe(upSub) + + svc, cancel := runService(t, b, Options{ + Conn: conn, + BootBuyRC: -9, + }) + defer cancel() + + _ = waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateFailed }) + svc.transitionTo(StatePreparing, "", "") + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StatePreparing }) + if up.BootBuyRC != nil { + t.Fatalf("boot_buy_rc = %d, want nil after transition", *up.BootBuyRC) + } + if strValue(up.LastError) != "" { + t.Fatalf("last_error = %q, want empty after transition", strValue(up.LastError)) + } +} + func TestPrepareTransitionsToReady(t *testing.T) { b := newTestBus() conn := b.NewConnection("updater") @@ -299,6 +474,301 @@ func TestPrepareTransitionsToReady(t *testing.T) { } } +func prepareUpdaterForLease(t *testing.T, caller *bus.Connection) { + t.Helper() + req := caller.NewMessage(TopicPrepareRPC, PrepareRequest{Target: 
PrepareTargetMCU}, false) + sub := caller.Request(req) + defer caller.Unsubscribe(sub) + select { + case msg := <-sub.Channel(): + if msg == nil { + t.Fatal("nil prepare reply") + } + reply, ok := msg.Payload.(PrepareReply) + if !ok || !reply.Ready { + t.Fatalf("prepare reply = %#v, want ready", msg.Payload) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for prepare reply") + } +} + +func requestUpdaterReply(t *testing.T, caller *bus.Connection, topic bus.Topic, payload any) any { + t.Helper() + msg := caller.NewMessage(topic, payload, false) + sub := caller.Request(msg) + defer caller.Unsubscribe(sub) + select { + case rep := <-sub.Channel(): + if rep == nil { + t.Fatal("nil reply") + } + return rep.Payload + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for reply") + } + return nil +} + +func TestBeginStreamedStageBeforePrepareReturnsStageNotPrepared(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + + _, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + + if gen, err := BeginStreamedStage("xfer-before-prepare", 4); err == nil || err.Error() != "stage_not_prepared" || gen != 0 { + t.Fatalf("BeginStreamedStage before prepare = gen=%d err=%v, want stage_not_prepared", gen, err) + } +} + +func TestPrepareOpensSingleReceivingStreamLeaseAndClearsStaleDescriptor(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + memMD := NewMemoryMetadata() + _ = memMD.WriteStagedDescriptor(StagedDescriptor{Version: "old", ImageID: "old-image", PayloadSHA256: "old"}) + _, cancel := runService(t, b, Options{Conn: conn, Metadata: memMD, MetadataWrite: memMD}) + defer cancel() + + prepareUpdaterForLease(t, caller) + if _, ok := memMD.StagedDescriptor(); ok { + t.Fatal("prepare did not clear stale staged descriptor") + } + + gen, err := BeginStreamedStage("xfer-lease", 
4) + if err != nil { + t.Fatalf("BeginStreamedStage: %v", err) + } + if gen == 0 { + t.Fatal("BeginStreamedStage returned generation 0") + } + defer CancelStreamedStage("xfer-lease", gen, "test_done") + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateReceiving }) + if strValue(up.LastError) != "" { + t.Fatalf("receiving last_error = %q, want empty", strValue(up.LastError)) + } + + if _, err := BeginStreamedStage("xfer-second", 4); err == nil || err.Error() != ErrBusy { + t.Fatalf("second BeginStreamedStage err = %v, want busy", err) + } + if err := CommitBufferedStage("wrong-xfer", gen); err == nil || err.Error() != "stage_generation_mismatch" { + t.Fatalf("wrong xfer CommitBufferedStage err = %v, want generation mismatch", err) + } + if err := CommitBufferedStage("xfer-lease", gen+1); err == nil || err.Error() != "stage_generation_mismatch" { + t.Fatalf("wrong generation CommitBufferedStage err = %v, want generation mismatch", err) + } + if err := CommitBufferedStage("xfer-lease", gen); err != nil { + t.Fatalf("matching CommitBufferedStage: %v", err) + } +} + +func TestPrepareAndCommitRejectWhileStreamLeaseActive(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + _, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + prepareUpdaterForLease(t, caller) + gen, err := BeginStreamedStage("xfer-active", 4) + if err != nil { + t.Fatalf("BeginStreamedStage: %v", err) + } + defer CancelStreamedStage("xfer-active", gen, "test_done") + + prepPayload := requestUpdaterReply(t, caller, TopicPrepareRPC, PrepareRequest{Target: PrepareTargetMCU}) + prepReply, ok := prepPayload.(Reply) + if !ok || prepReply.OK || prepReply.Error != ErrBusy { + t.Fatalf("prepare while receiving = %#v, want busy", prepPayload) + } + + commitPayload := requestUpdaterReply(t, caller, TopicCommitRPC, CommitRequest{}) + commitReply, ok := commitPayload.(Reply) + if !ok || 
commitReply.OK || commitReply.Error != ErrBusy { + t.Fatalf("commit while stream active = %#v, want busy", commitPayload) + } +} + +func TestPrepareRejectsWhileCommittingOrRebooting(t *testing.T) { + for _, state := range []State{StateCommitting, StateRebooting} { + t.Run(string(state), func(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + svc, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + + svc.transitionTo(state, "", "9.9.9") + payload := requestUpdaterReply(t, caller, TopicPrepareRPC, PrepareRequest{Target: PrepareTargetMCU}) + reply, ok := payload.(Reply) + if !ok || reply.OK || reply.Error != ErrBusy { + t.Fatalf("prepare while %s = %#v, want busy", state, payload) + } + }) + } +} + +func TestCancelStreamedStagePreventsLateStageSuccess(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + memMD := NewMemoryMetadata() + verif := &fakeVerifierAccept{manifest: Manifest{ + Version: "9.9.9", + ImageID: "ix", + PayloadSHA256: strings.Repeat("a", 64), + PayloadLength: 4, + }} + + _, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: verif, + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancel() + prepareUpdaterForLease(t, caller) + gen, err := BeginStreamedStage("xfer-cancel", 4) + if err != nil { + t.Fatalf("BeginStreamedStage: %v", err) + } + if err := CommitBufferedStage("xfer-cancel", gen); err != nil { + t.Fatalf("CommitBufferedStage: %v", err) + } + CancelStreamedStage("xfer-cancel", gen, "test_cancel") + + stage := testStagePayload("xfer-cancel", []byte("blob")) + stage.Generation = gen + payload := requestUpdaterReply(t, caller, TopicStageRPC, stage) + reply, ok := payload.(StageReply) + if !ok || reply.OK { + t.Fatalf("late stage after cancel = %#v, want rejection", payload) + } + if _, ok := memMD.StagedDescriptor(); ok { + t.Fatal("late stage after cancel persisted staged descriptor") + } +} + 
+func TestReleasedStagedLeaseIgnoresLateCancel(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + memMD := NewMemoryMetadata() + app := &fakeApplier{rebootCh: make(chan StagedDescriptor, 1)} + verif := &fakeVerifierAccept{manifest: Manifest{ + Version: "9.9.9", + BuildID: "build-9.9.9", + ImageID: "mcu-dev-9.9.9", + PayloadSHA256: strings.Repeat("c", 64), + PayloadLength: 4, + }} + + _, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: verif, + Applier: app, + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancel() + prepareUpdaterForLease(t, caller) + gen, err := BeginStreamedStage("xfer-released", 4) + if err != nil { + t.Fatalf("BeginStreamedStage: %v", err) + } + if err := CommitBufferedStage("xfer-released", gen); err != nil { + t.Fatalf("CommitBufferedStage: %v", err) + } + + stage := testStagePayload("xfer-released", []byte("blob")) + stage.Generation = gen + payload := requestUpdaterReply(t, caller, TopicStageRPC, stage) + reply, ok := payload.(StageReply) + if !ok || !reply.OK { + t.Fatalf("stage reply = %#v, want ok", payload) + } + waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateStaged }) + if _, ok := memMD.StagedDescriptor(); !ok { + t.Fatal("stage did not persist descriptor") + } + + CancelStreamedStage("xfer-released", gen, "late_cancel") + if _, ok := memMD.StagedDescriptor(); !ok { + t.Fatal("late cancel cleared released staged descriptor") + } + + commitPayload := requestUpdaterReply(t, caller, TopicCommitRPC, CommitRequest{}) + commit, ok := commitPayload.(CommitReply) + if !ok || !commit.Accepted || !commit.RebootRequired { + t.Fatalf("commit after late cancel = %#v, want accepted", commitPayload) + } +} + +func TestStaleGenerationAndWrongXferCannotMutateStreamedStage(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := 
b.NewConnection("caller") + memMD := NewMemoryMetadata() + verif := &fakeVerifierAccept{manifest: Manifest{ + Version: "9.9.9", + ImageID: "ix", + PayloadSHA256: strings.Repeat("b", 64), + PayloadLength: 4, + }} + + _, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: verif, + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancel() + prepareUpdaterForLease(t, caller) + gen, err := BeginStreamedStage("xfer-current", 4) + if err != nil { + t.Fatalf("BeginStreamedStage: %v", err) + } + defer CancelStreamedStage("xfer-current", gen, "test_done") + + if err := WriteStreamedStage("wrong-xfer", gen, []byte("data")); err == nil || err.Error() != "stage_generation_mismatch" { + t.Fatalf("wrong xfer WriteStreamedStage err = %v, want generation mismatch", err) + } + if _, err := CommitStreamedStage("xfer-current", gen+1); err == nil || err.Error() != "stage_generation_mismatch" { + t.Fatalf("stale generation CommitStreamedStage err = %v, want generation mismatch", err) + } + if err := CommitBufferedStage("xfer-current", gen); err != nil { + t.Fatalf("CommitBufferedStage: %v", err) + } + + for _, tc := range []struct { + name string + id string + gen uint64 + }{ + {name: "wrong_xfer", id: "wrong-xfer", gen: gen}, + {name: "stale_generation", id: "xfer-current", gen: gen + 1}, + } { + t.Run(tc.name, func(t *testing.T) { + stage := testStagePayload(tc.id, []byte("blob")) + stage.Generation = tc.gen + payload := requestUpdaterReply(t, caller, TopicStageRPC, stage) + reply, ok := payload.(StageReply) + if !ok || reply.OK { + t.Fatalf("stage with stale identity = %#v, want rejection", payload) + } + if _, ok := memMD.StagedDescriptor(); ok { + t.Fatal("stage with stale identity persisted descriptor") + } + }) + } +} + func TestCommitWithoutStagedReturnsNothingStaged(t *testing.T) { b := newTestBus() conn := b.NewConnection("updater") @@ -372,7 +842,7 @@ func TestCommitWithoutApplierReturnsApplyUnavailable(t *testing.T) { verif := &fakeVerifierAccept{manifest: 
Manifest{Version: "9.9.9", PayloadSHA256: strings.Repeat("a", 64), PayloadLength: 4}} memMD := NewMemoryMetadata() - _, cancel := runService(t, b, Options{ + svc, cancel := runService(t, b, Options{ Conn: conn, Verifier: verif, Metadata: memMD, @@ -382,7 +852,7 @@ func TestCommitWithoutApplierReturnsApplyUnavailable(t *testing.T) { defer cancel() // Drive updater/main staging to staged state. - rreq := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-x", []byte("blob")), false) + rreq := caller.NewMessage(TopicStageRPC, preparedStagePayload(t, caller, svc, "xfer-x", []byte("blob")), false) rsub := caller.Request(rreq) defer caller.Unsubscribe(rsub) <-rsub.Channel() @@ -427,7 +897,7 @@ func TestCommitWithFakeApplierTransitionsToRebooting(t *testing.T) { verif := &fakeVerifierAccept{manifest: Manifest{Version: "9.9.9", PayloadSHA256: strings.Repeat("a", 64), PayloadLength: 4}} memMD := NewMemoryMetadata() app := &fakeApplier{rebootCh: make(chan StagedDescriptor, 1)} - _, cancel := runService(t, b, Options{ + svc, cancel := runService(t, b, Options{ Conn: conn, Verifier: verif, Applier: app, @@ -437,7 +907,7 @@ func TestCommitWithFakeApplierTransitionsToRebooting(t *testing.T) { defer cancel() // Stage via updater/main. - rreq := caller.NewMessage(TopicStageRPC, testStagePayload("x", []byte("blob")), false) + rreq := caller.NewMessage(TopicStageRPC, preparedStagePayload(t, caller, svc, "x", []byte("blob")), false) <-caller.Request(rreq).Channel() // Commit. 
@@ -490,7 +960,7 @@ func TestCommitApplyRebootErrorPublishesFailedAfterAcceptedReply(t *testing.T) { }} memMD := NewMemoryMetadata() app := &fakeApplier{rebootErr: errors.New("apply_reboot_failed:reboot_into_slot:-1")} - _, cancel := runService(t, b, Options{ + svc, cancel := runService(t, b, Options{ Conn: conn, Verifier: verif, Applier: app, @@ -499,7 +969,7 @@ func TestCommitApplyRebootErrorPublishesFailedAfterAcceptedReply(t *testing.T) { }) defer cancel() - rreq := caller.NewMessage(TopicStageRPC, testStagePayload("x", []byte("blob")), false) + rreq := caller.NewMessage(TopicStageRPC, preparedStagePayload(t, caller, svc, "x", []byte("blob")), false) <-caller.Request(rreq).Channel() creq := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) @@ -565,10 +1035,10 @@ func TestStageStubVerifierPublishesFailed(t *testing.T) { upSub := caller.Subscribe(TopicUpdaterFact) defer caller.Unsubscribe(upSub) - _, cancel := runService(t, b, Options{Conn: conn, Verifier: StubVerifier()}) + svc, cancel := runService(t, b, Options{Conn: conn, Verifier: StubVerifier()}) defer cancel() - req := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-1", []byte("blob")), false) + req := caller.NewMessage(TopicStageRPC, preparedStagePayload(t, caller, svc, "xfer-1", []byte("blob")), false) replySub := caller.Request(req) defer caller.Unsubscribe(replySub) @@ -592,10 +1062,10 @@ func TestStageStubVerifierPublishesFailed(t *testing.T) { } func TestStageFakeAcceptWritesStagedDescriptor(t *testing.T) { - // W11: on verifier success staging writes the manifest's - // fields to the metadata writer. A subsequent commit RPC reads - // the descriptor back via the matching reader and transitions - // to rebooting with the same pending_version. + // On verifier success, staging writes the manifest fields to the + // metadata writer. A subsequent commit RPC reads the descriptor back + // via the matching reader and transitions to rebooting with the same + // pending_version. 
b := newTestBus() conn := b.NewConnection("updater") caller := b.NewConnection("caller") @@ -610,7 +1080,7 @@ func TestStageFakeAcceptWritesStagedDescriptor(t *testing.T) { PayloadLength: 4, }} memMD := NewMemoryMetadata() - _, cancel := runService(t, b, Options{ + svc, cancel := runService(t, b, Options{ Conn: conn, Verifier: verif, Applier: &fakeApplier{}, // success path; production default refuses @@ -620,7 +1090,7 @@ func TestStageFakeAcceptWritesStagedDescriptor(t *testing.T) { defer cancel() // Drive updater/main staging to verifier success. - req := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-w11", []byte("blob")), false) + req := caller.NewMessage(TopicStageRPC, preparedStagePayload(t, caller, svc, "xfer-w11", []byte("blob")), false) replySub := caller.Request(req) defer caller.Unsubscribe(replySub) select { @@ -684,7 +1154,7 @@ func TestStageFailureClearsStaleStagedDescriptor(t *testing.T) { // Service uses a verifier that always rejects. verif := &fakeVerifierReject{err: errString("bad_signature")} - _, cancel := runService(t, b, Options{ + svc, cancel := runService(t, b, Options{ Conn: conn, Verifier: verif, Applier: &fakeApplier{}, @@ -694,7 +1164,7 @@ func TestStageFailureClearsStaleStagedDescriptor(t *testing.T) { defer cancel() // Drive updater/main staging to failure. 
- rreq := caller.NewMessage(TopicStageRPC, testStagePayload("x", []byte("blob")), false) + rreq := caller.NewMessage(TopicStageRPC, preparedStagePayload(t, caller, svc, "x", []byte("blob")), false) rsub := caller.Request(rreq) defer caller.Unsubscribe(rsub) select { @@ -761,10 +1231,69 @@ func TestPrepareClearsStaleStagedDescriptor(t *testing.T) { } } +func TestPrepareClearFailureTransitionsToFailedAndAllowsRetry(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + memMD := NewMemoryMetadata() + _ = memMD.WriteStagedDescriptor(StagedDescriptor{Version: "1.0.0", PayloadSHA256: "old"}) + failMD := &failingClearMetadata{MemoryMetadata: memMD, err: errors.New("flash_busy")} + + _, cancel := runService(t, b, Options{ + Conn: conn, + Metadata: memMD, + MetadataWrite: failMD, + Applier: &fakeApplier{}, + }) + defer cancel() + + _ = waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateRunning }) + + preq := caller.NewMessage(TopicPrepareRPC, PrepareRequest{Target: PrepareTargetMCU}, false) + psub := caller.Request(preq) + defer caller.Unsubscribe(psub) + select { + case msg := <-psub.Channel(): + reply, ok := msg.Payload.(Reply) + if !ok { + t.Fatalf("reply payload type = %T", msg.Payload) + } + if reply.OK || reply.Error != "metadata_clear_failed:flash_busy" { + t.Fatalf("prepare reply = %+v, want metadata clear failure", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for failed prepare reply") + } + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateFailed }) + if strValue(up.LastError) != "metadata_clear_failed:flash_busy" { + t.Fatalf("last_error = %q, want metadata clear failure", strValue(up.LastError)) + } + + failMD.err = nil + retry := caller.NewMessage(TopicPrepareRPC, PrepareRequest{Target: PrepareTargetMCU}, false) + 
retrySub := caller.Request(retry) + defer caller.Unsubscribe(retrySub) + select { + case msg := <-retrySub.Channel(): + reply, ok := msg.Payload.(PrepareReply) + if !ok { + t.Fatalf("retry reply payload type = %T", msg.Payload) + } + if !reply.Ready { + t.Fatalf("retry prepare reply = %+v, want ready", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for retry prepare reply") + } +} + func TestStageFakeAcceptPublishesStaged(t *testing.T) { - // Test fake exercises the success path that fabric-security will - // flesh out in production. State -> staged, pending_version mirrors - // the manifest's build version, reply.OK = true. + // Test fake exercises the success path. State -> staged, + // pending_version mirrors the manifest's build version, reply.OK = true. b := newTestBus() conn := b.NewConnection("updater") caller := b.NewConnection("caller") @@ -772,10 +1301,10 @@ func TestStageFakeAcceptPublishesStaged(t *testing.T) { defer caller.Unsubscribe(upSub) verif := &fakeVerifierAccept{manifest: Manifest{Version: "9.9.9", BuildID: "bx", ImageID: "ix", PayloadSHA256: strings.Repeat("a", 64), PayloadLength: 4}} - _, cancel := runService(t, b, Options{Conn: conn, Verifier: verif}) + svc, cancel := runService(t, b, Options{Conn: conn, Verifier: verif}) defer cancel() - req := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-2", []byte("blob")), false) + req := caller.NewMessage(TopicStageRPC, preparedStagePayload(t, caller, svc, "xfer-2", []byte("blob")), false) replySub := caller.Request(req) defer caller.Unsubscribe(replySub) @@ -803,10 +1332,10 @@ func TestStageFakeRejectPublishesFailed(t *testing.T) { defer caller.Unsubscribe(upSub) verif := &fakeVerifierReject{err: errString("manifest_check_failed")} - _, cancel := runService(t, b, Options{Conn: conn, Verifier: verif}) + svc, cancel := runService(t, b, Options{Conn: conn, Verifier: verif}) defer cancel() - req := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-3", 
[]byte("blob")), false) + req := caller.NewMessage(TopicStageRPC, preparedStagePayload(t, caller, svc, "xfer-3", []byte("blob")), false) replySub := caller.Request(req) defer caller.Unsubscribe(replySub) @@ -829,11 +1358,64 @@ func TestStageFakeRejectPublishesFailed(t *testing.T) { } } -func TestRepublishOnLinkReadyEdge(t *testing.T) { - // W10 contract: the updater republishes its retained state/self/* - // surface on every !Ready -> Ready transition observed on - // state/fabric/link/. Verifies the edge is detected without - // double-firing on subsequent retains that keep Ready=true. +func TestRepublishReasonTracksReadyAndSessionEdges(t *testing.T) { + for _, tc := range []struct { + name string + hadPrev bool + prev linkObservation + cur linkObservation + want string + }{ + { + name: "first ready", + cur: linkObservation{Ready: true, PeerSID: "peer-1", LocalSID: "local-1"}, + want: "ready_edge", + }, + { + name: "false to true", + hadPrev: true, + prev: linkObservation{Ready: false, PeerSID: "peer-1", LocalSID: "local-1"}, + cur: linkObservation{Ready: true, PeerSID: "peer-1", LocalSID: "local-1"}, + want: "ready_edge", + }, + { + name: "same ready identity", + hadPrev: true, + prev: linkObservation{Ready: true, PeerSID: "peer-1", LocalSID: "local-1"}, + cur: linkObservation{Ready: true, PeerSID: "peer-1", LocalSID: "local-1"}, + }, + { + name: "peer sid changed", + hadPrev: true, + prev: linkObservation{Ready: true, PeerSID: "peer-1", LocalSID: "local-1"}, + cur: linkObservation{Ready: true, PeerSID: "peer-2", LocalSID: "local-1"}, + want: "peer_sid_changed", + }, + { + name: "local sid changed", + hadPrev: true, + prev: linkObservation{Ready: true, PeerSID: "peer-1", LocalSID: "local-1"}, + cur: linkObservation{Ready: true, PeerSID: "peer-1", LocalSID: "local-2"}, + want: "local_sid_changed", + }, + { + name: "not ready", + hadPrev: true, + prev: linkObservation{Ready: true, PeerSID: "peer-1", LocalSID: "local-1"}, + cur: linkObservation{Ready: false, 
PeerSID: "peer-2", LocalSID: "local-2"}, + }, + } { + t.Run(tc.name, func(t *testing.T) { + if got := republishReason(tc.prev, tc.cur, tc.hadPrev); got != tc.want { + t.Fatalf("republishReason() = %q, want %q", got, tc.want) + } + }) + } +} + +func TestRepublishOnLinkReadyAndSessionEdges(t *testing.T) { + // The updater republishes its retained state/self/* surface on every + // !Ready -> Ready transition, and on SID changes while Ready remains true. b := newTestBus() conn := b.NewConnection("updater") observer := b.NewConnection("observer") @@ -851,7 +1433,7 @@ func TestRepublishOnLinkReadyEdge(t *testing.T) { publisher := b.NewConnection("test-fabric") publisher.Publish(publisher.NewMessage( bus.T("state", "fabric", "link", "mcu-uart0"), - map[string]any{"ready": false, "established": false}, + map[string]any{"ready": false, "established": false, "peer_sid": "cm5-x", "local_sid": "mcu-a"}, true, )) // Brief wait then drop everything that's already in the channel. @@ -864,24 +1446,124 @@ func TestRepublishOnLinkReadyEdge(t *testing.T) { // software-fact republish. publisher.Publish(publisher.NewMessage( bus.T("state", "fabric", "link", "mcu-uart0"), - map[string]any{"ready": true, "established": true, "peer_sid": "cm5-x"}, + map[string]any{"ready": true, "established": true, "peer_sid": "cm5-x", "local_sid": "mcu-a"}, true, )) _ = waitForFact[SoftwareFact](t, swSub, nil) // Subsequent Ready=true retain (no edge) should NOT trigger another - // republish. We assert by checking the channel is empty after a - // short settle window. + // immediate edge republish. The default level-triggered cadence does + // not fire inside this short settle window. 
+ publisher.Publish(publisher.NewMessage( + bus.T("state", "fabric", "link", "mcu-uart0"), + map[string]any{"ready": true, "established": true, "peer_sid": "cm5-x", "local_sid": "mcu-a", "last_rx_ms": int64(123)}, + true, + )) + assertNoSoftwareRepublish(t, swSub, 150*time.Millisecond) + publisher.Publish(publisher.NewMessage( bus.T("state", "fabric", "link", "mcu-uart0"), - map[string]any{"ready": true, "established": true, "peer_sid": "cm5-x", "last_rx_ms": int64(123)}, + map[string]any{"ready": true, "established": true, "peer_sid": "cm5-y", "local_sid": "mcu-a"}, true, )) - settled := time.After(150 * time.Millisecond) + _ = waitForFact[SoftwareFact](t, swSub, nil) + + publisher.Publish(publisher.NewMessage( + bus.T("state", "fabric", "link", "mcu-uart0"), + map[string]any{"ready": true, "established": true, "peer_sid": "cm5-y", "local_sid": "mcu-b"}, + true, + )) + _ = waitForFact[SoftwareFact](t, swSub, nil) +} + +func TestCriticalFactsRepublishOnUpdaterStateChange(t *testing.T) { + b := bus.NewBus(32, "+", "#") + conn := b.NewConnection("updater") + observer := b.NewConnection("observer") + swSub := observer.Subscribe(TopicSoftwareFact) + defer observer.Unsubscribe(swSub) + upSub := observer.Subscribe(TopicUpdaterFact) + defer observer.Unsubscribe(upSub) + hSub := observer.Subscribe(TopicHealthFact) + defer observer.Unsubscribe(hSub) + + svc, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + _, _, _ = waitForCriticalFactSet(t, swSub, upSub, hSub) + + svc.transitionTo(StateReady, "", "") + _, up, h := waitForCriticalFactSet(t, swSub, upSub, hSub) + if up.State != StateReady { + t.Fatalf("updater state = %q, want %q", up.State, StateReady) + } + if h.State != "ok" { + t.Fatalf("health state = %q, want ok", h.State) + } +} + +func TestCriticalRepublishCadenceWhileReady(t *testing.T) { + b := bus.NewBus(64, "+", "#") + conn := b.NewConnection("updater") + observer := b.NewConnection("observer") + swSub := observer.Subscribe(TopicSoftwareFact) + 
defer observer.Unsubscribe(swSub) + upSub := observer.Subscribe(TopicUpdaterFact) + defer observer.Unsubscribe(upSub) + hSub := observer.Subscribe(TopicHealthFact) + defer observer.Unsubscribe(hSub) + + _, cancel := runService(t, b, Options{ + Conn: conn, + CriticalRepublish: testCriticalRepublishConfig(), + }) + defer cancel() + _, _, _ = waitForCriticalFactSet(t, swSub, upSub, hSub) + + publisher := b.NewConnection("fabric-test") + publishTestLinkState(publisher, true, "cm5-a", "mcu-a") + + // Ready edge publishes immediately, then the level-triggered cadence + // keeps publishing the three critical facts while the link remains ready. + _, _, _ = waitForCriticalFactSet(t, swSub, upSub, hSub) + _, _, _ = waitForCriticalFactSet(t, swSub, upSub, hSub) + _, _, _ = waitForCriticalFactSet(t, swSub, upSub, hSub) +} + +func TestCriticalRepublishCadenceStopsWhenLinkNotReady(t *testing.T) { + b := bus.NewBus(32, "+", "#") + conn := b.NewConnection("updater") + observer := b.NewConnection("observer") + swSub := observer.Subscribe(TopicSoftwareFact) + defer observer.Unsubscribe(swSub) + + _, cancel := runService(t, b, Options{ + Conn: conn, + CriticalRepublish: CriticalRepublishConfig{ + BurstInterval: 100 * time.Millisecond, + BurstDuration: 200 * time.Millisecond, + SteadyInterval: 100 * time.Millisecond, + }, + }) + defer cancel() + _ = waitForFact[SoftwareFact](t, swSub, nil) + + publisher := b.NewConnection("fabric-test") + publishTestLinkState(publisher, true, "cm5-a", "mcu-a") + _ = waitForFact[SoftwareFact](t, swSub, nil) + + publishTestLinkState(publisher, false, "cm5-a", "mcu-a") + time.Sleep(20 * time.Millisecond) + drainSubscription(swSub) + assertNoSoftwareRepublish(t, swSub, 150*time.Millisecond) +} + +func assertNoSoftwareRepublish(t *testing.T, sub *bus.Subscription, d time.Duration) { + t.Helper() + settled := time.After(d) for { select { - case <-swSub.Channel(): - t.Fatal("unexpected republish on subsequent Ready=true retain") + case <-sub.Channel(): + 
t.Fatal("unexpected software fact republish") case <-settled: return }