From 3252b27895ebaf9d45593ded855d3b0b1948bdff Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 2 Jun 2026 16:55:38 +0000 Subject: [PATCH 1/6] security: enforce signed MCU image staging --- services/reactor/reactor.go | 2 +- services/updater/prestage_tinygo.go | 76 +++++++++++---------- services/updater/receiver.go | 85 +----------------------- services/updater/security.go | 62 +++++++++++++++++ services/updater/updater_test.go | 77 +++++++++++++++++++++ services/updater/verifier.go | 19 +++--- services/updater/verifier_passthrough.go | 50 -------------- 7 files changed, 190 insertions(+), 181 deletions(-) create mode 100644 services/updater/security.go delete mode 100644 services/updater/verifier_passthrough.go diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index 733c160..1cb0b3a 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -492,7 +492,7 @@ func (r *Reactor) Run(ctx context.Context) { identity := firmwareIdentity() updaterSvc := updater.New(updater.Options{ Conn: updaterConn, - Verifier: updater.PassthroughVerifier(identity), + Verifier: updater.SignedImageVerifier(), Applier: updater.ProductionApplier(), Identity: identity, BootBuyRC: r.bootBuyRC, diff --git a/services/updater/prestage_tinygo.go b/services/updater/prestage_tinygo.go index 4cdca0d..793e891 100644 --- a/services/updater/prestage_tinygo.go +++ b/services/updater/prestage_tinygo.go @@ -3,17 +3,19 @@ package updater import ( - "crypto/sha256" - "encoding/hex" "errors" "pico2-a-b/abupdate" + "pico2-a-b/imagev1" ) -// streamedStage tracks a raw transfer that fabric has already streamed into -// the inactive A/B slot. It is the TinyGo bring-up path used before imagev1 -// verification can stream directly from the transfer source. +// streamedStage tracks a signed transfer that verified successfully while +// fabric streamed it. Only the signed payload bytes are written to the inactive +// A/B slot; the container header, manifest and signature are never staged. type streamedStage struct { + Version string + BuildID string + ImageID string Length uint32 PayloadSHA256 string } @@ -21,8 +23,7 @@ type streamedStage struct { var ( streamedStageDesc streamedStage streamedStageOK bool - streamedStageHash = sha256.New() - streamedStageLen uint32 + streamedVerifier *imagev1.StreamVerifier ) func startStreamedStage(size uint32) error { @@ -32,17 +33,12 @@ func startStreamedStage(size uint32) error { sharedUpdater = abupdate.Updater{} sharedUpdaterInit = false - u, err := ensureUpdaterInited() - if err != nil { - return err - } - if rc := u.BeginUpdate(size); rc != 0 { - return errFromRC("begin_update", rc) - } - streamedStageHash.Reset() - streamedStageLen = 0 + _ = size streamedStageDesc = streamedStage{} streamedStageOK = false + streamedVerifier = imagev1.NewStreamVerifier(SignedImagePolicy(), func(payloadLen uint32) (imagev1.PayloadSink, error) { + return newSlotSink(payloadLen) + }) return nil } @@ -50,42 +46,41 @@ func writeStreamedStage(data []byte) error { if len(data) == 0 { return errors.New("empty_chunk") } - u, err := ensureUpdaterInited() - if err != nil { - return err + if streamedVerifier == nil { + return errors.New("streamed_stage_not_started") } - if rc := u.WriteChunk(data); rc != 0 { - return errFromRC("write_chunk", rc) - } - _, _ = streamedStageHash.Write(data) - streamedStageLen += uint32(len(data)) - return nil + _, err := streamedVerifier.Write(data) + return err } func commitStreamedStage() (streamedStage, error) { - u, err := ensureUpdaterInited() + if streamedVerifier == nil { + return streamedStage{}, errors.New("streamed_stage_not_started") + } + res, err := streamedVerifier.Commit() if err != nil { + streamedVerifier = nil return streamedStage{}, err } - if rc := u.FlushFinal(); rc != 0 { - return streamedStage{}, errFromRC("flush_final", rc) - } streamedStageDesc = streamedStage{ - Length: streamedStageLen, - PayloadSHA256: hex.EncodeToString(streamedStageHash.Sum(nil)), + Version: res.Version, + BuildID: res.BuildID, + ImageID: res.ImageID, + Length: res.PayloadLength, + PayloadSHA256: res.PayloadSHA256, } streamedStageOK = true - if written := u.BytesWritten(); written != streamedStageDesc.Length { - streamedStageDesc.Length = written - } + streamedVerifier = nil return streamedStageDesc, nil } func abortStreamedStage() { + if streamedVerifier != nil { + _ = streamedVerifier.Abort() + streamedVerifier = nil + } streamedStageDesc = streamedStage{} streamedStageOK = false - streamedStageLen = 0 - streamedStageHash.Reset() } func consumeStreamedStageResult() (streamedStage, bool) { @@ -95,11 +90,14 @@ func consumeStreamedStageResult() (streamedStage, bool) { out := streamedStageDesc streamedStageDesc = streamedStage{} streamedStageOK = false - streamedStageLen = 0 - streamedStageHash.Reset() return out, true } func discardStreamedStageResult() { - abortStreamedStage() + if streamedVerifier != nil { + _ = streamedVerifier.Abort() + streamedVerifier = nil + } + streamedStageDesc = streamedStage{} + streamedStageOK = false } diff --git a/services/updater/receiver.go b/services/updater/receiver.go index f4fde09..58efb25 100644 --- a/services/updater/receiver.go +++ b/services/updater/receiver.go @@ -52,20 +52,10 @@ func (s *Service) handleStage(msg *bus.Message) { s.reply(msg, StageReply{OK: false, Err: err.Error()}) return } - stageIdentity, _ := identityFromStageMeta(s.identity, payload.Meta) - if staged.Version != "" { - stageIdentity.Version = staged.Version - } - if staged.BuildID != "" { - stageIdentity.Build = staged.BuildID - } - if staged.ImageID != "" { - stageIdentity.ImageID = staged.ImageID - } desc := StagedDescriptor{ - Version: stageIdentity.Version, - BuildID: stageIdentity.Build, - ImageID: stageIdentity.ImageID, + Version: staged.Version, + BuildID: staged.BuildID, + ImageID: staged.ImageID, Length: staged.Length, Slot: 0, PayloadSHA256: staged.PayloadSHA256, @@ -170,72 +160,3 @@ func (s *Service) failLateStage(payload StagePayload, err error) { } s.transitionTo(StateFailed, reason, "") } - -type stageMetadata struct { - Version string `json:"version,omitempty"` - Build string `json:"build,omitempty"` - BuildID string `json:"build_id,omitempty"` - ImageID string `json:"image_id,omitempty"` - ExpectedImageID string `json:"expected_image_id,omitempty"` -} - -type stageMetadataEnvelope struct { - Metadata stageMetadata `json:"metadata,omitempty"` - Meta stageMetadata `json:"meta,omitempty"` - Request struct { - Metadata stageMetadata `json:"metadata,omitempty"` - Meta stageMetadata `json:"meta,omitempty"` - ExpectedImageID string `json:"expected_image_id,omitempty"` - } `json:"request,omitempty"` -} - -func applyStageMetadata(ident *Identity, md stageMetadata) bool { - applied := false - if md.Version != "" { - ident.Version = md.Version - applied = true - } - if md.BuildID != "" { - ident.Build = md.BuildID - applied = true - } else if md.Build != "" { - ident.Build = md.Build - applied = true - } - if md.ImageID != "" { - ident.ImageID = md.ImageID - applied = true - } else if md.ExpectedImageID != "" { - ident.ImageID = md.ExpectedImageID - applied = true - } - return applied -} - -func identityFromStageMeta(defaults Identity, meta any) (Identity, bool) { - ident := defaults - applied := false - md, ok := jsonDecode[stageMetadata](meta) - if ok { - applied = applyStageMetadata(&ident, md) || applied - } - - env, ok := jsonDecode[stageMetadataEnvelope](meta) - if !ok { - return ident, applied - } - applied = applyStageMetadata(&ident, env.Metadata) || applied - applied = applyStageMetadata(&ident, env.Meta) || applied - if env.Request.ExpectedImageID != "" && env.Request.Metadata.ExpectedImageID == "" { - env.Request.Metadata.ExpectedImageID = env.Request.ExpectedImageID - } - if env.Request.ExpectedImageID != "" && env.Request.Meta.ExpectedImageID == "" { - env.Request.Meta.ExpectedImageID = env.Request.ExpectedImageID - } - applied = applyStageMetadata(&ident, env.Request.Metadata) || applied - applied = applyStageMetadata(&ident, env.Request.Meta) || applied - if !applied { - return ident, false - } - return ident, true -} diff --git a/services/updater/security.go b/services/updater/security.go new file mode 100644 index 0000000..3dfe227 --- /dev/null +++ b/services/updater/security.go @@ -0,0 +1,62 @@ +package updater + +import ( + "errors" + "io" + + "pico2-a-b/imagev1" +) + +var ( + SignedImageProductFamily = "bigbox" + SignedImageHardwareProfile = "bb-v1-cm5-2" + SignedImageMCUBoardFamily = "rp2354a" + SignedImageTrustedKeyID = "" + SignedImageTrustedPublicKey = "" +) + +type signedImageVerifier struct{} + +func SignedImageVerifier() Verifier { + return signedImageVerifier{} +} + +func SignedImagePolicy() imagev1.Policy { + var keys []imagev1.TrustedKey + if SignedImageTrustedKeyID != "" && SignedImageTrustedPublicKey != "" { + pub, err := imagev1.ParsePublicKeyHex(SignedImageTrustedPublicKey) + if err == nil { + keys = append(keys, imagev1.TrustedKey{ + KeyID: SignedImageTrustedKeyID, + PublicKey: pub, + }) + } + } + return imagev1.Policy{ + Target: imagev1.Target{ + ProductFamily: SignedImageProductFamily, + HardwareProfile: SignedImageHardwareProfile, + MCUBoardFamily: SignedImageMCUBoardFamily, + }, + Keys: keys, + } +} + +func (signedImageVerifier) Verify(r io.Reader, sink SlotSink) (Manifest, error) { + if sink == nil { + return Manifest{}, errors.New("signed_image: nil sink") + } + res, err := imagev1.Verify(r, SignedImagePolicy(), func(uint32) (imagev1.PayloadSink, error) { + return sink, nil + }) + if err != nil { + return Manifest{}, err + } + return Manifest{ + Version: res.Version, + BuildID: res.BuildID, + ImageID: res.ImageID, + PayloadSHA256: res.PayloadSHA256, + PayloadLength: res.PayloadLength, + }, nil +} diff --git a/services/updater/updater_test.go b/services/updater/updater_test.go index d0bccf2..d3b57c8 100644 --- a/services/updater/updater_test.go +++ b/services/updater/updater_test.go @@ -3,6 +3,7 @@ package updater import ( "bytes" "context" + "crypto/ed25519" "encoding/hex" "encoding/json" "errors" @@ -13,6 +14,7 @@ import ( "time" "devicecode-go/bus" + "pico2-a-b/imagev1" ) // ---- helpers -------------------------------------------------------- @@ -1061,6 +1063,81 @@ func TestStageStubVerifierPublishesFailed(t *testing.T) { } } +func TestStageSignedImageVerifierWritesManifestDescriptor(t *testing.T) { + seed := bytes.Repeat([]byte{0x42}, ed25519.SeedSize) + priv := ed25519.NewKeyFromSeed(seed) + pub := priv.Public().(ed25519.PublicKey) + target := imagev1.Target{ + ProductFamily: "bigbox", + HardwareProfile: "bb-v1-cm5-2", + MCUBoardFamily: "rp2354a", + } + artefact, _, err := imagev1.Pack([]byte("signed payload"), imagev1.PackOptions{ + Target: target, + Version: "13.0", + BuildID: "build-13.0", + ImageID: "mcu-dev-13.0", + KeyID: "test-key", + }, priv) + if err != nil { + t.Fatal(err) + } + + oldProduct := SignedImageProductFamily + oldProfile := SignedImageHardwareProfile + oldBoard := SignedImageMCUBoardFamily + oldKeyID := SignedImageTrustedKeyID + oldKey := SignedImageTrustedPublicKey + defer func() { + SignedImageProductFamily = oldProduct + SignedImageHardwareProfile = oldProfile + SignedImageMCUBoardFamily = oldBoard + SignedImageTrustedKeyID = oldKeyID + SignedImageTrustedPublicKey = oldKey + }() + SignedImageProductFamily = target.ProductFamily + SignedImageHardwareProfile = target.HardwareProfile + SignedImageMCUBoardFamily = target.MCUBoardFamily + SignedImageTrustedKeyID = "test-key" + SignedImageTrustedPublicKey = hex.EncodeToString(pub) + + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + memMD := NewMemoryMetadata() + svc, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: SignedImageVerifier(), + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancel() + + req := caller.NewMessage(TopicStageRPC, preparedStagePayload(t, caller, svc, "signed-xfer", artefact), false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + select { + case msg := <-replySub.Channel(): + reply, _ := msg.Payload.(StageReply) + if !reply.OK { + t.Fatalf("stage reply not ok: %+v", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for stage reply") + } + + desc, ok := memMD.StagedDescriptor() + if !ok { + t.Fatal("staged descriptor not persisted") + } + if desc.Version != "13.0" || desc.BuildID != "build-13.0" || desc.ImageID != "mcu-dev-13.0" { + t.Fatalf("descriptor wrong: %+v", desc) + } + if desc.Length != uint32(len("signed payload")) || len(desc.PayloadSHA256) != 64 { + t.Fatalf("descriptor payload metadata wrong: %+v", desc) + } +} + func TestStageFakeAcceptWritesStagedDescriptor(t *testing.T) { // On verifier success, staging writes the manifest fields to the // metadata writer. A subsequent commit RPC reads the descriptor back diff --git a/services/updater/verifier.go b/services/updater/verifier.go index 4cfac9f..bf376dd 100644 --- a/services/updater/verifier.go +++ b/services/updater/verifier.go @@ -5,10 +5,10 @@ import ( "io" ) -// Manifest is the small subset of image metadata that updater staging needs -// after verification succeeds. fabric-update fills it from the bring-up -// passthrough verifier; fabric-security fills the same interface from -// pico2-a-b/imagev1. +// Manifest is the small subset of the signed-image manifest that updater +// staging needs after verification succeeds. The full canonical manifest lives +// in pico2-a-b/imagev1; this type is the local interface carried across the +// staging -> updater -> state/self/updater pipeline. type Manifest struct { Version string BuildID string @@ -31,17 +31,18 @@ type SlotSink interface { Abort() error } -// Verifier is updater/main staging's verification hook. Tests may pass fakes, -// production wiring supplies an explicit verifier, and nil Options.Verifier -// falls back to the rejecting StubVerifier. +// Verifier is updater/main staging's hook into signed-image verification. +// Production wiring uses SignedImageVerifier; tests may pass fakes, and nil +// Options.Verifier falls back to the rejecting StubVerifier. type Verifier interface { // Verify reads the artefact bytes from r, validates the signed // envelope (header + manifest + signature), and on success streams // the verified payload into sink. Returns the trusted manifest the // staging path propagates to the staged descriptor and software fact. // - // On failure: sink.Abort is called by the verifier itself before - // returning so staging doesn't have to special-case it. + // On success, Verify owns sink.Commit before it returns the manifest. On + // failure, Verify owns sink.Abort before returning so staging does not + // have a second sink finalization path. Verify(r io.Reader, sink SlotSink) (Manifest, error) } diff --git a/services/updater/verifier_passthrough.go b/services/updater/verifier_passthrough.go deleted file mode 100644 index e1e6f91..0000000 --- a/services/updater/verifier_passthrough.go +++ /dev/null @@ -1,50 +0,0 @@ -package updater - -import ( - "crypto/sha256" - "encoding/hex" - "errors" - "io" -) - -// passthroughVerifier accepts any artefact, streams its bytes straight -// into sink while computing SHA-256, and returns a synthetic manifest -// with the artefact length + computed hash. Intended for the bringup -// stack on this branch where the signed-image v1 envelope (header + -// canonical manifest + Ed25519 signature) is not yet implemented. -// -// Replace with a real verifier when fabric-security lands; this exists -// so fw-update-e2e can drive the staging → applier → reboot path -// end-to-end without the signed-image scaffolding in place. -type passthroughVerifier struct { - identity Identity -} - -// PassthroughVerifier returns a Verifier that accepts any artefact and -// fills the manifest with identity (caller-supplied), the artefact -// length, and the SHA-256 of the streamed payload. Reboot-time apply -// is gated by the Applier; a passthrough verifier without a real -// applier still ends with state=failed(apply_unavailable) at commit. -func PassthroughVerifier(identity Identity) Verifier { - return passthroughVerifier{identity: identity} -} - -func (v passthroughVerifier) Verify(r io.Reader, sink SlotSink) (Manifest, error) { - if sink == nil { - return Manifest{}, errors.New("passthrough_verifier: nil sink") - } - hasher := sha256.New() - mw := io.MultiWriter(sink, hasher) - n, err := io.Copy(mw, r) - if err != nil { - _ = sink.Abort() - return Manifest{}, err - } - return Manifest{ - Version: v.identity.Version, - BuildID: v.identity.Build, - ImageID: v.identity.ImageID, - PayloadSHA256: hex.EncodeToString(hasher.Sum(nil)), - PayloadLength: uint32(n), - }, nil -} From a5af706a1c39615726987a949b6b68174b4a458e Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 2 Jun 2026 16:56:08 +0000 Subject: [PATCH 2/6] diagnostics: retain ota investigation breadcrumbs --- services/fabric/session.go | 24 +++ services/fabric/transfer.go | 234 ++++++++++++++++++--- services/fabric/transfer_test.go | 221 +++++++++++++++++++- services/hal/devices/ltc4015/builder.go | 1 + services/hal/devices/ltc4015/device.go | 19 ++ services/otadiag/otadiag.go | 253 +++++++++++++++++++++++ services/telemetry/alerts.go | 9 +- services/telemetry/telemetry.go | 113 +++++++--- services/telemetry/telemetry_test.go | 91 +++++++- services/updater/abupdate_diag_host.go | 27 +++ services/updater/abupdate_diag_tinygo.go | 57 +++++ services/updater/facts.go | 27 +++ services/updater/prestage_host.go | 15 +- services/updater/prestage_tinygo.go | 53 ++++- services/updater/rpc.go | 35 +++- services/updater/sink_tinygo.go | 14 ++ services/updater/stream_lease.go | 99 ++++++++- services/updater/updater.go | 19 ++ services/updater/updater_test.go | 139 +++++++++++++ types/capabilities.go | 21 +- types/power.go | 9 + 21 files changed, 1374 insertions(+), 106 deletions(-) create mode 100644 services/otadiag/otadiag.go create mode 100644 services/updater/abupdate_diag_host.go create mode 100644 services/updater/abupdate_diag_tinygo.go diff --git a/services/fabric/session.go b/services/fabric/session.go index a75bb08..0e04805 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -10,6 +10,7 @@ import ( "time" "devicecode-go/bus" + "devicecode-go/services/otadiag" "devicecode-go/types" "devicecode-go/x/strconvx" ) @@ -478,6 +479,10 @@ func (s *session) dispatch(line []byte) { case msgReply: typedDispatch(s, t, line, s.onReply) case msgXferBegin: + otadiag.Event( + "[fabric-xfer]", "begin_route_start", protoXferID(line), + otadiag.KV("line_len", len(line)), + ) typedDispatch(s, t, line, s.onTransferBegin) case msgXferChunk: typedDispatch(s, t, line, s.onTransferChunk) @@ -497,6 +502,14 @@ func typedDispatch[T any](s *session, msgType string, line []byte, handler func( dec := json.NewDecoder(bytes.NewReader(line)) dec.DisallowUnknownFields() if err := dec.Decode(&msg); err != nil { + if msgType == msgXferBegin { + otadiag.Event( + "[fabric-xfer]", "begin_decode_error", protoXferID(line), + otadiag.KV("err", err.Error()), + otadiag.KV("line_len", len(line)), + ) + otadiag.StopUpdateWindow("begin_decode_error") + } s.logMalformed(line, err) s.retryMalformedTransferFrame(msgType, line) return @@ -506,11 +519,22 @@ func typedDispatch[T any](s *session, msgType string, line []byte, handler func( if err == nil { err = errors.New("trailing_json") } + if msgType == msgXferBegin { + otadiag.Event( + "[fabric-xfer]", "begin_decode_error", protoXferID(line), + otadiag.KV("err", err.Error()), + otadiag.KV("line_len", len(line)), + ) + otadiag.StopUpdateWindow("begin_decode_error") + } s.logMalformed(line, err) s.retryMalformedTransferFrame(msgType, line) return } handler(&msg) + if msgType == msgXferBegin { + otadiag.Event("[fabric-xfer]", "begin_route_done", protoXferID(line)) + } } func (s *session) retryMalformedTransferFrame(msgType string, line []byte) { diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index a9001f1..f57ba2b 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -7,6 +7,7 @@ import ( "strings" "time" + "devicecode-go/services/otadiag" "devicecode-go/services/updater" "devicecode-go/x/strconvx" "devicecode-go/x/xxhash" @@ -16,6 +17,7 @@ const transferTargetUpdaterMain = "updater/main" const transferIdleRetryLimit = 3 const transferCorruptRetryLimit = 3 const completedTransferCacheLimit = 4 +const transferMemSampleStride = 64 * 1024 // transferMeta captures xfer_begin contents. The transfer target is explicit // on the wire; firmware update uses target="updater/main". meta remains opaque @@ -161,6 +163,8 @@ func (s *session) abortTransfer(reason string) { if cur == nil { return } + otadiag.Event("[fabric-xfer]", "abort_local", cur.meta.ID, otadiag.KV("reason", reason)) + otadiag.StopUpdateWindow(reason) if err := cur.sink.Abort(reason); err != nil { s.logKV("transfer abort failed", "err", err.Error()) } @@ -188,7 +192,8 @@ func (s *session) checkTransferTimeout(now time.Time) { } id := cur.meta.ID s.abortTransfer("timeout") - s.sendTransferAbort(id, "timeout") + abortOK := s.sendTransferAbort(id, "timeout") + otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", "timeout"), otadiag.KV("ok", abortOK)) } func (s *session) retryCorruptTransferFrame(reason string) bool { @@ -203,11 +208,19 @@ func (s *session) retryCorruptTransferFrame(reason string) bool { if cur.corruptRetriesAtOffset >= transferCorruptRetryLimit { id := cur.meta.ID s.abortTransfer(reason) - s.sendTransferAbort(id, reason) + abortOK := s.sendTransferAbort(id, reason) + otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) return false } cur.corruptRetriesAtOffset++ - s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) + needOK := s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) + otadiag.Event( + "[fabric-xfer]", "need_tx", cur.meta.ID, + otadiag.KV("next", cur.bytesWritten), + otadiag.KV("ok", needOK), + otadiag.KV("retry", true), + otadiag.KV("reason", reason), + ) cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) return true } @@ -271,22 +284,50 @@ func validateTransferBegin(msg *protoXferBegin) (transferMeta, string) { func (s *session) onTransferBegin(msg *protoXferBegin) { s.extendTransferQuiet("xfer_begin_rx", transferPrepareQuiet) + otadiag.SetActiveXfer(msg.XferID) + otadiag.Event( + "[fabric-xfer]", "begin_rx", msg.XferID, + otadiag.KV("target", msg.Target), + otadiag.KV("size", msg.Size), + otadiag.KV("digest_alg", msg.DigestAlg), + otadiag.KV("digest", msg.Digest), + otadiag.KV("meta_len", len(msg.Meta)), + ) meta, errStr := validateTransferBegin(msg) if errStr != "" { if msg.XferID != "" { - s.sendTransferAbort(msg.XferID, "bad_message: "+errStr) + abortOK := s.sendTransferAbort(msg.XferID, "bad_message: "+errStr) + otadiag.Event( + "[fabric-xfer]", "begin_reject", msg.XferID, + otadiag.KV("reason", "bad_message:"+errStr), + otadiag.KV("abort_tx", abortOK), + ) + } else { + otadiag.Event( + "[fabric-xfer]", "begin_reject", msg.XferID, + otadiag.KV("reason", "bad_message:"+errStr), + otadiag.KV("abort_tx", false), + ) } + otadiag.StopUpdateWindow("begin_reject") s.logKV("xfer_begin dropped", "err", errStr) return } + otadiag.Event( + "[fabric-xfer]", "begin_validate_ok", meta.ID, + otadiag.KV("target", meta.Target), + ) s.markRx() now := time.Now() if s.incomingTransfer != nil { cur := s.incomingTransfer if sameTransferTuple(cur.meta, meta) { s.logKV("xfer_begin duplicate", "id", meta.ID) - if s.sendTransferReady(meta.ID) { - s.sendTransferNeed(meta.ID, cur.bytesWritten) + readyOK := s.sendTransferReady(meta.ID) + otadiag.Event("[fabric-xfer]", "ready_tx", meta.ID, otadiag.KV("ok", readyOK), otadiag.KV("duplicate", true)) + if readyOK { + needOK := s.sendTransferNeed(meta.ID, cur.bytesWritten) + otadiag.Event("[fabric-xfer]", "need_tx", meta.ID, otadiag.KV("next", cur.bytesWritten), otadiag.KV("ok", needOK), otadiag.KV("duplicate", true)) } cur.deadline = now.Add(s.cfg.PhaseTimeout) return @@ -295,34 +336,67 @@ func (s *session) onTransferBegin(msg *protoXferBegin) { if cur.meta.ID == meta.ID { reason = "conflicting_transfer" } - s.sendTransferAbort(meta.ID, reason) + abortOK := s.sendTransferAbort(meta.ID, reason) + otadiag.Event( + "[fabric-xfer]", "begin_reject", meta.ID, + otadiag.KV("reason", reason), + otadiag.KV("active_xfer", cur.meta.ID), + otadiag.KV("abort_tx", abortOK), + ) + otadiag.StopUpdateWindow("begin_reject") return } if done, ok := s.completedTransferFor(meta.ID); ok { if sameTransferTuple(done, meta) { - s.sendTransferDone(meta.ID) + doneOK := s.sendTransferDone(meta.ID) + otadiag.Event("[fabric-xfer]", "begin_duplicate_done", meta.ID, otadiag.KV("done_tx", doneOK)) return } - s.sendTransferAbort(meta.ID, "conflicting_transfer") + abortOK := s.sendTransferAbort(meta.ID, "conflicting_transfer") + otadiag.Event("[fabric-xfer]", "begin_reject", meta.ID, otadiag.KV("reason", "conflicting_transfer"), otadiag.KV("abort_tx", abortOK)) + otadiag.StopUpdateWindow("begin_reject") return } beginFn := s.beginTransfer if beginFn == nil { beginFn = beginTransfer } + beginStart := time.Now() + otadiag.Event( + "[fabric-xfer]", "begin_transfer_start", meta.ID, + otadiag.KV("target", meta.Target), + otadiag.KV("size", meta.Size), + ) sink, err := beginFn(meta) if err != nil { - s.sendTransferAbort(meta.ID, err.Error()) + durMS := int(time.Since(beginStart) / time.Millisecond) + abortOK := s.sendTransferAbort(meta.ID, err.Error()) + otadiag.Event( + "[fabric-xfer]", "begin_transfer_error", meta.ID, + otadiag.KV("err", err.Error()), + otadiag.KV("dur_ms", durMS), + otadiag.KV("abort_tx", abortOK), + ) + otadiag.StopUpdateWindow("begin_transfer_error") return } + otadiag.Event( + "[fabric-xfer]", "begin_transfer_done", meta.ID, + otadiag.KV("dur_ms", int(time.Since(beginStart)/time.Millisecond)), + ) s.incomingTransfer = &incomingTransfer{ meta: meta, sink: sink, hasher: xxhash.New(0), deadline: now.Add(s.cfg.PhaseTimeout), } - if s.sendTransferReady(meta.ID) { - s.sendTransferNeed(meta.ID, 0) + readyOK := s.sendTransferReady(meta.ID) + otadiag.Event("[fabric-xfer]", "ready_tx", meta.ID, otadiag.KV("ok", readyOK)) + if readyOK { + needOK := s.sendTransferNeed(meta.ID, 0) + otadiag.Event("[fabric-xfer]", "need_tx", meta.ID, otadiag.KV("next", 0), otadiag.KV("ok", needOK)) + } else { + otadiag.Event("[fabric-xfer]", "need_tx", meta.ID, otadiag.KV("next", 0), otadiag.KV("ok", false), otadiag.KV("skipped", "ready_failed")) } } @@ -333,32 +407,71 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { return } id := cur.meta.ID + otadiag.Event( + "[fabric-xfer]", "chunk_rx", id, + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("expected", u32s(cur.bytesWritten)), + otadiag.KV("encoded_len", strconvx.Itoa(len(msg.Data))), + ) if msg.Offset < cur.bytesWritten { s.markRx() - s.sendTransferNeed(id, cur.bytesWritten) + needOK := s.sendTransferNeed(id, cur.bytesWritten) + otadiag.Event( + "[fabric-xfer]", "chunk_stale_offset", id, + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("expected", u32s(cur.bytesWritten)), + otadiag.KV("need_tx", needOK), + ) cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) return } if msg.Offset > cur.bytesWritten { s.markRx() - s.sendTransferNeed(id, cur.bytesWritten) + needOK := s.sendTransferNeed(id, cur.bytesWritten) + otadiag.Event( + "[fabric-xfer]", "chunk_future_offset", id, + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("expected", u32s(cur.bytesWritten)), + otadiag.KV("need_tx", needOK), + ) return } + decodeStart := time.Now() raw, errStr := decodeChunkData(msg.Data) if errStr != "" { + otadiag.Event( + "[fabric-xfer]", "chunk_decode_done", id, + otadiag.KV("ok", false), + otadiag.KV("reason", errStr), + otadiag.KV("dur_ms", int(time.Since(decodeStart)/time.Millisecond)), + ) s.logKV("xfer_chunk decode retry", "err", errStr) s.retryCorruptTransferFrame(errStr) return } + otadiag.Event( + "[fabric-xfer]", "chunk_decode_done", id, + otadiag.KV("ok", true), + otadiag.KV("raw_len", strconvx.Itoa(len(raw))), + otadiag.KV("dur_ms", int(time.Since(decodeStart)/time.Millisecond)), + ) if len(raw) == 0 { s.abortTransfer("empty_chunk") - s.sendTransferAbort(id, "empty_chunk") + abortOK := s.sendTransferAbort(id, "empty_chunk") + otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", "empty_chunk"), otadiag.KV("ok", abortOK)) return } if cur.bytesWritten+uint32(len(raw)) > cur.meta.Size { reason := "size_too_large" + otadiag.Event( + "[fabric-xfer]", "chunk_size_overflow", id, + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("raw_len", strconvx.Itoa(len(raw))), + otadiag.KV("size", u32s(cur.meta.Size)), + ) s.abortTransfer(reason) - s.sendTransferAbort(id, reason) + abortOK := s.sendTransferAbort(id, reason) + otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) return } // Per-chunk integrity is required by the current MCU contract. @@ -366,22 +479,56 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { // base64url data string: the bytes still decode, just to the wrong // values. On mismatch we ask the sender to resume at the current // byte offset instead of clearing the transfer. + digestStart := time.Now() want, ok := canonicalXXHash32Hex(msg.ChunkDigest) if !ok { + otadiag.Event( + "[fabric-xfer]", "chunk_digest_done", id, + otadiag.KV("ok", false), + otadiag.KV("reason", "bad_message"), + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("digest_len", strconvx.Itoa(len(msg.ChunkDigest))), + otadiag.KV("data_len", strconvx.Itoa(len(msg.Data))), + otadiag.KV("dur_ms", int(time.Since(digestStart)/time.Millisecond)), + ) s.retryCorruptTransferFrame("bad_message") return } got := xxhashHex(xxhash.Sum32(raw, 0)) if got != want { + otadiag.Event( + "[fabric-xfer]", "chunk_digest_done", id, + otadiag.KV("ok", false), + otadiag.KV("reason", "chunk_digest_mismatch"), + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("dur_ms", int(time.Since(digestStart)/time.Millisecond)), + ) s.retryCorruptTransferFrame("chunk_digest_mismatch") return } + otadiag.Event( + "[fabric-xfer]", "chunk_digest_done", id, + otadiag.KV("ok", true), + otadiag.KV("dur_ms", int(time.Since(digestStart)/time.Millisecond)), + ) s.markRx() + writeStart := time.Now() + otadiag.Event( + "[fabric-xfer]", "sink_write_start", id, + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("raw_len", strconvx.Itoa(len(raw))), + ) if err := cur.sink.WriteChunk(msg.Offset, raw); err != nil { reason := err.Error() + otadiag.Event( + "[fabric-xfer]", "sink_write_error", id, + otadiag.KV("reason", reason), + otadiag.KV("dur_ms", int(time.Since(writeStart)/time.Millisecond)), + ) s.logKV("transfer write failed", "err", reason) s.abortTransfer(reason) - s.sendTransferAbort(id, reason) + abortOK := s.sendTransferAbort(id, reason) + otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) return } _, _ = cur.hasher.Write(raw) @@ -391,12 +538,40 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { cur.corruptRetryOffset = cur.bytesWritten cur.corruptRetriesAtOffset = 0 cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) + otadiag.Event( + "[fabric-xfer]", "sink_write_done", id, + otadiag.KV("dur_ms", int(time.Since(writeStart)/time.Millisecond)), + otadiag.KV("next", u32s(cur.bytesWritten)), + ) raw = nil // Keep transfer memory bounded on TinyGo. The receiver allocates while // unmarshalling JSON and decoding base64 chunks; without regular collection // long updates can run out of heap before commit. + gcStart := time.Now() + otadiag.Event("[fabric-xfer]", "gc_start", id, otadiag.KV("next", u32s(cur.bytesWritten))) runtime.GC() - s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) + otadiag.Event( + "[fabric-xfer]", "gc_done", id, + otadiag.KV("dur_ms", int(time.Since(gcStart)/time.Millisecond)), + otadiag.KV("next", u32s(cur.bytesWritten)), + ) + needOK := s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) + otadiag.Event( + "[fabric-xfer]", "need_tx", cur.meta.ID, + otadiag.KV("next", cur.bytesWritten), + otadiag.KV("ok", needOK), + otadiag.KV("accepted", true), + ) + if cur.bytesWritten != 0 && cur.bytesWritten%transferMemSampleStride == 0 { + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + otadiag.Event( + "[fabric-xfer]", "transfer_mem_sample", cur.meta.ID, + otadiag.KV("next", cur.bytesWritten), + otadiag.KV("alloc", ms.Alloc), + otadiag.KV("heap", ms.HeapSys), + ) + } } func (s *session) onTransferCommit(msg *protoXferCommit) { @@ -409,24 +584,28 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { if msg.Size != cur.meta.Size || cur.bytesWritten != cur.meta.Size { reason := "short_transfer" s.abortTransfer(reason) - s.sendTransferAbort(id, reason) + abortOK := s.sendTransferAbort(id, reason) + otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) return } if msg.DigestAlg != digestAlg { s.abortTransfer("unsupported_digest_alg") - s.sendTransferAbort(id, "unsupported_digest_alg") + abortOK := s.sendTransferAbort(id, "unsupported_digest_alg") + otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", "unsupported_digest_alg"), otadiag.KV("ok", abortOK)) return } commitDigest, ok := canonicalXXHash32Hex(msg.Digest) if !ok { s.abortTransfer("invalid_digest") - s.sendTransferAbort(id, "invalid_digest") + abortOK := s.sendTransferAbort(id, "invalid_digest") + otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", "invalid_digest"), otadiag.KV("ok", abortOK)) return } streamedHex := xxhashHex(cur.hasher.Sum32()) if commitDigest != cur.meta.Digest || streamedHex != cur.meta.Digest { s.abortTransfer("digest_mismatch") - s.sendTransferAbort(id, "digest_mismatch") + abortOK := s.sendTransferAbort(id, "digest_mismatch") + otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", "digest_mismatch"), otadiag.KV("ok", abortOK)) return } s.markRx() @@ -435,7 +614,8 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { s.logKV("transfer commit failed", "err", err.Error()) reason := err.Error() s.abortTransfer(reason) - s.sendTransferAbort(id, reason) + abortOK := s.sendTransferAbort(id, reason) + otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) return } sink := cur.sink @@ -447,12 +627,15 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { ok, reason := s.invokeTransferTarget(meta, id, info, bytesPayload) if !ok { s.extendTransferQuiet("xfer_target_rejected", transferCompleteQuiet) - s.sendTransferAbort(id, reason) + abortOK := s.sendTransferAbort(id, reason) + otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) return } s.extendTransferQuiet("xfer_done", transferCompleteQuiet) s.recordCompletedTransfer(meta) - s.sendTransferDone(id) + doneOK := s.sendTransferDone(id) + otadiag.Event("[fabric-xfer]", "done_tx", id, otadiag.KV("ok", doneOK)) + otadiag.StopUpdateWindow("transfer_done") } var targetCallTimeout = 5 * time.Second @@ -564,6 +747,7 @@ func (s *session) onTransferAbort(msg *protoXferAbort) { } s.markRx() s.abortTransfer(reason) + otadiag.StopUpdateWindow("remote_abort") } // xxhashHex formats a uint32 xxHash32 digest as 8 lower-case hex characters, diff --git a/services/fabric/transfer_test.go b/services/fabric/transfer_test.go index 3eae5f7..edf4fa9 100644 --- a/services/fabric/transfer_test.go +++ b/services/fabric/transfer_test.go @@ -7,10 +7,12 @@ import ( "errors" "io" "strings" + "sync" "testing" "time" "devicecode-go/bus" + "devicecode-go/services/otadiag" "devicecode-go/services/updater" "devicecode-go/x/xxhash" ) @@ -58,6 +60,140 @@ func (s *fakeTransferSink) Abort(reason string) error { // of the transferred bytes — it tracks per-chunk writes instead. func (s *fakeTransferSink) Bytes() []byte { return nil } +type diagCapture struct { + mu sync.Mutex + lines []string +} + +func captureOTADiag(t *testing.T) *diagCapture { + t.Helper() + c := &diagCapture{} + restore := otadiag.SetSinkForTest(func(line string) { + c.mu.Lock() + c.lines = append(c.lines, line) + c.mu.Unlock() + }) + t.Cleanup(restore) + return c +} + +func (c *diagCapture) snapshot() []string { + c.mu.Lock() + defer c.mu.Unlock() + return append([]string(nil), c.lines...) +} + +func assertDiagContains(t *testing.T, lines []string, want ...string) { + t.Helper() + for _, line := range lines { + matched := true + for _, part := range want { + if !strings.Contains(line, part) { + matched = false + break + } + } + if matched { + return + } + } + t.Fatalf("diagnostics missing %v in:\n%s", want, strings.Join(lines, "\n")) +} + +func waitDiagContains(t *testing.T, c *diagCapture, want ...string) { + t.Helper() + deadline := time.Now().Add(time.Second) + for { + lines := c.snapshot() + for _, line := range lines { + matched := true + for _, part := range want { + if !strings.Contains(line, part) { + matched = false + break + } + } + if matched { + return + } + } + if time.Now().After(deadline) { + assertDiagContains(t, lines, want...) + } + time.Sleep(10 * time.Millisecond) + } +} + +func assertDiagNotContains(t *testing.T, lines []string, want ...string) { + t.Helper() + for _, line := range lines { + matched := true + for _, part := range want { + if !strings.Contains(line, part) { + matched = false + break + } + } + if matched { + t.Fatalf("diagnostics unexpectedly contained %v in:\n%s", want, strings.Join(lines, "\n")) + } + } +} + +func assertDiagOrder(t *testing.T, lines []string, wants ...[]string) { + t.Helper() + next := 0 + for _, line := range lines { + if next >= len(wants) { + return + } + matched := true + for _, part := range wants[next] { + if !strings.Contains(line, part) { + matched = false + break + } + } + if matched { + next++ + } + } + if next < len(wants) { + t.Fatalf("diagnostics missing ordered item %d %v in:\n%s", next, wants[next], strings.Join(lines, "\n")) + } +} + +func waitDiagOrder(t *testing.T, c *diagCapture, wants ...[]string) { + t.Helper() + deadline := time.Now().Add(time.Second) + for { + lines := c.snapshot() + next := 0 + for _, line := range lines { + if next >= len(wants) { + return + } + matched := true + for _, part := range wants[next] { + if !strings.Contains(line, part) { + matched = false + break + } + } + if matched { + next++ + } + } + if next >= len(wants) { + return + } + if time.Now().After(deadline) { + assertDiagOrder(t, lines, wants...) + } + time.Sleep(10 * time.Millisecond) + } +} + type blockingVerifier struct { entered chan struct{} release chan struct{} @@ -278,6 +414,7 @@ func writeRawLine(t *testing.T, tr Transport, line string) { } func TestTransferBeginWithoutPrepareAbortsNoReady(t *testing.T) { + diag := captureOTADiag(t) b := newBus() cancelUpdater, _ := runUpdaterForFabricTest(t, b, updater.Options{}) defer cancelUpdater() @@ -294,9 +431,12 @@ func TestTransferBeginWithoutPrepareAbortsNoReady(t *testing.T) { if abort.Type != msgXferAbort || abort.XferID != "xfer-no-prepare" || abort.Err != "stage_not_prepared" { t.Fatalf("xfer_begin without prepare frame = %+v, want stage_not_prepared abort", abort) } + waitDiagContains(t, diag, "[fabric-xfer]", "xfer_id xfer-no-prepare", "ev begin_transfer_error", "err stage_not_prepared", "abort_tx true") + waitDiagContains(t, diag, "[updater-stream]", "xfer_id xfer-no-prepare", "ev lease_error", "err stage_not_prepared") } func TestPreparedTransferBeginSendsReadyThenNeedZero(t *testing.T) { + diag := captureOTADiag(t) b := newBus() cancelUpdater, _ := runUpdaterForFabricTest(t, b, updater.Options{}) defer cancelUpdater() @@ -318,9 +458,21 @@ func TestPreparedTransferBeginSendsReadyThenNeedZero(t *testing.T) { waitUpdaterFactForFabricTest(t, upSub, func(f updater.UpdaterFact) bool { return f.State == updater.StateReceiving }) + waitDiagOrder(t, diag, + []string{"[fabric-xfer]", "xfer_id xfer-prepared", "ev begin_route_start"}, + []string{"[fabric-xfer]", "xfer_id xfer-prepared", "ev begin_rx"}, + []string{"[fabric-xfer]", "xfer_id xfer-prepared", "ev begin_validate_ok", "target updater/main"}, + []string{"[fabric-xfer]", "xfer_id xfer-prepared", "ev begin_transfer_start"}, + []string{"[updater-stream]", "xfer_id xfer-prepared", "ev begin_entry"}, + []string{"[updater-stream]", "xfer_id xfer-prepared", "ev begin_exit"}, + []string{"[fabric-xfer]", "xfer_id xfer-prepared", "ev begin_transfer_done"}, + []string{"[fabric-xfer]", "xfer_id xfer-prepared", "ev ready_tx", "ok true"}, + []string{"[fabric-xfer]", "xfer_id xfer-prepared", "ev need_tx", "next 0", "ok true"}, + ) } -func TestInvalidTransferBeginRejectsNoActiveTransfer(t *testing.T) { +func TestInvalidTransferBeginEmitsRejectDiagnosticNoActiveTransfer(t *testing.T) { + diag := captureOTADiag(t) b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -355,6 +507,7 @@ func TestInvalidTransferBeginRejectsNoActiveTransfer(t *testing.T) { if beginCount != 0 { t.Fatalf("beginTransfer called %d times for invalid begin, want 0", beginCount) } + waitDiagContains(t, diag, "[fabric-xfer]", "xfer_id xfer-invalid", "ev begin_reject", "reason bad_message:unsupported_target", "abort_tx true") } func TestTransferAbortCancelsUpdaterLease(t *testing.T) { @@ -548,7 +701,8 @@ func TestTransferReceiveSuccess(t *testing.T) { } } -func TestTransferAcceptedChunkAdvancesNeed(t *testing.T) { +func TestTransferAcceptedChunkEmitsProcessingDiagnostics(t *testing.T) { + diag := captureOTADiag(t) b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -567,9 +721,54 @@ func TestTransferAcceptedChunkAdvancesNeed(t *testing.T) { if need.Next != uint32(len(payload)) { t.Fatalf("xfer_need.next = %d, want %d", need.Next, len(payload)) } + waitDiagOrder(t, diag, + []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev chunk_rx", "offset 0", "expected 0"}, + []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev chunk_decode_done", "ok true", "raw_len 4"}, + []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev chunk_digest_done", "ok true"}, + []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev sink_write_start", "offset 0", "raw_len 4"}, + []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev sink_write_done", "next 4"}, + []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev gc_start", "next 4"}, + []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev gc_done", "next 4"}, + []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev need_tx", "next 4", "ok true", "accepted true"}, + ) + assertDiagNotContains(t, diag.snapshot(), "[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev transfer_mem_sample") +} + +func TestTransferAcceptedChunkEmitsSparseMemorySample(t *testing.T) { + diag := captureOTADiag(t) + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := make([]byte, transferMemSampleStride) + for i := range payload { + payload[i] = byte(i) + } + sendMsg(t, cm5, xferBegin("xfer-mem-diag", payload, nil)) + readTransferReady(t, cm5, "xfer-mem-diag", 0) + + const chunkSize = 2048 + for off := 0; off < len(payload); off += chunkSize { + end := off + chunkSize + if end > len(payload) { + end = len(payload) + } + sendMsg(t, cm5, xferChunk("xfer-mem-diag", uint32(off), payload[off:end])) + need := readMsg[protoXferNeed](t, cm5) + if need.Next != uint32(end) { + t.Fatalf("xfer_need.next = %d, want %d", need.Next, end) + } + } + waitDiagContains(t, diag, "[fabric-xfer]", "xfer_id xfer-mem-diag", "ev transfer_mem_sample", "next 65536", "alloc", "heap") } func TestTransferChunkFutureOffsetRequestsCurrentAndCompletes(t *testing.T) { + diag := captureOTADiag(t) b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -596,6 +795,7 @@ func TestTransferChunkFutureOffsetRequestsCurrentAndCompletes(t *testing.T) { if len(sink.abortReasons) != 0 { t.Fatalf("sink.Abort called on recoverable future offset: %v", sink.abortReasons) } + waitDiagContains(t, diag, "[fabric-xfer]", "xfer_id xfer-future-offset", "ev chunk_future_offset", "offset 7", "expected 0") sendMsg(t, cm5, xferChunk("xfer-future-offset", 0, payload)) need = readMsg[protoXferNeed](t, cm5) @@ -617,6 +817,7 @@ func TestTransferChunkFutureOffsetRequestsCurrentAndCompletes(t *testing.T) { } func TestTransferChunkStaleOffsetRequestsCurrentAndCompletes(t *testing.T) { + diag := captureOTADiag(t) b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -648,6 +849,7 @@ func TestTransferChunkStaleOffsetRequestsCurrentAndCompletes(t *testing.T) { if len(sink.abortReasons) != 0 { t.Fatalf("sink.Abort called on recoverable stale offset: %v", sink.abortReasons) } + waitDiagContains(t, diag, "[fabric-xfer]", "xfer_id xfer-stale-offset", "ev chunk_stale_offset", "offset 0", "expected 3") sendMsg(t, cm5, xferChunk("xfer-stale-offset", 3, []byte("def"))) need = readMsg[protoXferNeed](t, cm5) @@ -782,6 +984,7 @@ func TestTransferChunkInvalidBase64RetriesThenAborts(t *testing.T) { } func TestTransferChunkDigestMismatchRequestsSameOffset(t *testing.T) { + diag := captureOTADiag(t) b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -809,6 +1012,10 @@ func TestTransferChunkDigestMismatchRequestsSameOffset(t *testing.T) { if len(sink.writes) != 0 { t.Fatalf("sink received %d writes before digest passed", len(sink.writes)) } + lines := diag.snapshot() + assertDiagContains(t, lines, "[fabric-xfer]", "xfer_id xfer-bad-chunk-digest", "ev chunk_digest_done", "ok false", "reason chunk_digest_mismatch") + assertDiagNotContains(t, lines, "[fabric-xfer]", "xfer_id xfer-bad-chunk-digest", "ev sink_write_start") + assertDiagNotContains(t, lines, "[fabric-xfer]", "xfer_id xfer-bad-chunk-digest", "ev gc_start") sendMsg(t, cm5, xferChunk("xfer-bad-chunk-digest", 0, payload)) need = readMsg[protoXferNeed](t, cm5) @@ -817,7 +1024,8 @@ func TestTransferChunkDigestMismatchRequestsSameOffset(t *testing.T) { } } -func TestTransferChunkWriteErrorAborts(t *testing.T) { +func TestTransferChunkWriteErrorEmitsAbortDiagnostic(t *testing.T) { + diag := captureOTADiag(t) b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -833,6 +1041,9 @@ func TestTransferChunkWriteErrorAborts(t *testing.T) { sendMsg(t, cm5, xferChunk("xfer-write-error", 0, payload)) readTransferAbort(t, cm5, "xfer-write-error", "write_boom") + + waitDiagContains(t, diag, "[fabric-xfer]", "xfer_id xfer-write-error", "ev sink_write_error", "reason write_boom") + waitDiagContains(t, diag, "[fabric-xfer]", "xfer_id xfer-write-error", "ev abort_tx", "reason write_boom", "ok true") } func TestTransferChunkDigestMismatchRetriesThenAborts(t *testing.T) { @@ -1223,6 +1434,7 @@ func TestTransferTargetInvokedAfterCommit(t *testing.T) { } func TestCompletedTransferDuplicateBeginSameTupleReplaysDone(t *testing.T) { + diag := captureOTADiag(t) b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -1282,9 +1494,11 @@ func TestCompletedTransferDuplicateBeginSameTupleReplaysDone(t *testing.T) { t.Fatalf("duplicate completed begin restaged transfer: %+v", p) default: } + waitDiagContains(t, diag, "[fabric-xfer]", "xfer_id xfer-completed-replay", "ev begin_duplicate_done", "done_tx true") } func TestCompletedTransferDuplicateBeginConflictingTupleAborts(t *testing.T) { + diag := captureOTADiag(t) b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -1325,6 +1539,7 @@ func TestCompletedTransferDuplicateBeginConflictingTupleAborts(t *testing.T) { if beginCount != 1 { t.Fatalf("conflicting completed begin reopened sink: beginCount=%d", beginCount) } + waitDiagContains(t, diag, "[fabric-xfer]", "xfer_id xfer-completed-conflict", "ev begin_reject", "reason conflicting_transfer", "abort_tx true") } func TestTransferTargetRejectAbortsTransfer(t *testing.T) { diff --git a/services/hal/devices/ltc4015/builder.go b/services/hal/devices/ltc4015/builder.go index 3f6e411..1a97e92 100644 --- a/services/hal/devices/ltc4015/builder.go +++ b/services/hal/devices/ltc4015/builder.go @@ -86,6 +86,7 @@ func (builder) Build(ctx context.Context, in core.BuilderInput) (core.Device, er id: in.ID, aBat: core.CapAddr{Domain: domBat, Kind: types.KindBattery, Name: name}, aChg: core.CapAddr{Domain: domChg, Kind: types.KindCharger, Name: name}, + aCfg: core.CapAddr{Domain: domChg, Kind: types.KindChargerConfig, Name: name}, aTmp: core.CapAddr{Domain: domChg, Kind: types.KindTemperature, Name: name}, res: in.Res, diff --git a/services/hal/devices/ltc4015/device.go b/services/hal/devices/ltc4015/device.go index 1a32e6b..2538d89 100644 --- a/services/hal/devices/ltc4015/device.go +++ b/services/hal/devices/ltc4015/device.go @@ -19,6 +19,7 @@ type Device struct { id string aBat core.CapAddr // power/battery/ aChg core.CapAddr // power/charger/ + aCfg core.CapAddr // power/charger_config/ aTmp core.CapAddr // power/charger//temperature res core.Resources @@ -42,6 +43,7 @@ type Device struct { lastVinLo, lastVinHi int32 lastVsysLo, lastVsysHi int32 lastVbatLoCell, lastVbatHiCell int32 + lastBSRHigh uint32 lastNTCHi, lastNTCLo uint16 // Desired alert sources (user intent). Auto re-arming always applies. @@ -90,6 +92,10 @@ func (d *Device) Capabilities() []core.CapabilitySpec { Domain: d.aChg.Domain, Kind: types.KindCharger, Name: d.aChg.Name, Info: types.Info{SchemaVersion: 1, Driver: "ltc4015", Detail: ci}, }, + { + Domain: d.aCfg.Domain, Kind: types.KindChargerConfig, Name: d.aCfg.Name, + Info: types.Info{SchemaVersion: 1, Driver: "ltc4015", Detail: ci}, + }, { Domain: d.aTmp.Domain, Kind: types.KindTemperature, Name: d.aTmp.Name, Info: types.Info{ @@ -119,6 +125,7 @@ func (d *Device) Init(ctx context.Context) error { // Advertise initial state as degraded until the first good sample. _ = d.res.Pub.Emit(core.Event{Addr: d.aBat, Err: "initialising"}) _ = d.res.Pub.Emit(core.Event{Addr: d.aChg, Err: "initialising"}) + _ = d.res.Pub.Emit(core.Event{Addr: d.aCfg, Err: "initialising"}) _ = d.res.Pub.Emit(core.Event{Addr: d.aTmp, Err: "initialising"}) go d.worker(d.ctx) @@ -429,6 +436,7 @@ func (d *Device) worker(ctx context.Context) { d.applyConfigure(c) // After any configure: re-arm (opposite edge) then publish. d.rearm() + d.publishConfig() d.sampleAndPublish() } @@ -537,6 +545,7 @@ func (d *Device) applyConfigure(c types.ChargerConfigure) { if c.BSRHigh_uOhmPerCell != nil { if err := d.dev.SetBSRHigh_uOhmPerCell(*c.BSRHigh_uOhmPerCell); err == nil { d.desiredLimit |= ltc4015.BSRHi + d.lastBSRHigh = *c.BSRHigh_uOhmPerCell } else { d.errChg("set_bsr_high_failed", err) } @@ -770,6 +779,16 @@ func (d *Device) sampleAndPublish() { } } +func (d *Device) publishConfig() { + _ = d.res.Pub.Emit(core.Event{Addr: d.aCfg, Payload: types.ChargerConfigValue{ + Source: "ltc4015-programmed", + VinLo_mV: d.lastVinLo, + VinHi_mV: d.lastVinHi, + BSRHigh_uOhmPerCell: d.lastBSRHigh, + AlertMaskBits: uint16(d.desiredLimit), + }}) +} + // ---- Errors ---- func (d *Device) errBoth(tag string, err error) { diff --git a/services/otadiag/otadiag.go b/services/otadiag/otadiag.go new file mode 100644 index 0000000..2c0e3b5 --- /dev/null +++ b/services/otadiag/otadiag.go @@ -0,0 +1,253 @@ +package otadiag + +import ( + "strconv" + "strings" + "sync" + "sync/atomic" + "time" +) + +const XferNone = "-" + +type Field struct { + Key string + Value string +} + +var ( + startedAt = time.Now() + nextSeq atomic.Uint64 + + sinkMu sync.Mutex + sink func(string) + + windowMu sync.Mutex + windowStop chan struct{} + windowXferID string + updaterSnap StageSnapshot +) + +type StageSnapshot struct { + State string + Generation uint64 + LeaseActive bool + XferID string +} + +func KV(key string, value any) Field { + return Field{Key: key, Value: valueString(value)} +} + +func Event(prefix, event, xferID string, fields ...Field) { + if xferID == "" { + xferID = XferNone + } + seq := nextSeq.Add(1) + uptime := time.Since(startedAt).Milliseconds() + + var b strings.Builder + b.WriteString(prefix) + b.WriteString(" seq ") + b.WriteString(strconv.FormatUint(seq, 10)) + b.WriteString(" uptime_ms ") + b.WriteString(strconv.FormatInt(uptime, 10)) + b.WriteString(" xfer_id ") + b.WriteString(xferID) + b.WriteString(" ev ") + b.WriteString(event) + for _, f := range fields { + if f.Key == "" { + continue + } + b.WriteByte(' ') + b.WriteString(f.Key) + b.WriteByte(' ') + b.WriteString(f.Value) + } + emit(b.String()) +} + +func SetSinkForTest(fn func(string)) func() { + sinkMu.Lock() + prev := sink + sink = fn + sinkMu.Unlock() + return func() { + sinkMu.Lock() + sink = prev + sinkMu.Unlock() + } +} + +func SetUpdaterSnapshot(s StageSnapshot) { + if s.XferID == "" { + s.XferID = XferNone + } + windowMu.Lock() + updaterSnap = s + windowMu.Unlock() +} + +func StartUpdateWindow(reason, xferID string) { + if xferID == "" { + xferID = XferNone + } + windowMu.Lock() + if windowStop != nil { + close(windowStop) + } + stop := make(chan struct{}) + windowStop = stop + windowXferID = xferID + windowMu.Unlock() + + Event("[mcu-ota]", "heartbeat_start", xferID, KV("reason", reason)) + go heartbeatLoop(stop) +} + +func SetActiveXfer(xferID string) { + if xferID == "" { + xferID = XferNone + } + windowMu.Lock() + windowXferID = xferID + if updaterSnap.XferID == "" || updaterSnap.XferID == XferNone { + updaterSnap.XferID = xferID + } + windowMu.Unlock() +} + +func StopUpdateWindow(reason string) { + stopUpdateWindow(nil, reason) +} + +func stopUpdateWindow(expected <-chan struct{}, reason string) { + xferID := XferNone + stopped := false + windowMu.Lock() + if windowXferID != "" { + xferID = windowXferID + } + if windowStop != nil && (expected == nil || expected == windowStop) { + close(windowStop) + windowStop = nil + windowXferID = XferNone + stopped = true + } + windowMu.Unlock() + if stopped { + Event("[mcu-ota]", "heartbeat_stop", xferID, KV("reason", reason)) + } +} + +func WindowActive() bool { + windowMu.Lock() + active := windowStop != nil + windowMu.Unlock() + return active +} + +func heartbeatLoop(stop <-chan struct{}) { + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + deadline := time.NewTimer(45 * time.Second) + defer deadline.Stop() + + for { + select { + case <-stop: + return + case <-deadline.C: + stopUpdateWindow(stop, "heartbeat_deadline") + return + case <-ticker.C: + xferID, snap := heartbeatSnapshot() + Event( + "[mcu-ota]", "heartbeat", xferID, + KV("state", blank(snap.State)), + KV("generation", snap.Generation), + KV("lease_active", snap.LeaseActive), + KV("active_xfer", blank(snap.XferID)), + ) + } + } +} + +func heartbeatSnapshot() (string, StageSnapshot) { + windowMu.Lock() + defer windowMu.Unlock() + xferID := windowXferID + if xferID == "" || xferID == XferNone { + xferID = updaterSnap.XferID + } + if xferID == "" { + xferID = XferNone + } + return xferID, updaterSnap +} + +func emit(line string) { + sinkMu.Lock() + fn := sink + sinkMu.Unlock() + if fn != nil { + fn(line) + return + } + println(line) +} + +func blank(v string) string { + if v == "" { + return XferNone + } + return v +} + +func valueString(v any) string { + switch x := v.(type) { + case string: + if x == "" { + return XferNone + } + return compact(x) + case bool: + if x { + return "true" + } + return "false" + case int: + return strconv.Itoa(x) + case int32: + return strconv.FormatInt(int64(x), 10) + case int64: + return strconv.FormatInt(x, 10) + case uint: + return strconv.FormatUint(uint64(x), 10) + case uint32: + return strconv.FormatUint(uint64(x), 10) + case uint64: + return strconv.FormatUint(x, 10) + case time.Duration: + return strconv.FormatInt(x.Milliseconds(), 10) + default: + return XferNone + } +} + +func compact(s string) string { + var b strings.Builder + for i := 0; i < len(s); i++ { + c := s[i] + if c <= ' ' { + b.WriteByte('_') + continue + } + b.WriteByte(c) + } + if b.Len() == 0 { + return XferNone + } + return b.String() +} diff --git a/services/telemetry/alerts.go b/services/telemetry/alerts.go index ffb4982..aa3d0bd 100644 --- a/services/telemetry/alerts.go +++ b/services/telemetry/alerts.go @@ -2,10 +2,9 @@ package telemetry import "devicecode-go/types" -// chargerAlertFSM implements W8 from docs/firmware-alignment-update.md: -// hold previous bitfield state; on bit-set transition for a kind, emit -// one normal event with the canonical kind name. The 14 canonical kinds -// split into: +// chargerAlertFSM holds previous bitfield state; on bit-set transition +// for a kind, it emits one normal event with the canonical kind name. +// The 14 canonical kinds split into: // - 11 bit-driven kinds (state[] + status[]), compared against the // previous ChargerValue snapshot // - 3 analog kinds (vin_lo / vin_hi / bsr_high), compared against @@ -75,7 +74,7 @@ type AlertEvent struct { // the spec one place to grow if severity refines later. func alertSeverity(k AlertKind) string { switch k { - case AlertBatMissing, AlertBatShort, AlertMaxChargeTimeFault, AlertBsrHigh: + case AlertBatMissing, AlertBatShort, AlertMaxChargeTimeFault, AlertVinLo, AlertVinHi, AlertBsrHigh: return "warning" default: return "info" diff --git a/services/telemetry/telemetry.go b/services/telemetry/telemetry.go index c298d28..3c15b04 100644 --- a/services/telemetry/telemetry.go +++ b/services/telemetry/telemetry.go @@ -1,5 +1,5 @@ -// Package telemetry implements the W7/W8 retained-state and sparse- -// alert publishers from docs/firmware-alignment-update.md. It +// Package telemetry implements the retained-state and sparse-alert +// publishers from ../docs/updating.md. It // subscribes to the existing HAL value topics (hal/cap/env/..., // hal/cap/power/...) and republishes them under the canonical // state/self/* surface using integer engineering units, plus runs the @@ -23,8 +23,7 @@ import ( "devicecode-go/types" ) -// Topic constants. Mirrors the canonical fact schema in -// docs/firmware-alignment-update.md §"Telemetry/state facts". +// Topic constants mirror the canonical fact schema in ../docs/updating.md. var ( TopicBattery = bus.T("state", "self", "power", "battery") TopicCharger = bus.T("state", "self", "power", "charger") @@ -35,7 +34,7 @@ var ( TopicChargerAlert = bus.T("event", "self", "power", "charger", "alert") - // TopicFabricLink mirrors the updater's W10 watcher — telemetry + // TopicFabricLink mirrors the updater's link-ready watcher: telemetry // republishes the charger config retain on every link-ready edge // so the CM5 sees a fresh config fact on every newly established // session, warm or cold. (Per-value retains like @@ -57,9 +56,8 @@ var ( const MemSnapshotInterval = 3 * time.Second // ChargerThresholds carries the analog comparison thresholds used by -// both the state/self/power/charger/config retained fact (W7 finish) -// and the charger alert FSM's analog kinds (W8 finish — vin_lo, -// vin_hi, bsr_high). +// both the state/self/power/charger/config retained fact and the charger +// alert FSM's analog kinds: vin_lo, vin_hi, and bsr_high. // // These ARE the LTC4015 effective config in production; on this // branch they default to conservative bring-up values. @@ -70,11 +68,9 @@ type ChargerThresholds struct { } // ChargerAlertMask is the 14-bool mask matching the 14 canonical alert -// kinds. Pre-fabric-security the mask is informational only — the -// alert FSM ignores it for emission. Once the LTC4015 driver -// programs the chip's alert-enable register from this and reports it -// back, masking can flow through to the FSM. Names here are -// spec-frozen to match docs/firmware-alignment-update.md. +// kinds. The mask is informational until the LTC4015 driver programs the +// chip's alert-enable register from this and reports it back; after that, +// masking can flow through to the FSM. Names here are wire-stable. type ChargerAlertMask struct { VinLo bool `json:"vin_lo"` VinHi bool `json:"vin_hi"` @@ -192,7 +188,7 @@ func (s *Service) Run(ctx context.Context) { memTick := time.NewTicker(MemSnapshotInterval) defer memTick.Stop() - prevReady := map[string]bool{} + linkState := map[string]linkObservation{} for { select { @@ -221,15 +217,15 @@ func (s *Service) Run(ctx context.Context) { if !ok || msg == nil { continue } - linkID, ready := decodeLinkReady(msg) + linkID, obs := decodeLinkReady(msg) if linkID == "" { continue } - was := prevReady[linkID] - if ready && !was { + prev, hadPrev := linkState[linkID] + if linkReadyEdgeReason(prev, obs, hadPrev) != "" { s.publishChargerConfig() } - prevReady[linkID] = ready + linkState[linkID] = obs case <-memTick.C: s.publishRuntimeMem() } @@ -239,37 +235,67 @@ func (s *Service) Run(ctx context.Context) { // decodeLinkReady mirrors services/updater's helper but local to the // telemetry package — kept duplicated rather than reaching into // updater (cleaner package boundary). -func decodeLinkReady(msg *bus.Message) (string, bool) { +type linkObservation struct { + Ready bool + PeerSID string + LocalSID string +} + +func linkReadyEdgeReason(prev, cur linkObservation, hadPrev bool) string { + if !cur.Ready { + return "" + } + if !hadPrev || !prev.Ready { + return "ready_edge" + } + if prev.PeerSID != cur.PeerSID { + return "peer_sid_changed" + } + if prev.LocalSID != cur.LocalSID { + return "local_sid_changed" + } + return "" +} + +func decodeLinkReady(msg *bus.Message) (string, linkObservation) { + var obs linkObservation if msg == nil { - return "", false + return "", obs } t := msg.Topic if t == nil || t.Len() < 4 { - return "", false + return "", obs } id, _ := t.At(t.Len() - 1).(string) if id == "" { - return "", false + return "", obs } switch p := msg.Payload.(type) { case nil: - return id, false + return id, obs case map[string]any: - ready, _ := p["ready"].(bool) - return id, ready + obs.Ready, _ = p["ready"].(bool) + obs.PeerSID, _ = p["peer_sid"].(string) + obs.LocalSID, _ = p["local_sid"].(string) + return id, obs } // Probe via JSON for the typed-struct payload fabric publishes. b, err := json.Marshal(msg.Payload) if err != nil { - return id, false + return id, obs } var probe struct { - Ready bool `json:"ready"` + Ready bool `json:"ready"` + PeerSID string `json:"peer_sid"` + LocalSID string `json:"local_sid"` } if err := json.Unmarshal(b, &probe); err != nil { - return id, false + return id, obs } - return id, probe.Ready + obs.Ready = probe.Ready + obs.PeerSID = probe.PeerSID + obs.LocalSID = probe.LocalSID + return id, obs } // dispatchPower splits the power-domain wildcard into per-kind @@ -285,16 +311,35 @@ func (s *Service) dispatchPower(msg *bus.Message) { case types.ChargerValue: s.publishCharger(v) s.alertFSM.observe(s, v) + case types.ChargerConfigValue: + s.applyChargerConfig(v) } } +func (s *Service) applyChargerConfig(v types.ChargerConfigValue) { + source := v.Source + if source == "" { + source = "ltc4015-programmed" + } + s.chargerCfg = ChargerConfig{ + Source: source, + Thresholds: ChargerThresholds{ + VinLoMV: v.VinLo_mV, + VinHiMV: v.VinHi_mV, + BSRHighUohmPerCell: v.BSRHigh_uOhmPerCell, + }, + AlertMaskBits: v.AlertMaskBits, + } + s.publishChargerConfig() +} + // uptimeMs returns a service-monotonic uptime — close enough to a // boot-uptime for the consumers' purposes (within a few HAL-init ms). func (s *Service) uptimeMs() int64 { return time.Since(s.startedAt).Milliseconds() } -// ---- W7: retained-state publishers --------------------------------- +// ---- retained-state publishers ------------------------------------- // BatteryFact is the retained payload at state/self/power/battery. // All units are integer engineering units per the spec. @@ -325,8 +370,8 @@ func (s *Service) publishBattery(v types.BatteryValue) { // Carries raw bitfields AND 3 decoded boolean objects. // // The canonical key names below come from -// docs/firmware-alignment-update.md §"Telemetry/state facts" — they -// are NOT the existing display names in types.ChargerStateTable etc. +// ../docs/updating.md. They are NOT the existing display names in +// types.ChargerStateTable etc. // (those drop the `_charge` / `_active` / `_fault` suffixes for // log-line brevity). The wire-canonical names are spec-frozen because // the Lua import side keys off them; renaming any of these is a @@ -438,7 +483,7 @@ func (s *Service) publishCharger(v types.ChargerValue) { // operating-state booleans (charger_enabled, ok_to_charge, etc.) — // those live on state/self/power/charger. type ChargerConfigFact struct { - Schema int `json:"schema"` + Schema string `json:"schema"` Source string `json:"source"` Thresholds ChargerThresholds `json:"thresholds"` AlertMaskBits uint16 `json:"alert_mask_bits"` @@ -457,7 +502,7 @@ func (s *Service) publishChargerConfig() { source = "ltc4015-default" } fact := ChargerConfigFact{ - Schema: 1, + Schema: "charger-config/1", Source: source, Thresholds: cfg.Thresholds, AlertMaskBits: cfg.AlertMaskBits, diff --git a/services/telemetry/telemetry_test.go b/services/telemetry/telemetry_test.go index 2c40985..bcacec0 100644 --- a/services/telemetry/telemetry_test.go +++ b/services/telemetry/telemetry_test.go @@ -97,9 +97,7 @@ func TestPublishesChargerWithDecodedBooleans(t *testing.T) { if fact.StateBits != uint16(types.AbsorbCharge|types.CCCVCharge) { t.Fatalf("state_bits = 0x%x", fact.StateBits) } - // Decoded booleans use the canonical names from - // docs/firmware-alignment-update.md §6.2 — these are the - // wire-frozen names the Lua side keys off. + // Decoded booleans use the canonical wire names the Lua side keys off. if !fact.State["absorb_charge"] || !fact.State["cccv_charge"] { t.Fatalf("decoded state booleans wrong: %+v", fact.State) } @@ -246,8 +244,8 @@ func TestChargerAlertFSMEdgeOnly(t *testing.T) { } func TestPublishesChargerConfigAtStartup(t *testing.T) { - // W7 finish: state/self/power/charger/config retains at startup - // with the conservative defaults from DefaultChargerConfig(). + // state/self/power/charger/config retains at startup with the + // conservative defaults from DefaultChargerConfig(). b := newTestBus() observer := b.NewConnection("observer") sub := observer.Subscribe(TopicChargerCfg) @@ -262,7 +260,7 @@ func TestPublishesChargerConfigAtStartup(t *testing.T) { if !ok { t.Fatalf("payload type = %T", msg.Payload) } - if fact.Schema != 1 || fact.Source != "ltc4015-default" { + if fact.Schema != "charger-config/1" || fact.Source != "ltc4015-default" { t.Fatalf("schema/source wrong: %+v", fact) } if fact.Thresholds.VinLoMV == 0 || fact.Thresholds.VinHiMV == 0 || fact.Thresholds.BSRHighUohmPerCell == 0 { @@ -273,10 +271,79 @@ func TestPublishesChargerConfigAtStartup(t *testing.T) { } } +func TestRepublishesChargerConfigOnLinkReadyAndSessionEdges(t *testing.T) { + b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicChargerCfg) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + // Drain startup config retain. + waitForChargerConfig(t, sub) + + publisher := b.NewConnection("test-fabric") + publisher.Publish(publisher.NewMessage( + bus.T("state", "fabric", "link", "mcu-uart0"), + map[string]any{"ready": false, "peer_sid": "cm5-a", "local_sid": "mcu-a"}, + true, + )) + time.Sleep(50 * time.Millisecond) + for len(sub.Channel()) > 0 { + <-sub.Channel() + } + + publisher.Publish(publisher.NewMessage( + bus.T("state", "fabric", "link", "mcu-uart0"), + map[string]any{"ready": true, "peer_sid": "cm5-a", "local_sid": "mcu-a"}, + true, + )) + waitForChargerConfig(t, sub) + + publisher.Publish(publisher.NewMessage( + bus.T("state", "fabric", "link", "mcu-uart0"), + map[string]any{"ready": true, "peer_sid": "cm5-a", "local_sid": "mcu-a"}, + true, + )) + assertNoChargerConfig(t, sub, 150*time.Millisecond) + + publisher.Publish(publisher.NewMessage( + bus.T("state", "fabric", "link", "mcu-uart0"), + map[string]any{"ready": true, "peer_sid": "cm5-b", "local_sid": "mcu-a"}, + true, + )) + waitForChargerConfig(t, sub) +} + +func waitForChargerConfig(t *testing.T, sub *bus.Subscription) { + t.Helper() + select { + case msg := <-sub.Channel(): + if _, ok := msg.Payload.(ChargerConfigFact); !ok { + t.Fatalf("payload type = %T", msg.Payload) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for charger config fact") + } +} + +func assertNoChargerConfig(t *testing.T, sub *bus.Subscription, d time.Duration) { + t.Helper() + settled := time.After(d) + for { + select { + case <-sub.Channel(): + t.Fatal("unexpected charger config republish on unchanged Ready=true retain") + case <-settled: + return + } + } +} + func TestChargerAlertFSMVinLoEdge(t *testing.T) { - // W8 finish: vin_lo fires on ChargerValue.VIN_mV crossing below - // the configured threshold. Subsequent observations below the - // threshold do NOT re-fire. + // vin_lo fires on ChargerValue.VIN_mV crossing below the configured + // threshold. Subsequent observations below the threshold do NOT re-fire. b := newTestBus() observer := b.NewConnection("observer") sub := observer.Subscribe(TopicChargerAlert) @@ -299,6 +366,9 @@ func TestChargerAlertFSMVinLoEdge(t *testing.T) { if ev.Kind != AlertVinLo { t.Fatalf("kind = %q, want vin_lo", ev.Kind) } + if ev.Severity != "warning" { + t.Fatalf("severity = %q, want warning", ev.Severity) + } case <-time.After(2 * time.Second): t.Fatal("timeout waiting for vin_lo alert") } @@ -334,6 +404,9 @@ func TestChargerAlertFSMVinHiEdge(t *testing.T) { if ev.Kind != AlertVinHi { t.Fatalf("kind = %q, want vin_hi", ev.Kind) } + if ev.Severity != "warning" { + t.Fatalf("severity = %q, want warning", ev.Severity) + } case <-time.After(2 * time.Second): t.Fatal("timeout waiting for vin_hi alert") } diff --git a/services/updater/abupdate_diag_host.go b/services/updater/abupdate_diag_host.go new file mode 100644 index 0000000..cfda0a4 --- /dev/null +++ b/services/updater/abupdate_diag_host.go @@ -0,0 +1,27 @@ +//go:build !tinygo || !rp2350 + +package updater + +import "sync" + +var abupdateDiagMu sync.Mutex +var abupdateDiagActive bool + +func installABUpdateDiagHook(xferID string, generation uint64) { + _, _ = xferID, generation + abupdateDiagMu.Lock() + abupdateDiagActive = true + abupdateDiagMu.Unlock() +} + +func clearABUpdateDiagHook() { + abupdateDiagMu.Lock() + abupdateDiagActive = false + abupdateDiagMu.Unlock() +} + +func abupdateDiagHookActiveForTest() bool { + abupdateDiagMu.Lock() + defer abupdateDiagMu.Unlock() + return abupdateDiagActive +} diff --git a/services/updater/abupdate_diag_tinygo.go b/services/updater/abupdate_diag_tinygo.go new file mode 100644 index 0000000..c789aee --- /dev/null +++ b/services/updater/abupdate_diag_tinygo.go @@ -0,0 +1,57 @@ +//go:build tinygo && rp2350 + +package updater + +import ( + "devicecode-go/services/otadiag" + "pico2-a-b/abupdate" + "pico2-a-b/imagev1" +) + +var abupdateDiagXferID string +var abupdateDiagGeneration uint64 + +func installABUpdateDiagHook(xferID string, generation uint64) { + abupdateDiagXferID = xferID + abupdateDiagGeneration = generation + abupdate.SetDiagnosticHook(func(event string, kv ...string) { + emitRawDiag(event, kv...) + }) + imagev1.SetDiagnosticHook(func(event string, kv ...string) { + emitRawDiag(event, kv...) + }) +} + +func emitRawDiag(event string, kv ...string) { + var fields [10]otadiag.Field + n := 0 + fields[n] = otadiag.KV("generation", abupdateDiagGeneration) + n++ + for i := 0; i+1 < len(kv) && n < len(fields); i += 2 { + fields[n] = otadiag.KV(kv[i], kv[i+1]) + n++ + } + otadiag.Event("[updater-stream]", event, abupdateDiagXferID, fields[:n]...) +} + +func clearABUpdateDiagHook() { + abupdate.ClearDiagnosticHook() + imagev1.ClearDiagnosticHook() + abupdateDiagXferID = "" + abupdateDiagGeneration = 0 +} + +func emitABUpdateDiag(event string, fields ...otadiag.Field) { + var out [10]otadiag.Field + n := 0 + out[n] = otadiag.KV("generation", abupdateDiagGeneration) + n++ + for _, f := range fields { + if n >= len(out) { + break + } + out[n] = f + n++ + } + otadiag.Event("[updater-stream]", event, abupdateDiagXferID, out[:n]...) +} diff --git a/services/updater/facts.go b/services/updater/facts.go index a004e79..b0da985 100644 --- a/services/updater/facts.go +++ b/services/updater/facts.go @@ -1,5 +1,11 @@ package updater +import ( + "time" + + "devicecode-go/services/otadiag" +) + // PublishSoftware emits the retained state/self/software fact with the // build identity + the per-boot RAM-only boot_id + the persisted // payload_sha256 (when abupdate has populated it). Callers don't pass @@ -45,8 +51,29 @@ func (s *Service) PublishUpdater() { JobID: strPtrOrNil(s.jobID), BootBuyRC: int32PtrOrNil(s.bootBuyRC), } + snap := s.diagSnapshotLocked() s.mu.Unlock() + setDiagSnapshot(snap) + active := otadiag.WindowActive() + start := time.Now() + if active { + otadiag.Event( + "[updater-stream]", "publish_updater_start", snap.XferID, + otadiag.KV("state", snap.State), + otadiag.KV("generation", snap.Generation), + otadiag.KV("lease_active", snap.LeaseActive), + ) + } s.conn.Publish(s.conn.NewMessage(TopicUpdaterFact, fact, true)) + if active { + otadiag.Event( + "[updater-stream]", "publish_updater_done", snap.XferID, + otadiag.KV("state", snap.State), + otadiag.KV("generation", snap.Generation), + otadiag.KV("lease_active", snap.LeaseActive), + otadiag.KV("dur_ms", int(time.Since(start)/time.Millisecond)), + ) + } } // PublishHealth emits the retained state/self/health fact. Reason is diff --git a/services/updater/prestage_host.go b/services/updater/prestage_host.go index 531764d..20f88fc 100644 --- a/services/updater/prestage_host.go +++ b/services/updater/prestage_host.go @@ -12,17 +12,18 @@ type streamedStage struct { PayloadSHA256 string } -func startStreamedStage(size uint32) error { - _ = size +func startStreamedStage(xferID string, generation uint64, size uint32) error { + _, _, _ = xferID, generation, size return nil } -func writeStreamedStage(data []byte) error { - _ = data +func writeStreamedStage(xferID string, generation uint64, data []byte) error { + _, _, _ = xferID, generation, data return errors.New("streamed_stage_not_supported") } -func commitStreamedStage() (streamedStage, error) { +func commitStreamedStage(xferID string, generation uint64) (streamedStage, error) { + _, _ = xferID, generation return streamedStage{}, errors.New("streamed_stage_not_supported") } @@ -32,4 +33,6 @@ func consumeStreamedStageResult() (streamedStage, bool) { return streamedStage{}, false } -func discardStreamedStageResult() {} +func discardStreamedStageResult() { + clearABUpdateDiagHook() +} diff --git a/services/updater/prestage_tinygo.go b/services/updater/prestage_tinygo.go index 793e891..88c3d6d 100644 --- a/services/updater/prestage_tinygo.go +++ b/services/updater/prestage_tinygo.go @@ -4,7 +4,9 @@ package updater import ( "errors" + "time" + "devicecode-go/services/otadiag" "pico2-a-b/abupdate" "pico2-a-b/imagev1" ) @@ -26,7 +28,7 @@ var ( streamedVerifier *imagev1.StreamVerifier ) -func startStreamedStage(size uint32) error { +func startStreamedStage(xferID string, generation uint64, size uint32) error { // A fresh prepare invalidates any prior stage, and retrying an update in // the same boot must not inherit abupdate's previous writing/complete // state. Recreate the updater before resolving the inactive slot. @@ -37,23 +39,65 @@ func startStreamedStage(size uint32) error { streamedStageDesc = streamedStage{} streamedStageOK = false streamedVerifier = imagev1.NewStreamVerifier(SignedImagePolicy(), func(payloadLen uint32) (imagev1.PayloadSink, error) { - return newSlotSink(payloadLen) + start := time.Now() + otadiag.Event( + "[updater-stream]", "slot_sink_create_start", xferID, + otadiag.KV("generation", generation), + otadiag.KV("payload_len", payloadLen), + ) + sink, err := newSlotSink(payloadLen) + if err != nil { + otadiag.Event( + "[updater-stream]", "slot_sink_create_error", xferID, + otadiag.KV("generation", generation), + otadiag.KV("err", err.Error()), + otadiag.KV("dur_ms", int(time.Since(start)/time.Millisecond)), + ) + return nil, err + } + otadiag.Event( + "[updater-stream]", "slot_sink_create_done", xferID, + otadiag.KV("generation", generation), + otadiag.KV("dur_ms", int(time.Since(start)/time.Millisecond)), + ) + return sink, nil }) return nil } -func writeStreamedStage(data []byte) error { +func writeStreamedStage(xferID string, generation uint64, data []byte) error { if len(data) == 0 { return errors.New("empty_chunk") } if streamedVerifier == nil { return errors.New("streamed_stage_not_started") } + start := time.Now() + otadiag.Event( + "[updater-stream]", "stream_write_start", xferID, + otadiag.KV("generation", generation), + otadiag.KV("len", len(data)), + ) _, err := streamedVerifier.Write(data) + if err != nil { + otadiag.Event( + "[updater-stream]", "stream_write_error", xferID, + otadiag.KV("generation", generation), + otadiag.KV("err", err.Error()), + otadiag.KV("dur_ms", int(time.Since(start)/time.Millisecond)), + ) + return err + } + otadiag.Event( + "[updater-stream]", "stream_write_done", xferID, + otadiag.KV("generation", generation), + otadiag.KV("dur_ms", int(time.Since(start)/time.Millisecond)), + ) return err } -func commitStreamedStage() (streamedStage, error) { +func commitStreamedStage(xferID string, generation uint64) (streamedStage, error) { + _, _ = xferID, generation if streamedVerifier == nil { return streamedStage{}, errors.New("streamed_stage_not_started") } @@ -100,4 +144,5 @@ func discardStreamedStageResult() { } streamedStageDesc = streamedStage{} streamedStageOK = false + clearABUpdateDiagHook() } diff --git a/services/updater/rpc.go b/services/updater/rpc.go index a9a589c..1d9a774 100644 --- a/services/updater/rpc.go +++ b/services/updater/rpc.go @@ -1,18 +1,32 @@ package updater -import "devicecode-go/bus" +import ( + "time" + + "devicecode-go/bus" + "devicecode-go/services/otadiag" +) // handlePrepare processes cap/self/updater/main/rpc/prepare-update after // Fabric remaps it to the local bus. Success returns the current contract's // prepare acknowledgement, including the required transfer target and maximum // raw chunk size. func (s *Service) handlePrepare(msg *bus.Message) { + prepareAt := time.Now() req, ok := jsonDecode[PrepareRequest](msg.Payload) if !ok { + otadiag.Event("[updater-stream]", "prepare_reject", otadiag.XferNone, otadiag.KV("reason", "bad_request")) s.reply(msg, Reply{OK: false, Error: "bad_request"}) return } + otadiag.Event( + "[updater-stream]", "prepare_rx", otadiag.XferNone, + otadiag.KV("target", req.Target), + otadiag.KV("job_id", req.JobID), + otadiag.KV("expected_image_id", req.ExpectedImageID), + ) if req.Target != "" && req.Target != PrepareTargetMCU { + otadiag.Event("[updater-stream]", "prepare_reject", otadiag.XferNone, otadiag.KV("reason", ErrTargetMismatch)) s.reply(msg, Reply{OK: false, Error: ErrTargetMismatch}) return } @@ -24,11 +38,14 @@ func (s *Service) handlePrepare(msg *bus.Message) { s.state == StateCommitting || s.state == StateRebooting { s.mu.Unlock() + otadiag.Event("[updater-stream]", "prepare_reject", otadiag.XferNone, otadiag.KV("reason", ErrBusy)) s.reply(msg, Reply{OK: false, Error: ErrBusy}) return } s.preparing = true s.mu.Unlock() + otadiag.StartUpdateWindow("prepare", otadiag.XferNone) + otadiag.Event("[updater-stream]", "prepare_start", otadiag.XferNone) prepareActive := true finishPrepare := func() { if prepareActive { @@ -49,16 +66,30 @@ func (s *Service) handlePrepare(msg *bus.Message) { errMsg := "metadata_clear_failed:" + err.Error() s.transitionTo(StateFailed, errMsg, "") finishPrepare() + otadiag.Event( + "[updater-stream]", "prepare_error", otadiag.XferNone, + otadiag.KV("err", errMsg), + otadiag.KV("dur_ms", int(time.Since(prepareAt)/time.Millisecond)), + ) + otadiag.StopUpdateWindow("prepare_error") s.reply(msg, Reply{OK: false, Error: errMsg}) return } s.mu.Lock() - s.openStageGenerationLocked() + gen := s.openStageGenerationLocked() + snap := s.diagSnapshotLocked() s.mu.Unlock() + setDiagSnapshot(snap) + otadiag.Event("[updater-stream]", "prepare_generation", otadiag.XferNone, otadiag.KV("generation", gen)) s.transitionTo(StateReady, "", "") finishPrepare() + otadiag.Event( + "[updater-stream]", "prepare_done", otadiag.XferNone, + otadiag.KV("generation", gen), + otadiag.KV("dur_ms", int(time.Since(prepareAt)/time.Millisecond)), + ) s.reply(msg, PrepareReply{ Ready: true, Target: TargetUpdaterMain, diff --git a/services/updater/sink_tinygo.go b/services/updater/sink_tinygo.go index 9ce85dc..4648330 100644 --- a/services/updater/sink_tinygo.go +++ b/services/updater/sink_tinygo.go @@ -4,7 +4,9 @@ package updater import ( "errors" + "time" + "devicecode-go/services/otadiag" "pico2-a-b/abupdate" ) @@ -42,9 +44,21 @@ func (s *abupdateSink) Write(p []byte) (int, error) { if s.closed { return 0, errors.New("abupdate_sink: closed") } + start := time.Now() + emitABUpdateDiag("abupdate_write_start", otadiag.KV("len", len(p))) if rc := s.u.WriteChunk(p); rc != 0 { + emitABUpdateDiag( + "abupdate_write_error", + otadiag.KV("rc", rc), + otadiag.KV("dur_ms", int(time.Since(start)/time.Millisecond)), + ) return 0, errFromRC("write_chunk", rc) } + emitABUpdateDiag( + "abupdate_write_done", + otadiag.KV("dur_ms", int(time.Since(start)/time.Millisecond)), + otadiag.KV("written", s.u.BytesWritten()), + ) return len(p), nil } diff --git a/services/updater/stream_lease.go b/services/updater/stream_lease.go index 1926330..905e80b 100644 --- a/services/updater/stream_lease.go +++ b/services/updater/stream_lease.go @@ -3,6 +3,9 @@ package updater import ( "errors" "sync" + "time" + + "devicecode-go/services/otadiag" ) var ( @@ -33,23 +36,90 @@ func currentService() *Service { // last successful prepare-update call. Fabric calls this from xfer_begin before // any sink mutates flash or buffers transfer state. func BeginStreamedStage(xferID string, size uint32) (uint64, error) { + beginAt := time.Now() + otadiag.SetActiveXfer(xferID) + otadiag.Event("[updater-stream]", "begin_entry", xferID, otadiag.KV("size", size)) s := currentService() if s == nil { + otadiag.Event( + "[updater-stream]", "begin_error", xferID, + otadiag.KV("err", "updater_not_running"), + otadiag.KV("dur_ms", int(time.Since(beginAt)/time.Millisecond)), + ) + otadiag.StopUpdateWindow("updater_not_running") return 0, errors.New("updater_not_running") } gen, err := s.beginStreamedStageLease(xferID) if err != nil { + otadiag.Event( + "[updater-stream]", "lease_error", xferID, + otadiag.KV("err", err.Error()), + otadiag.KV("dur_ms", int(time.Since(beginAt)/time.Millisecond)), + ) return 0, err } - if err := startStreamedStage(size); err != nil { + otadiag.Event("[updater-stream]", "lease_ok", xferID, otadiag.KV("generation", gen)) + installABUpdateDiagHook(xferID, gen) + startAt := time.Now() + otadiag.Event( + "[updater-stream]", "start_entry", xferID, + otadiag.KV("generation", gen), + otadiag.KV("size", size), + ) + if err := startStreamedStage(xferID, gen, size); err != nil { + otadiag.Event( + "[updater-stream]", "start_error", xferID, + otadiag.KV("generation", gen), + otadiag.KV("err", err.Error()), + otadiag.KV("dur_ms", int(time.Since(startAt)/time.Millisecond)), + ) + clearABUpdateDiagHook() s.cancelStreamedStageLease(xferID, gen, err.Error()) + otadiag.Event( + "[updater-stream]", "begin_error", xferID, + otadiag.KV("err", err.Error()), + otadiag.KV("generation", gen), + otadiag.KV("dur_ms", int(time.Since(beginAt)/time.Millisecond)), + ) + otadiag.StopUpdateWindow("start_streamed_stage_error") return 0, err } + otadiag.Event( + "[updater-stream]", "start_exit", xferID, + otadiag.KV("generation", gen), + otadiag.KV("dur_ms", int(time.Since(startAt)/time.Millisecond)), + ) + markAt := time.Now() + otadiag.Event("[updater-stream]", "mark_receiving_entry", xferID, otadiag.KV("generation", gen)) if err := s.markStreamedStageReceiving(xferID, gen); err != nil { + otadiag.Event( + "[updater-stream]", "mark_receiving_error", xferID, + otadiag.KV("generation", gen), + otadiag.KV("err", err.Error()), + otadiag.KV("dur_ms", int(time.Since(markAt)/time.Millisecond)), + ) abortStreamedStage() + clearABUpdateDiagHook() s.cancelStreamedStageLease(xferID, gen, err.Error()) + otadiag.Event( + "[updater-stream]", "begin_error", xferID, + otadiag.KV("err", err.Error()), + otadiag.KV("generation", gen), + otadiag.KV("dur_ms", int(time.Since(beginAt)/time.Millisecond)), + ) + otadiag.StopUpdateWindow("mark_receiving_error") return 0, err } + otadiag.Event( + "[updater-stream]", "mark_receiving_exit", xferID, + otadiag.KV("generation", gen), + otadiag.KV("dur_ms", int(time.Since(markAt)/time.Millisecond)), + ) + otadiag.Event( + "[updater-stream]", "begin_exit", xferID, + otadiag.KV("generation", gen), + otadiag.KV("dur_ms", int(time.Since(beginAt)/time.Millisecond)), + ) return gen, nil } @@ -61,7 +131,7 @@ func WriteStreamedStage(xferID string, generation uint64, data []byte) error { if err := s.checkStreamedStageLease(xferID, generation, false); err != nil { return err } - return writeStreamedStage(data) + return writeStreamedStage(xferID, generation, data) } func CommitStreamedStage(xferID string, generation uint64) (uint32, error) { @@ -72,7 +142,8 @@ func CommitStreamedStage(xferID string, generation uint64) (uint32, error) { if err := s.checkStreamedStageLease(xferID, generation, false); err != nil { return 0, err } - staged, err := commitStreamedStage() + staged, err := commitStreamedStage(xferID, generation) + clearABUpdateDiagHook() if err != nil { s.cancelStreamedStageLease(xferID, generation, err.Error()) return 0, err @@ -89,11 +160,16 @@ func CommitBufferedStage(xferID string, generation uint64) error { if s == nil { return errors.New("updater_not_running") } - return s.markStreamedStageCommitted(xferID, generation) + if err := s.markStreamedStageCommitted(xferID, generation); err != nil { + return err + } + clearABUpdateDiagHook() + return nil } func AbortStreamedStage(xferID string, generation uint64, reason string) { abortStreamedStage() + clearABUpdateDiagHook() if s := currentService(); s != nil { s.cancelStreamedStageLease(xferID, generation, reason) } @@ -121,23 +197,28 @@ func (s *Service) beginStreamedStageLease(xferID string) (uint64, error) { return 0, errors.New("bad_message:xfer_id") } s.mu.Lock() - defer s.mu.Unlock() if s.preparing || s.state == StatePreparing || s.state == StateCommitting || s.state == StateRebooting || s.state == StateReceiving || s.streamLeaseActive { + s.mu.Unlock() return 0, errors.New(ErrBusy) } if s.state != StateReady || s.stageGeneration == 0 { + s.mu.Unlock() return 0, errors.New("stage_not_prepared") } s.streamLeaseActive = true s.streamXferID = xferID s.streamCancelled = false s.streamCommitted = false - return s.stageGeneration, nil + snap := s.diagSnapshotLocked() + gen := s.stageGeneration + s.mu.Unlock() + setDiagSnapshot(snap) + return gen, nil } func (s *Service) markStreamedStageReceiving(xferID string, generation uint64) error { @@ -157,7 +238,7 @@ func (s *Service) markStreamedStageReceiving(xferID string, generation uint64) e s.state = StateReceiving s.lastError = "" s.mu.Unlock() - s.PublishUpdater() + s.PublishCriticalFacts() return nil } @@ -215,10 +296,12 @@ func (s *Service) cancelStreamedStageLease(xferID string, generation uint64, rea s.lastError = reason } } + snap := s.diagSnapshotLocked() s.mu.Unlock() + setDiagSnapshot(snap) if matches { _ = s.metadataWrite.ClearStagedDescriptor() - s.PublishUpdater() + s.PublishCriticalFacts() } return matches } diff --git a/services/updater/updater.go b/services/updater/updater.go index 66726dd..5923234 100644 --- a/services/updater/updater.go +++ b/services/updater/updater.go @@ -7,6 +7,7 @@ import ( "time" "devicecode-go/bus" + "devicecode-go/services/otadiag" ) // Local-bus topics the updater binds to. Fabric routes wire @@ -281,6 +282,7 @@ func (noopMetadataWriter) ClearStagedDescriptor() error { func (s *Service) Run(ctx context.Context) { unregister := registerActiveService(s) defer unregister() + defer otadiag.StopUpdateWindow("updater_stop") prepareSub := s.conn.Subscribe(TopicPrepareRPC) defer s.conn.Unsubscribe(prepareSub) @@ -442,6 +444,23 @@ func (s *Service) transitionTo(next State, lastError, pendingVersion string) Sta return prev } +func (s *Service) diagSnapshotLocked() otadiag.StageSnapshot { + xferID := s.streamXferID + if xferID == "" { + xferID = otadiag.XferNone + } + return otadiag.StageSnapshot{ + State: string(s.state), + Generation: s.stageGeneration, + LeaseActive: s.streamLeaseActive, + XferID: xferID, + } +} + +func setDiagSnapshot(snap otadiag.StageSnapshot) { + otadiag.SetUpdaterSnapshot(snap) +} + func (s *Service) logRepublish(reason, linkID string, obs linkObservation) { if s.logf != nil { s.logf("updater republish reason=%s link=%s peer_sid=%s local_sid=%s", reason, linkID, obs.PeerSID, obs.LocalSID) diff --git a/services/updater/updater_test.go b/services/updater/updater_test.go index d3b57c8..7af1aa3 100644 --- a/services/updater/updater_test.go +++ b/services/updater/updater_test.go @@ -14,6 +14,7 @@ import ( "time" "devicecode-go/bus" + "devicecode-go/services/otadiag" "pico2-a-b/imagev1" ) @@ -21,6 +22,46 @@ import ( func newTestBus() *bus.Bus { return bus.NewBus(8, "+", "#") } +type updaterDiagCapture struct { + mu sync.Mutex + lines []string +} + +func captureUpdaterDiag(t *testing.T) *updaterDiagCapture { + t.Helper() + c := &updaterDiagCapture{} + restore := otadiag.SetSinkForTest(func(line string) { + c.mu.Lock() + c.lines = append(c.lines, line) + c.mu.Unlock() + }) + t.Cleanup(restore) + return c +} + +func (c *updaterDiagCapture) snapshot() []string { + c.mu.Lock() + defer c.mu.Unlock() + return append([]string(nil), c.lines...) +} + +func assertUpdaterDiagContains(t *testing.T, lines []string, want ...string) { + t.Helper() + for _, line := range lines { + ok := true + for _, part := range want { + if !strings.Contains(line, part) { + ok = false + break + } + } + if ok { + return + } + } + t.Fatalf("diagnostics missing %v in:\n%s", want, strings.Join(lines, "\n")) +} + type fakeVerifierAccept struct { manifest Manifest payload []byte @@ -476,6 +517,37 @@ func TestPrepareTransitionsToReady(t *testing.T) { } } +func TestPrepareEmitsGenerationBreadcrumbs(t *testing.T) { + diag := captureUpdaterDiag(t) + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + _, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + + req := caller.NewMessage(TopicPrepareRPC, PrepareRequest{ + Target: PrepareTargetMCU, + JobID: "job-diag", + ExpectedImageID: "image-diag", + }, false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + select { + case msg := <-replySub.Channel(): + if _, ok := msg.Payload.(PrepareReply); !ok { + t.Fatalf("prepare reply = %#v, want PrepareReply", msg.Payload) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for prepare reply") + } + + lines := diag.snapshot() + assertUpdaterDiagContains(t, lines, "[updater-stream]", "xfer_id -", "ev prepare_rx", "job_id job-diag", "expected_image_id image-diag") + assertUpdaterDiagContains(t, lines, "[updater-stream]", "xfer_id -", "ev prepare_generation", "generation 1") + assertUpdaterDiagContains(t, lines, "[updater-stream]", "xfer_id -", "ev prepare_done", "generation 1") +} + func prepareUpdaterForLease(t *testing.T, caller *bus.Connection) { t.Helper() req := caller.NewMessage(TopicPrepareRPC, PrepareRequest{Target: PrepareTargetMCU}, false) @@ -595,6 +667,73 @@ func TestPrepareAndCommitRejectWhileStreamLeaseActive(t *testing.T) { } } +func TestStreamedStageDiagHookClearsOnBufferedCommit(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + _, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + prepareUpdaterForLease(t, caller) + gen, err := BeginStreamedStage("xfer-hook-commit", 4) + if err != nil { + t.Fatalf("BeginStreamedStage: %v", err) + } + if !abupdateDiagHookActiveForTest() { + t.Fatal("diagnostic hook inactive after BeginStreamedStage") + } + if err := CommitBufferedStage("xfer-hook-commit", gen); err != nil { + t.Fatalf("CommitBufferedStage: %v", err) + } + if abupdateDiagHookActiveForTest() { + t.Fatal("diagnostic hook still active after buffered commit") + } +} + +func TestStreamedStageDiagHookClearsOnAbort(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + _, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + prepareUpdaterForLease(t, caller) + gen, err := BeginStreamedStage("xfer-hook-abort", 4) + if err != nil { + t.Fatalf("BeginStreamedStage: %v", err) + } + if !abupdateDiagHookActiveForTest() { + t.Fatal("diagnostic hook inactive after BeginStreamedStage") + } + AbortStreamedStage("xfer-hook-abort", gen, "test_abort") + if abupdateDiagHookActiveForTest() { + t.Fatal("diagnostic hook still active after abort") + } +} + +func TestStreamedStageDiagHookClearsOnCommitError(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + _, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + prepareUpdaterForLease(t, caller) + gen, err := BeginStreamedStage("xfer-hook-commit-error", 4) + if err != nil { + t.Fatalf("BeginStreamedStage: %v", err) + } + if !abupdateDiagHookActiveForTest() { + t.Fatal("diagnostic hook inactive after BeginStreamedStage") + } + if _, err := CommitStreamedStage("xfer-hook-commit-error", gen); err == nil { + t.Fatal("CommitStreamedStage returned nil error, want host streamed_stage_not_supported") + } + if abupdateDiagHookActiveForTest() { + t.Fatal("diagnostic hook still active after commit error") + } +} + func TestPrepareRejectsWhileCommittingOrRebooting(t *testing.T) { for _, state := range []State{StateCommitting, StateRebooting} { t.Run(string(state), func(t *testing.T) { diff --git a/types/capabilities.go b/types/capabilities.go index d6adc4a..edd54c1 100644 --- a/types/capabilities.go +++ b/types/capabilities.go @@ -7,21 +7,22 @@ package types type Kind string const ( - KindLED Kind = "led" - KindSwitch Kind = "switch" - KindPWM Kind = "pwm" - KindTemperature Kind = "temperature" - KindHumidity Kind = "humidity" - KindSerial Kind = "serial" - KindButton Kind = "button" - KindBattery Kind = "battery" - KindCharger Kind = "charger" + KindLED Kind = "led" + KindSwitch Kind = "switch" + KindPWM Kind = "pwm" + KindTemperature Kind = "temperature" + KindHumidity Kind = "humidity" + KindSerial Kind = "serial" + KindButton Kind = "button" + KindBattery Kind = "battery" + KindCharger Kind = "charger" + KindChargerConfig Kind = "charger_config" ) func (k Kind) Valid() bool { switch k { case KindLED, KindSwitch, KindPWM, KindTemperature, KindHumidity, - KindSerial, KindButton, KindBattery, KindCharger: + KindSerial, KindButton, KindBattery, KindCharger, KindChargerConfig: return true } return false diff --git a/types/power.go b/types/power.go index 8466006..4f3166d 100644 --- a/types/power.go +++ b/types/power.go @@ -37,6 +37,15 @@ type ChargerValue struct { Sys uint16 `json:"sys"` // raw SYSTEM_STATUS bits } +// Retained value: hal/cap/power/charger_config//value +type ChargerConfigValue struct { + Source string `json:"source"` + VinLo_mV int32 `json:"vin_lo_mV"` + VinHi_mV int32 `json:"vin_hi_mV"` + BSRHigh_uOhmPerCell uint32 `json:"bsr_high_uohm_per_cell"` + AlertMaskBits uint16 `json:"alert_mask_bits"` +} + // Controls type ChargerEnable struct{ On bool } // verb: "enable" type SetInputLimit struct{ MilliA int32 } // verb: "set_input_limit" From dcf2bee5aa8183eccfda12a904d79554f7460b3b Mon Sep 17 00:00:00 2001 From: cpunt Date: Wed, 3 Jun 2026 09:24:29 +0000 Subject: [PATCH 3/6] ota: tighten transfer and commit invariants --- services/fabric/transfer.go | 2 +- services/fabric/transfer_test.go | 68 ++++++++++++++++++++++++++++++++ services/updater/rpc.go | 4 +- services/updater/updater_test.go | 44 +++++++++++++++++++++ 4 files changed, 115 insertions(+), 3 deletions(-) diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index f57ba2b..95b10a7 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -201,6 +201,7 @@ func (s *session) retryCorruptTransferFrame(reason string) bool { if cur == nil { return false } + s.markRx() if cur.corruptRetryOffset != cur.bytesWritten { cur.corruptRetryOffset = cur.bytesWritten cur.corruptRetriesAtOffset = 0 @@ -422,7 +423,6 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { otadiag.KV("expected", u32s(cur.bytesWritten)), otadiag.KV("need_tx", needOK), ) - cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) return } if msg.Offset > cur.bytesWritten { diff --git a/services/fabric/transfer_test.go b/services/fabric/transfer_test.go index edf4fa9..53daa2d 100644 --- a/services/fabric/transfer_test.go +++ b/services/fabric/transfer_test.go @@ -870,6 +870,74 @@ func TestTransferChunkStaleOffsetRequestsCurrentAndCompletes(t *testing.T) { } } +func TestTransferStaleLowerOffsetDoesNotRefreshPhaseDeadline(t *testing.T) { + tr := &captureTransport{} + sink := &fakeTransferSink{} + oldDeadline := time.Now().Add(-time.Second) + s := &session{ + tr: tr, + cfg: LinkConfig{PhaseTimeout: time.Second}, + incomingTransfer: &incomingTransfer{ + meta: transferMeta{ID: "xfer-stale-deadline", Size: 6}, + sink: sink, + bytesWritten: 3, + deadline: oldDeadline, + }, + } + + s.onTransferChunk(&protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-stale-deadline", + Offset: 0, + Data: rawURL([]byte("abc")), + ChunkDigest: xxhashStr([]byte("abc")), + }) + + if !s.incomingTransfer.deadline.Equal(oldDeadline) { + t.Fatalf("stale lower offset refreshed deadline: got %v want %v", + s.incomingTransfer.deadline, oldDeadline) + } + if len(tr.writes) != 1 { + t.Fatalf("stale lower offset wrote %d frames, want one xfer_need", len(tr.writes)) + } + if len(sink.writes) != 0 { + t.Fatalf("stale lower offset rewrote sink: %d writes", len(sink.writes)) + } +} + +func TestTransferCurrentCorruptChunkRefreshesLinkLiveness(t *testing.T) { + tr := &captureTransport{} + oldRx := time.Now().Add(-time.Second) + s := &session{ + tr: tr, + lastRxAt: oldRx, + cfg: LinkConfig{PhaseTimeout: time.Second}, + incomingTransfer: &incomingTransfer{ + meta: transferMeta{ID: "xfer-corrupt-liveness", Size: 4}, + sink: &fakeTransferSink{}, + deadline: time.Now().Add(time.Second), + }, + } + + s.onTransferChunk(&protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-corrupt-liveness", + Offset: 0, + Data: rawURL([]byte("abcd")), + ChunkDigest: "00000000", + }) + + if !s.lastRxAt.After(oldRx) { + t.Fatalf("current corrupt chunk did not refresh liveness: got %v old %v", s.lastRxAt, oldRx) + } + if got := s.incomingTransfer.corruptRetriesAtOffset; got != 1 { + t.Fatalf("corrupt retries = %d, want 1", got) + } + if len(tr.writes) != 1 { + t.Fatalf("current corrupt chunk wrote %d frames, want one retry xfer_need", len(tr.writes)) + } +} + func TestTransferChunkDecodeFailureRequestsSameOffset(t *testing.T) { b := newBus() cm5, mcu := pipePair() diff --git a/services/updater/rpc.go b/services/updater/rpc.go index 1d9a774..825d0f2 100644 --- a/services/updater/rpc.go +++ b/services/updater/rpc.go @@ -122,9 +122,9 @@ func (s *Service) handleCommit(msg *bus.Message) { s.reply(msg, Reply{OK: false, Error: ErrNothingStaged}) return } - expectedImageID := req.ExpectedImageID + expectedImageID := pendingImageID if expectedImageID == "" { - expectedImageID = pendingImageID + expectedImageID = req.ExpectedImageID } if expectedImageID != "" && desc.ImageID != expectedImageID { s.reply(msg, Reply{OK: false, Error: ErrTargetMismatch}) diff --git a/services/updater/updater_test.go b/services/updater/updater_test.go index 7af1aa3..931050c 100644 --- a/services/updater/updater_test.go +++ b/services/updater/updater_test.go @@ -969,6 +969,50 @@ func TestCommitWithoutStagedStateRefusesEvenWithDescriptor(t *testing.T) { } } +func TestCommitUsesPreparedExpectedImageOverCommitPayload(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + memMD := NewMemoryMetadata() + if err := memMD.WriteStagedDescriptor(StagedDescriptor{ + Version: "9.9.9", + BuildID: "build-b", + ImageID: "image-B", + Length: 4096, + Slot: 1, + PayloadSHA256: strings.Repeat("b", 64), + }); err != nil { + t.Fatalf("write staged descriptor: %v", err) + } + app := &fakeApplier{rebootCh: make(chan StagedDescriptor, 1)} + svc, cancel := runService(t, b, Options{ + Conn: conn, + Metadata: memMD, + MetadataWrite: memMD, + Applier: app, + }) + defer cancel() + + svc.setJobContext("job-image-a", "image-A") + svc.transitionTo(StateStaged, "", "9.9.9") + + payload := requestUpdaterReply(t, caller, TopicCommitRPC, CommitRequest{ExpectedImageID: "image-B"}) + reply, ok := payload.(Reply) + if !ok || reply.OK || reply.Error != ErrTargetMismatch { + t.Fatalf("commit reply = %#v, want target mismatch", payload) + } + canCalls, rebootCalls := app.callCounts() + if canCalls != 0 || rebootCalls != 0 { + t.Fatalf("applier called despite target mismatch: can=%d reboot=%d", canCalls, rebootCalls) + } + select { + case d := <-app.rebootCh: + t.Fatalf("unexpected reboot descriptor: %+v", d) + default: + } +} + func TestCommitWithoutApplierReturnsApplyUnavailable(t *testing.T) { // Spec safety: the commit RPC must not claim success when the MCU // has no apply hook wired (the production default RefusingApplier From ac50c8afc960c37632daf2ca55251c9421eb27c3 Mon Sep 17 00:00:00 2001 From: cpunt Date: Wed, 3 Jun 2026 11:33:15 +0000 Subject: [PATCH 4/6] fabric: add updater rpc diagnostics --- services/fabric/fabric_test.go | 8 +- services/fabric/session.go | 141 +++++++++++++++++++++++++++++++-- 2 files changed, 143 insertions(+), 6 deletions(-) diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index ece5583..f51d738 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -1995,6 +1995,7 @@ func TestCallIgnoredBeforeHandshake(t *testing.T) { func TestCallImport(t *testing.T) { // Test the canonical inbound call route: cap/self/updater/main/rpc/prepare-update // maps to local rpc/updater/prepare where services/updater binds. + diag := captureOTADiag(t) mcu, cm5 := pipePair() b := newBus() fabricConn := b.NewConnection("fabric") @@ -2013,7 +2014,7 @@ func TestCallImport(t *testing.T) { sendMsg(t, cm5, protoCall{ Type: "call", ID: "test-corr-1", Topic: []string{"cap", "self", "updater", "main", "rpc", "prepare-update"}, - Payload: json.RawMessage(`{}`), TimeoutMs: 5000, + Payload: json.RawMessage(`{"job_id":"job-prepare","expected_image_id":"mcu-dev-15.3"}`), TimeoutMs: 5000, }) reply := readMsg[protoReply](t, cm5) @@ -2023,6 +2024,11 @@ func TestCallImport(t *testing.T) { if !reply.OK { t.Errorf("reply not ok: %s", reply.Err) } + lines := diag.snapshot() + assertDiagContains(t, lines, "[fabric-rpc]", "ev call_rx", "call_id test-corr-1", "job_id job-prepare", "expected_image_id mcu-dev-15.3") + assertDiagContains(t, lines, "[fabric-rpc]", "ev call_route_ok", "local_topic rpc/updater/prepare") + assertDiagContains(t, lines, "[fabric-rpc]", "ev call_dispatch_start", "timeout_ms 5000") + waitDiagContains(t, diag, "[fabric-rpc]", "ev call_reply_tx", "ok true", "sent true") } func TestCallNoRoute(t *testing.T) { diff --git a/services/fabric/session.go b/services/fabric/session.go index 0e04805..fcca752 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -88,6 +88,8 @@ const ( type inboundCall struct { id string topic []string + localTopic bus.Topic + payload json.RawMessage sub *bus.Subscription deadline time.Time transferPrepare bool @@ -650,6 +652,111 @@ func wireTopicString(topic []string) string { return strings.Join(topic, "/") } +func busTopicPath(topic bus.Topic) string { + if topic == nil { + return "" + } + parts := make([]string, 0, topic.Len()) + for i := 0; i < topic.Len(); i++ { + if s, ok := topic.At(i).(string); ok { + parts = append(parts, s) + } + } + return strings.Join(parts, "/") +} + +func traceRPCDiagTopic(topic []string) bool { + return wireTopicEquals(topic, wireUpdaterPrepare) || wireTopicEquals(topic, wireUpdaterCommit) +} + +func rawJSONScalar(raw json.RawMessage) string { + if len(raw) == 0 { + return "" + } + var s string + if err := json.Unmarshal(raw, &s); err == nil { + return s + } + var n float64 + if err := json.Unmarshal(raw, &n); err == nil { + return strconvx.FormatFloat(n, 'f', -1, 64) + } + var b bool + if err := json.Unmarshal(raw, &b); err == nil { + if b { + return "true" + } + return "false" + } + return "" +} + +func rpcPayloadField(payload json.RawMessage, key string) string { + if len(payload) == 0 { + return "" + } + var obj map[string]json.RawMessage + if err := json.Unmarshal(payload, &obj); err != nil { + return "" + } + return rawJSONScalar(obj[key]) +} + +func (s *session) rpcDiag(event string, msg *protoCall, localTopic bus.Topic, reason string, fields ...otadiag.Field) { + if msg == nil || !traceRPCDiagTopic(msg.Topic) { + return + } + base := []otadiag.Field{ + otadiag.KV("call_id", msg.ID), + otadiag.KV("topic", wireTopicString(msg.Topic)), + otadiag.KV("local_sid", s.localSID), + otadiag.KV("peer_sid", s.peerSID), + otadiag.KV("payload_len", len(msg.Payload)), + } + if local := busTopicPath(localTopic); local != "" { + base = append(base, otadiag.KV("local_topic", local)) + } + if jobID := rpcPayloadField(msg.Payload, "job_id"); jobID != "" { + base = append(base, otadiag.KV("job_id", jobID)) + } + if imageID := rpcPayloadField(msg.Payload, "expected_image_id"); imageID != "" { + base = append(base, otadiag.KV("expected_image_id", imageID)) + } + if reason != "" { + base = append(base, otadiag.KV("reason", reason)) + } + base = append(base, fields...) + otadiag.Event("[fabric-rpc]", event, otadiag.XferNone, base...) +} + +func (s *session) rpcDiagInbound(event string, call *inboundCall, ok bool, err string, fields ...otadiag.Field) { + if call == nil || !traceRPCDiagTopic(call.topic) { + return + } + base := []otadiag.Field{ + otadiag.KV("call_id", call.id), + otadiag.KV("topic", wireTopicString(call.topic)), + otadiag.KV("local_sid", s.localSID), + otadiag.KV("peer_sid", s.peerSID), + otadiag.KV("payload_len", len(call.payload)), + otadiag.KV("ok", ok), + } + if local := busTopicPath(call.localTopic); local != "" { + base = append(base, otadiag.KV("local_topic", local)) + } + if jobID := rpcPayloadField(call.payload, "job_id"); jobID != "" { + base = append(base, otadiag.KV("job_id", jobID)) + } + if imageID := rpcPayloadField(call.payload, "expected_image_id"); imageID != "" { + base = append(base, otadiag.KV("expected_image_id", imageID)) + } + if err != "" { + base = append(base, otadiag.KV("err", err)) + } + base = append(base, fields...) + otadiag.Event("[fabric-rpc]", event, otadiag.XferNone, base...) +} + func validWireTopic(topic []string) bool { if len(topic) == 0 { return false @@ -845,22 +952,29 @@ func (s *session) onUnretain(msg *protoUnretain) { func (s *session) onCall(msg *protoCall) { if msg.ID == "" { + s.rpcDiag("call_reject", msg, nil, "missing_id") s.log("incoming call dropped: missing_id") return } if !validWireTopic(msg.Topic) { + s.rpcDiag("call_reject", msg, nil, "bad_topic") s.log("incoming call dropped: bad_topic") s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: "bad_topic"})) return } + s.rpcDiag("call_rx", msg, nil, "", + otadiag.KV("timeout_ms", strconvx.Itoa(msg.TimeoutMs)), + ) for _, call := range s.inboundCalls { if call.id == msg.ID { + s.rpcDiag("call_reject", msg, nil, "duplicate_call_id") s.logKV("incoming call dropped", "err", "duplicate_call_id") s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: "duplicate_call_id"})) return } } if len(s.inboundCalls) >= s.cfg.MaxInboundHelpers { + s.rpcDiag("call_reject", msg, nil, reasonBusy) s.log("incoming call dropped: busy") s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: reasonBusy})) return @@ -868,10 +982,12 @@ func (s *session) onCall(msg *protoCall) { localTopic := importCallTopic(msg.Topic) if localTopic == nil { + s.rpcDiag("call_reject", msg, nil, reasonNoRoute) s.log("incoming call dropped: no_route") s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: reasonNoRoute})) return } + s.rpcDiag("call_route_ok", msg, localTopic, "") s.markRx() isTransferPrepare := wireTopicEquals(msg.Topic, wireUpdaterPrepare) @@ -884,11 +1000,16 @@ func (s *session) onCall(msg *protoCall) { timeout = time.Duration(msg.TimeoutMs) * time.Millisecond } busMsg := s.conn.NewMessage(localTopic, msg.Payload, false) + s.rpcDiag("call_dispatch_start", msg, localTopic, "", + otadiag.KV("timeout_ms", strconvx.Itoa(int(timeout/time.Millisecond))), + ) sub := s.conn.Request(busMsg) topicCopy := append([]string(nil), msg.Topic...) s.inboundCalls = append(s.inboundCalls, &inboundCall{ id: msg.ID, topic: topicCopy, + localTopic: localTopic, + payload: append(json.RawMessage(nil), msg.Payload...), sub: sub, deadline: time.Now().Add(timeout), transferPrepare: isTransferPrepare, @@ -1184,7 +1305,9 @@ func (s *session) drainInbound(now time.Time) { if call.transferPrepare { s.extendTransferQuiet("prepare_reply_timeout", transferPrepareQuiet) } - if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { + sent := s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) + s.rpcDiagInbound("call_reply_tx", call, false, reasonTimeout, otadiag.KV("sent", sent)) + if !sent { return } continue @@ -1193,7 +1316,9 @@ func (s *session) drainInbound(now time.Time) { if call.transferPrepare { s.extendTransferQuiet("prepare_reply_error", transferPrepareQuiet) } - if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errStr})) { + sent := s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errStr})) + s.rpcDiagInbound("call_reply_tx", call, false, errStr, otadiag.KV("sent", sent)) + if !sent { return } continue @@ -1203,7 +1328,9 @@ func (s *session) drainInbound(now time.Time) { if call.transferPrepare { s.extendTransferQuiet("prepare_reply_marshal_failed", transferPrepareQuiet) } - if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errPayloadMarshal})) { + sent := s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errPayloadMarshal})) + s.rpcDiagInbound("call_reply_tx", call, false, errPayloadMarshal, otadiag.KV("sent", sent)) + if !sent { return } continue @@ -1211,7 +1338,9 @@ func (s *session) drainInbound(now time.Time) { if call.transferPrepare { s.extendTransferQuiet("prepare_reply_ok", transferPrepareQuiet) } - if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: true, Payload: payload})) { + sent := s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: true, Payload: payload})) + s.rpcDiagInbound("call_reply_tx", call, true, "", otadiag.KV("sent", sent)) + if !sent { return } continue @@ -1224,7 +1353,9 @@ func (s *session) drainInbound(now time.Time) { if call.transferPrepare { s.extendTransferQuiet("prepare_call_timeout", transferPrepareQuiet) } - if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { + sent := s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) + s.rpcDiagInbound("call_reply_tx", call, false, reasonTimeout, otadiag.KV("sent", sent)) + if !sent { return } continue From 81065c5798b04bbc86c1ac54913fb7dd663b6d25 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 4 Jun 2026 09:49:40 +0000 Subject: [PATCH 5/6] fabric: focus ota uart diagnostics --- go.mod | 2 + services/fabric/fabric_test.go | 17 + services/fabric/session.go | 36 ++ services/hal/devices/serial_raw/builder.go | 220 +++++++--- .../hal/devices/serial_raw/builder_test.go | 63 +++ .../hal/internal/provider/rp2_resources.go | 35 ++ third_party/tinygo-uartx/README.md | 239 +++++++++++ third_party/tinygo-uartx/go.mod | 3 + third_party/tinygo-uartx/uartx/export.go | 20 + third_party/tinygo-uartx/uartx/ringbuffer.go | 63 +++ third_party/tinygo-uartx/uartx/rp2.go | 42 ++ third_party/tinygo-uartx/uartx/rp2_uart.go | 405 ++++++++++++++++++ third_party/tinygo-uartx/uartx/uartx.go | 186 ++++++++ 13 files changed, 1275 insertions(+), 56 deletions(-) create mode 100644 third_party/tinygo-uartx/README.md create mode 100644 third_party/tinygo-uartx/go.mod create mode 100644 third_party/tinygo-uartx/uartx/export.go create mode 100644 third_party/tinygo-uartx/uartx/ringbuffer.go create mode 100644 third_party/tinygo-uartx/uartx/rp2.go create mode 100644 third_party/tinygo-uartx/uartx/rp2_uart.go create mode 100644 third_party/tinygo-uartx/uartx/uartx.go diff --git a/go.mod b/go.mod index 6962427..bbf004f 100644 --- a/go.mod +++ b/go.mod @@ -12,3 +12,5 @@ require ( require github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect replace pico2-a-b => ../pico2-a-b + +replace github.com/jangala-dev/tinygo-uartx => ./third_party/tinygo-uartx diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index f51d738..b326e4c 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -23,6 +23,23 @@ func pipePair() (*rwTransport, *rwTransport) { func newBus() *bus.Bus { return bus.NewBus(3, "+", "#") } +func TestShouldLogFabricReadSkipsIdleDataFrames(t *testing.T) { + longRead := 3 * time.Second + longGap := 3 * time.Second + + for _, msgType := range []string{msgPub, msgPing, msgPong, msgUnretain} { + if shouldLogFabricRead(msgType, longRead, longGap) { + t.Fatalf("idle %s frame should not be logged", msgType) + } + } + + for _, msgType := range []string{msgHello, msgHelloAck, msgCall, msgReply, msgXferBegin, msgXferCommit, msgXferAbort} { + if !shouldLogFabricRead(msgType, 0, 0) { + t.Fatalf("control %s frame should be logged", msgType) + } + } +} + type captureTransport struct { writes [][]byte writeErr error diff --git a/services/fabric/session.go b/services/fabric/session.go index fcca752..133ba30 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -190,19 +190,47 @@ func (s *session) run(ctx context.Context) { go func() { defer close(lines) + lastLineAt := time.Now() for { + started := time.Now() line, err := s.tr.ReadLine() + now := time.Now() + readDur := now.Sub(started) + sinceLine := now.Sub(lastLineAt) if err != nil { if errors.Is(err, ErrLineTooLong) { + otadiag.Event( + "[fabric-rx]", "read_error", otadiag.XferNone, + otadiag.KV("reason", "line_too_long"), + otadiag.KV("read_ms", int(readDur/time.Millisecond)), + otadiag.KV("since_line_ms", int(sinceLine/time.Millisecond)), + ) s.log("oversized line dropped") continue } + otadiag.Event( + "[fabric-rx]", "read_error", otadiag.XferNone, + otadiag.KV("reason", err.Error()), + otadiag.KV("read_ms", int(readDur/time.Millisecond)), + otadiag.KV("since_line_ms", int(sinceLine/time.Millisecond)), + ) select { case lines <- readResult{err: err}: case <-ctx.Done(): } return } + t := protoType(line) + if shouldLogFabricRead(t, readDur, sinceLine) { + otadiag.Event( + "[fabric-rx]", "read_line", protoXferID(line), + otadiag.KV("type", t), + otadiag.KV("line_len", len(line)), + otadiag.KV("read_ms", int(readDur/time.Millisecond)), + otadiag.KV("since_line_ms", int(sinceLine/time.Millisecond)), + ) + } + lastLineAt = now cp := make([]byte, len(line)) copy(cp, line) select { @@ -276,6 +304,14 @@ func (s *session) run(ctx context.Context) { } } +func shouldLogFabricRead(msgType string, _, _ time.Duration) bool { + switch msgType { + case msgHello, msgHelloAck, msgCall, msgReply, msgXferBegin, msgXferCommit, msgXferAbort: + return true + } + return false +} + func resetTimer(t *time.Timer, d time.Duration) { if !t.Stop() { select { diff --git a/services/hal/devices/serial_raw/builder.go b/services/hal/devices/serial_raw/builder.go index 9d7b482..15ecaca 100644 --- a/services/hal/devices/serial_raw/builder.go +++ b/services/hal/devices/serial_raw/builder.go @@ -8,6 +8,7 @@ import ( "devicecode-go/errcode" "devicecode-go/services/hal/internal/core" + "devicecode-go/services/otadiag" "devicecode-go/types" "devicecode-go/x/shmring" "devicecode-go/x/strconvx" @@ -27,6 +28,7 @@ type Params struct { const ( serialRawPumpRXBudget = 256 serialRawPumpTXBudget = 256 + serialRawPumpGapWarn = 20 * time.Millisecond ) // ---- Device ---- @@ -58,11 +60,17 @@ type session struct { txRing *shmring.Ring // Reactor-owned observability. Single writer only. - rxRingFull uint32 - rxLogAt time.Time - rxLogHits uint32 - rxPressureAt time.Time - rxPressureHits uint32 + rxRingFull uint32 + rxLogAt time.Time + rxLogHits uint32 + rxPressureAt time.Time + rxPressureHits uint32 + rxPumpGapAt time.Time + rxPumpGapHits uint32 + lastRXPumpAt time.Time + lastRXPumpMoved int + lastRXPumpDurMS int + lastRXPumpGapMS int // Single worker (reactor) for the port. ctx context.Context @@ -75,6 +83,14 @@ type serialRXDiagnostics interface { RXBufferCap() int } +type serialRXErrorDiagnostics interface { + RXDropCount() uint32 + RXOverrunCount() uint32 + RXBreakCount() uint32 + RXParityCount() uint32 + RXFramingCount() uint32 +} + // ---- Builder registration ---- func Builder() core.Builder { return builder{} } @@ -365,6 +381,34 @@ func (d *Device) logRingFullChange(s *session, force bool) { s.rxLogHits = hits } +func (d *Device) appendRXPumpFields(s *session, fields []otadiag.Field, now time.Time) []otadiag.Field { + if !s.lastRXPumpAt.IsZero() { + fields = append(fields, otadiag.KV("since_rx_pump_ms", int(now.Sub(s.lastRXPumpAt)/time.Millisecond))) + } + fields = append(fields, + otadiag.KV("last_pump_moved", s.lastRXPumpMoved), + otadiag.KV("last_pump_dur_ms", s.lastRXPumpDurMS), + ) + if s.lastRXPumpGapMS >= 0 { + fields = append(fields, otadiag.KV("last_pump_gap_ms", s.lastRXPumpGapMS)) + } + return fields +} + +func appendRXErrorFields(port core.SerialPort, fields []otadiag.Field) []otadiag.Field { + diag, ok := port.(serialRXErrorDiagnostics) + if !ok { + return fields + } + return append(fields, + otadiag.KV("rx_drops", diag.RXDropCount()), + otadiag.KV("rx_overrun", diag.RXOverrunCount()), + otadiag.KV("rx_break", diag.RXBreakCount()), + otadiag.KV("rx_parity", diag.RXParityCount()), + otadiag.KV("rx_framing", diag.RXFramingCount()), + ) +} + func (d *Device) logDriverPressure(s *session, force bool) { const minInterval = 1 * time.Second @@ -386,75 +430,96 @@ func (d *Device) logDriverPressure(s *session, force bool) { } hits := s.rxPressureHits + 1 + now := time.Now() if !force { - now := time.Now() if now.Sub(s.rxPressureAt) < minInterval { return } - s.rxPressureAt = now } else { - s.rxPressureAt = time.Now() + now = time.Now() } + s.rxPressureAt = now s.rxPressureHits = hits - println( - "[serial-raw]", "rx_driver_pressure", - "uart", d.a.Name, - "hits", strconvx.Utoa64(uint64(hits)), - "driver_used", strconvx.Itoa(used), - "driver_cap", strconvx.Itoa(capacity), - "ring_avail", strconvx.Itoa(s.rxRing.Available()), - "ring_space", strconvx.Itoa(s.rxRing.Space()), - "ring_cap", strconvx.Itoa(s.rxRing.Cap()), - ) -} - -func (d *Device) reactor(s *session) { - defer close(s.done) - - u := d.port - rxR := s.rxRing // UART -> app - txR := s.txRing // app -> UART - - for { - made := false + fields := []otadiag.Field{ + otadiag.KV("uart", d.a.Name), + otadiag.KV("hits", strconvx.Utoa64(uint64(hits))), + otadiag.KV("driver_used", used), + otadiag.KV("driver_cap", capacity), + otadiag.KV("ring_avail", s.rxRing.Available()), + otadiag.KV("ring_space", s.rxRing.Space()), + otadiag.KV("ring_cap", s.rxRing.Cap()), + } + fields = d.appendRXPumpFields(s, fields, now) + fields = appendRXErrorFields(d.port, fields) + otadiag.Event("[serial-raw]", "rx_driver_pressure", otadiag.XferNone, fields...) - if d.pumpRX(s, u, rxR, serialRawPumpRXBudget) { - made = true - } + if !s.lastRXPumpAt.IsZero() && now.Sub(s.lastRXPumpAt) >= serialRawPumpGapWarn { + d.logRXPumpGap(s, used, capacity, now) + } +} - if d.pumpTX(u, txR, serialRawPumpTXBudget) { - made = true - } +func (d *Device) logRXPumpGap(s *session, used, capacity int, now time.Time) { + const minInterval = 1 * time.Second - if made { - select { - case <-s.ctx.Done(): - d.logRingFullChange(s, true) - return - default: - } - runtime.Gosched() - continue - } + if now.Sub(s.rxPumpGapAt) < minInterval { + return + } + s.rxPumpGapAt = now + s.rxPumpGapHits++ + fields := []otadiag.Field{ + otadiag.KV("uart", d.a.Name), + otadiag.KV("hits", strconvx.Utoa64(uint64(s.rxPumpGapHits))), + otadiag.KV("driver_used", used), + otadiag.KV("driver_cap", capacity), + otadiag.KV("ring_avail", s.rxRing.Available()), + otadiag.KV("ring_space", s.rxRing.Space()), + otadiag.KV("ring_cap", s.rxRing.Cap()), + otadiag.KV("since_rx_pump_ms", int(now.Sub(s.lastRXPumpAt)/time.Millisecond)), + otadiag.KV("last_pump_moved", s.lastRXPumpMoved), + otadiag.KV("last_pump_dur_ms", s.lastRXPumpDurMS), + otadiag.KV("last_pump_gap_ms", s.lastRXPumpGapMS), + } + fields = appendRXErrorFields(d.port, fields) + otadiag.Event("[serial-raw]", "rx_pump_gap", otadiag.XferNone, fields...) +} - // Idle: wait for any edge, then re-check. - d.logRingFullChange(s, false) - select { - case <-s.ctx.Done(): - d.logRingFullChange(s, true) - return - case <-u.Readable(): - case <-u.Writable(): - case <-rxR.Writable(): - case <-txR.Readable(): - } +func (s *session) noteRXPump(moved int, started time.Time) { + if moved <= 0 { + return + } + now := time.Now() + gapMS := -1 + if !s.lastRXPumpAt.IsZero() { + gapMS = int(started.Sub(s.lastRXPumpAt) / time.Millisecond) + } + s.lastRXPumpAt = now + s.lastRXPumpMoved = moved + s.lastRXPumpDurMS = int(now.Sub(started) / time.Millisecond) + s.lastRXPumpGapMS = gapMS + if s.lastRXPumpGapMS < 0 { + s.lastRXPumpGapMS = 0 + } + if s.lastRXPumpDurMS >= 5 { + otadiag.Event( + "[serial-raw]", "rx_pump_slow", otadiag.XferNone, + otadiag.KV("moved", moved), + otadiag.KV("dur_ms", s.lastRXPumpDurMS), + otadiag.KV("gap_ms", s.lastRXPumpGapMS), + ) } } func (d *Device) pumpRX(s *session, u core.SerialPort, rxR *shmring.Ring, budget int) bool { + started := time.Now() moved := 0 + defer func() { + if moved > 0 { + s.noteRXPump(moved, started) + } + }() + for moved < budget { d.logDriverPressure(s, false) p1, p2 := rxR.WriteAcquire() @@ -494,6 +559,49 @@ func (d *Device) pumpRX(s *session, u core.SerialPort, rxR *shmring.Ring, budget return moved > 0 } +func (d *Device) reactor(s *session) { + defer close(s.done) + + u := d.port + rxR := s.rxRing // UART -> app + txR := s.txRing // app -> UART + + for { + made := false + + if d.pumpRX(s, u, rxR, serialRawPumpRXBudget) { + made = true + } + + if d.pumpTX(u, txR, serialRawPumpTXBudget) { + made = true + } + + if made { + select { + case <-s.ctx.Done(): + d.logRingFullChange(s, true) + return + default: + } + runtime.Gosched() + continue + } + + // Idle: wait for any edge, then re-check. + d.logRingFullChange(s, false) + select { + case <-s.ctx.Done(): + d.logRingFullChange(s, true) + return + case <-u.Readable(): + case <-u.Writable(): + case <-rxR.Writable(): + case <-txR.Readable(): + } + } +} + func (d *Device) pumpTX(u core.SerialPort, txR *shmring.Ring, budget int) bool { moved := 0 diff --git a/services/hal/devices/serial_raw/builder_test.go b/services/hal/devices/serial_raw/builder_test.go index e824e35..3b71105 100644 --- a/services/hal/devices/serial_raw/builder_test.go +++ b/services/hal/devices/serial_raw/builder_test.go @@ -2,12 +2,14 @@ package serial_raw import ( "context" + "strings" "sync" "sync/atomic" "testing" "time" "devicecode-go/services/hal/internal/core" + "devicecode-go/services/otadiag" "devicecode-go/types" ) @@ -20,6 +22,13 @@ type fakeSerialPort struct { readCalls atomic.Int32 maxReadLen atomic.Int32 maxWriteLen atomic.Int32 + rxBuffered atomic.Int32 + rxBufferCap atomic.Int32 + rxDrops atomic.Uint32 + rxOverrun atomic.Uint32 + rxBreak atomic.Uint32 + rxParity atomic.Uint32 + rxFraming atomic.Uint32 mu sync.Mutex written []byte @@ -35,6 +44,14 @@ func newFakeSerialPort() *fakeSerialPort { return p } +func (p *fakeSerialPort) RXBuffered() int { return int(p.rxBuffered.Load()) } +func (p *fakeSerialPort) RXBufferCap() int { return int(p.rxBufferCap.Load()) } +func (p *fakeSerialPort) RXDropCount() uint32 { return p.rxDrops.Load() } +func (p *fakeSerialPort) RXOverrunCount() uint32 { return p.rxOverrun.Load() } +func (p *fakeSerialPort) RXBreakCount() uint32 { return p.rxBreak.Load() } +func (p *fakeSerialPort) RXParityCount() uint32 { return p.rxParity.Load() } +func (p *fakeSerialPort) RXFramingCount() uint32 { return p.rxFraming.Load() } + func (p *fakeSerialPort) TryRead(dst []byte) int { p.readCalls.Add(1) recordMax(&p.maxReadLen, len(dst)) @@ -133,6 +150,52 @@ func waitUntil(t *testing.T, timeout time.Duration, pred func() bool) { t.Fatal("condition was not met before timeout") } +func TestDriverPressureLogIncludesPumpEvidence(t *testing.T) { + port := newFakeSerialPort() + port.rxBuffered.Store(128) + port.rxBufferCap.Store(128) + port.rxDrops.Store(3) + port.rxOverrun.Store(2) + port.rxFraming.Store(1) + dev := newTestDevice(port) + dev.startSession(512, 512) + defer dev.stopSession() + + s := dev.sess + s.lastRXPumpAt = time.Now().Add(-50 * time.Millisecond) + s.lastRXPumpMoved = 0 + s.lastRXPumpDurMS = 0 + s.lastRXPumpGapMS = 50 + + var lines []string + restore := otadiag.SetSinkForTest(func(line string) { + lines = append(lines, line) + }) + defer restore() + + dev.logDriverPressure(s, true) + + joined := strings.Join(lines, "\n") + for _, want := range []string{ + "[serial-raw]", + "ev rx_driver_pressure", + "uart uart1", + "driver_used 128", + "driver_cap 128", + "ring_space 512", + "since_rx_pump_ms", + "last_pump_gap_ms 50", + "rx_drops 3", + "rx_overrun 2", + "rx_framing 1", + "ev rx_pump_gap", + } { + if !strings.Contains(joined, want) { + t.Fatalf("pressure log missing %q:\n%s", want, joined) + } + } +} + func TestReactorServicesTXWhileRXIsContinuous(t *testing.T) { port := newFakeSerialPort() port.continuousRX.Store(true) diff --git a/services/hal/internal/provider/rp2_resources.go b/services/hal/internal/provider/rp2_resources.go index f047444..8967322 100644 --- a/services/hal/internal/provider/rp2_resources.go +++ b/services/hal/internal/provider/rp2_resources.go @@ -725,6 +725,41 @@ func (p *rp2SerialPort) RXBufferCap() int { return int(p.u.Buffer.Size()) } +func (p *rp2SerialPort) RXDropCount() uint32 { + if p.u == nil { + return 0 + } + return p.u.RXDropCount() +} + +func (p *rp2SerialPort) RXOverrunCount() uint32 { + if p.u == nil { + return 0 + } + return p.u.RXOverrunCount() +} + +func (p *rp2SerialPort) RXBreakCount() uint32 { + if p.u == nil { + return 0 + } + return p.u.RXBreakCount() +} + +func (p *rp2SerialPort) RXParityCount() uint32 { + if p.u == nil { + return 0 + } + return p.u.RXParityCount() +} + +func (p *rp2SerialPort) RXFramingCount() uint32 { + if p.u == nil { + return 0 + } + return p.u.RXFramingCount() +} + func (p *rp2SerialPort) SetBaudRate(br uint32) error { p.u.SetBaudRate(br); return nil } // Parity strings: "none","even","odd" diff --git a/third_party/tinygo-uartx/README.md b/third_party/tinygo-uartx/README.md new file mode 100644 index 0000000..83d52c5 --- /dev/null +++ b/third_party/tinygo-uartx/README.md @@ -0,0 +1,239 @@ +# tinygo-uartx + +Interrupt-driven UART for TinyGo with **blocking `io.Reader`/`io.Writer` semantics**, explicit non-blocking operations, and practical flush. Designed and tested on RP2040/RP2350 (PL011), with stubs for other TinyGo targets. + +> **Compatibility notes** +> * `uartx` **breaks** with TinyGo’s `machine.UART` behaviour: **`Read(p)` blocks** until at least one byte is available. If you need non-blocking reads, use `TryRead`. See “API differences” below. +> * **TinyGo 0.39:** this currently requires the single-core scheduler. Build with `-scheduler tasks`. + +--- + +## Why use `uartx`? + +* **Correct, event-driven TX/RX**: uses hardware FIFOs and IRQs; foreground code does not poll during normal operation. +* **Clear Go semantics**: implements `io.Reader`, `io.Writer`, `io.ByteReader`, `io.ByteWriter` (via `ReadByte`/`WriteByte`) and a simple `Flusher` (`Flush()`). +* **Concurrent composability**: coalesced readiness channels `Readable()` and `Writable()` integrate naturally with `select` for fast, back-pressure-aware pacing. +* **On-the-wire completion**: `Flush()` waits for software buffer empty **and** hardware FIFO empty **and** the transmitter to go idle. +* **Production-oriented**: liveness at TX start, no foreground/ISR races, correct ordering of RX error handling, and minimal timed polling only where the hardware offers no interrupt (TX idle). + +--- + +## Supported targets + +* RP2040 / RP2350 (`go:build rp2040 || rp2350`) with PL011. + Other arches are build-gated but not implemented here. + +--- + +## Install + +```bash +go get github.com/jangala-dev/tinygo-uartx/uartx +``` + +--- + +## Quick start (RP2040) + +```go +package main + +import ( + "machine" + "time" + + "github.com/jangala-dev/tinygo-uartx/uartx" +) + +func main() { + u0 := uartx.UART0 + u1 := uartx.UART1 + + // Wire: U0 TX=GP0 -> U1 RX=GP5, U1 TX=GP4 -> U0 RX=GP1 + _ = u0.Configure(uartx.UARTConfig{BaudRate: 230400, TX: machine.Pin(0), RX: machine.Pin(1)}) + _ = u1.Configure(uartx.UARTConfig{BaudRate: 230400, TX: machine.Pin(4), RX: machine.Pin(5)}) + + // Writer: block until bytes are accepted by driver (SW TX ring and/or HW FIFO). + msg := []byte("hello, world\n") + _, _ = u0.Write(msg) + + // Reader: Read blocks until at least 1 byte is available, returns n>0, nil. + buf := make([]byte, 64) + n, _ := u1.Read(buf) + _, _ = u1.Write(buf[:n]) // echo back + + // Ensure everything went on the wire. + _ = u0.Flush() + _ = u1.Flush() + + for { time.Sleep(time.Second) } +} +``` + +--- + +## API overview + +### Blocking I/O + +* `Read(p []byte) (int, error)` + Blocks until **at least one byte** is available; returns `n>0, nil`. Does **not** return `io.EOF` for an idle UART. + +* `Write(p []byte) (int, error)` + Blocks until **all bytes are accepted** by the driver (software TX buffer and/or HW FIFO). Does **not** wait for the line to drain; see `Flush`. + +* `ReadByte() (byte, error)` + Non-blocking single-byte read from the software RX buffer. Returns `errUARTBufferEmpty` if no data is available. + +* `WriteByte(b byte) error` + Blocks until the byte is accepted by the driver. + +### Non-blocking helpers + +* `TryRead(p []byte) int` + Returns immediately with up to `len(p)` bytes from the RX buffer. `0` means “no data now”. +* `TryWrite(p []byte) int` + Returns immediately with `0..len(p)` bytes accepted into HW FIFO and/or SW TX buffer. `0` means “no space now”. + +### Readiness (for `select`) + +* `Readable() <-chan struct{}` + Coalesced notification: a receive interrupt that enqueues ≥1 byte sends a token. You **must re-check** state after waking (level→edge coalescer). +* `Writable() <-chan struct{}` + Coalesced notification: TX progress/space. Sent when bytes move SW→HW or space appears. Also level-coalesced; re-check state after waking. + +### Flush + +* `Flush() error` + Blocks until software TX buffer is empty, the HW TX FIFO is empty, **and** the transmitter is not busy. + Note: PL011 does not raise an interrupt for the final “idle” edge, so `Flush` uses a short timed poll (scaled to baud) in addition to readiness wakes. + +### Buffer introspection + +* `Buffered() int` – bytes in the RX ring. +* `TxFree() int` – free space in the SW TX ring. + +### Interfaces satisfied + +* `io.Reader`, `io.Writer`, `io.ByteReader`, `io.ByteWriter` +* `Flusher` (package-local `Flush() error`) + +--- + +## API differences vs TinyGo `machine.UART` + +| Behaviour | `machine.UART` (TinyGo) | `uartx` | +| ---------------------- | ------------------------ | --------------------------------------------- | +| `Read(p)` | **Non-blocking** | **Blocking** until ≥1 byte | +| Non-blocking read | `Read(p)` | `TryRead(p) int` | +| Non-blocking write | implementation-dependent | `TryWrite(p) int` | +| Event readiness | varied | `Readable()`, `Writable()` coalesced channels | +| On-the-wire completion | `Write(p)` | `Flush()` (FIFO empty **and** line idle) | +| Internals | polling/IRQ mix | **IRQ-driven**; HW FIFOs; minimal timed poll | + +If you migrate from `machine.UART`, audit any paths that relied on `Read` being non-blocking. Use `TryRead` or `Readable()` with `select` for non-blocking behaviour. + +--- + +## Concurrent patterns + +### Producer with pacing + +```go +func writeAll(u *uartx.UART, p []byte) { + sent := 0 + for sent < len(p) { + if n := u.TryWrite(p[sent:]); n > 0 { + sent += n + continue + } + <-u.Writable() // wait for space/progress; then re-check + } +} +``` + +### Consumer with timeout + +```go +func readSome(ctx context.Context, u *uartx.UART, p []byte) (int, error) { + if n := u.TryRead(p); n > 0 { return n, nil } + for { + select { + case <-u.Readable(): + if n := u.TryRead(p); n > 0 { return n, nil } + case <-ctx.Done(): + return 0, ctx.Err() + } + } +} +``` + +### Duplex with `select` + +```go +func pump(uIn, uOut *uartx.UART, buf []byte) { + for { + select { + case <-uIn.Readable(): + if n := uIn.TryRead(buf); n > 0 { + writeAll(uOut, buf[:n]) + } + case <-uOut.Writable(): + // optional: send pending application data + } + } +} +``` + +--- + +## Behavioural notes (RP2040/RP2350) + +* **Interrupt model**: RX uses level/timeout; TX uses level. Steady-state writes to the HW FIFO are performed in the ISR. Foreground only seeds the FIFO at TX start or in a guarded “masked kick” corner case; this avoids reordering. +* **Error handling**: framing/parity/overrun bytes are dropped on RX (read clears per-byte flags); sticky error status is cleared after draining. +* **Flush**: requires SW TX empty, FIFO empty and transmitter not busy. The final idle edge is not interrupt-driven on PL011; a short timed poll is used. + +--- + +## Example: integrity test (excerpt) + +```go +// Sender +func sendPattern(ctx context.Context, u *uartx.UART, gen func(int) byte, n int) error { + var buf [192]byte + for i := 0; i < n; { + k := len(buf) + if n-i < k { k = n - i } + for j := 0; j < k; j++ { buf[j] = gen(i+j) } + if _, err := sendAll(ctx, u, buf[:k]); err != nil { return err } + i += k + } + return nil +} + +func sendAll(ctx context.Context, u *uartx.UART, p []byte) (int, error) { + sent := 0 + for sent < len(p) { + if n := u.TryWrite(p[sent:]); n > 0 { sent += n; continue } + select { + case <-u.Writable(): + case <-ctx.Done(): return sent, ctx.Err() + } + } + return sent, nil +} +``` + +--- + +## Limitations and future work + +* Only RP2040/RP2350 implementation is included at present. +* RX overflow is dropped silently by default; add counters if required for diagnostics. +* CTS/RTS flow control is enabled only if both pins are configured; application-level tests advised. + +--- + +## Licence + +MIT, see `LICENCE` file. diff --git a/third_party/tinygo-uartx/go.mod b/third_party/tinygo-uartx/go.mod new file mode 100644 index 0000000..2f8612a --- /dev/null +++ b/third_party/tinygo-uartx/go.mod @@ -0,0 +1,3 @@ +module github.com/jangala-dev/tinygo-uartx + +go 1.25.0 diff --git a/third_party/tinygo-uartx/uartx/export.go b/third_party/tinygo-uartx/uartx/export.go new file mode 100644 index 0000000..716621f --- /dev/null +++ b/third_party/tinygo-uartx/uartx/export.go @@ -0,0 +1,20 @@ +// uartx/export.go + +//go:build atmega || esp || nrf || sam || sifive || stm32 || k210 || nxp || rp2040 || rp2350 + +package uartx + +import "machine" + +type UARTConfig = machine.UARTConfig +type Pin = machine.Pin + +const ( + NoPin = machine.NoPin + UART_TX_PIN = machine.UART_TX_PIN + UART_RX_PIN = machine.UART_RX_PIN + UART0_RX_PIN = machine.UART0_RX_PIN + UART0_TX_PIN = machine.UART0_TX_PIN + UART1_TX_PIN = machine.UART1_TX_PIN + UART1_RX_PIN = machine.UART1_RX_PIN +) diff --git a/third_party/tinygo-uartx/uartx/ringbuffer.go b/third_party/tinygo-uartx/uartx/ringbuffer.go new file mode 100644 index 0000000..303b9d6 --- /dev/null +++ b/third_party/tinygo-uartx/uartx/ringbuffer.go @@ -0,0 +1,63 @@ +// uartx/ringbuffer.go + +//go:build atmega || esp || nrf || sam || sifive || stm32 || k210 || nxp || rp2040 || rp2350 + +// An API-compatible replacement for machine.RingBuffer with an added Size() method. +// Methods and semantics match TinyGo's implementation. + +package uartx + +import "runtime/volatile" + +// Choose a power-of-two size for efficient modulo. +const bufferSize uint8 = 128 + +// RingBuffer is a byte ring buffer compatible with TinyGo's machine.RingBuffer. +type RingBuffer struct { + rxbuffer [bufferSize]volatile.Register8 + head volatile.Register8 + tail volatile.Register8 +} + +// NewRingBuffer returns a new ring buffer. +func NewRingBuffer() *RingBuffer { + return &RingBuffer{} +} + +// Size returns the total capacity of the buffer in bytes. +func (rb *RingBuffer) Size() uint8 { + return bufferSize +} + +// Used returns how many bytes in buffer have been used. +func (rb *RingBuffer) Used() uint8 { + return uint8(rb.head.Get() - rb.tail.Get()) +} + +// Put stores a byte in the buffer. If the buffer is already full, it returns false. +func (rb *RingBuffer) Put(val byte) bool { + if rb.Used() == bufferSize { // full + return false + } + h := rb.head.Get() + rb.rxbuffer[(h+1)%bufferSize].Set(val) // 1) write data + rb.head.Set(h + 1) // 2) publish + return true +} + +// Get returns a byte from the buffer. If the buffer is empty, it returns (0, false). +func (rb *RingBuffer) Get() (byte, bool) { + if rb.Used() == 0 { + return 0, false + } + t := rb.tail.Get() + v := rb.rxbuffer[(t+1)%bufferSize].Get() // 1) read current element + rb.tail.Set(t + 1) // 2) publish consumption + return v, true +} + +// Clear resets the head and tail pointers to zero. +func (rb *RingBuffer) Clear() { + rb.head.Set(0) + rb.tail.Set(0) +} diff --git a/third_party/tinygo-uartx/uartx/rp2.go b/third_party/tinygo-uartx/uartx/rp2.go new file mode 100644 index 0000000..4c3414e --- /dev/null +++ b/third_party/tinygo-uartx/uartx/rp2.go @@ -0,0 +1,42 @@ +// uartx/machine_rp2.go + +//go:build rp2040 || rp2350 + +package uartx + +import ( + "device/rp" + "runtime/interrupt" +) + +const deviceName = rp.Device + +// UART on the RP2040/RP2035 +var ( + UART0 = &_UART0 + _UART0 = UART{ + Bus: rp.UART0, + // RX + Buffer: NewRingBuffer(), + notify: make(chan struct{}, 1), + // TX + TxBuffer: NewRingBuffer(), + txNotify: make(chan struct{}, 1), + } + + UART1 = &_UART1 + _UART1 = UART{ + Bus: rp.UART1, + // RX + Buffer: NewRingBuffer(), + notify: make(chan struct{}, 1), + // TX + TxBuffer: NewRingBuffer(), + txNotify: make(chan struct{}, 1), + } +) + +func init() { + UART0.Interrupt = interrupt.New(rp.IRQ_UART0_IRQ, _UART0.handleInterrupt) + UART1.Interrupt = interrupt.New(rp.IRQ_UART1_IRQ, _UART1.handleInterrupt) +} diff --git a/third_party/tinygo-uartx/uartx/rp2_uart.go b/third_party/tinygo-uartx/uartx/rp2_uart.go new file mode 100644 index 0000000..9a6b974 --- /dev/null +++ b/third_party/tinygo-uartx/uartx/rp2_uart.go @@ -0,0 +1,405 @@ +// uartx/rp2_uart.go +//go:build rp2040 || rp2350 + +// Package uartx provides a minimal, interrupt-driven UART for RP2040/RP2350 +// built around the PL011. Steady-state transmission is ISR-driven; the +// foreground only seeds the HW FIFO when it safely owns the TX start (TXIM +// masked) or performs a short “masked kick” when TXIM is enabled but no IRQ is +// pending and the FIFO is empty. This avoids foreground/ISR interleave while +// guaranteeing forward progress. +package uartx + +import ( + "device/rp" + "errors" + "machine" + "runtime/interrupt" + "runtime/volatile" +) + +// UART represents a single PL011 instance on RP2040/RP2350. +// Invariants (TX path): +// - Steady-state writer to UARTDR is the ISR. +// - Foreground writes UARTDR only when TXIM is masked (we own the start) or +// in the “masked kick” corner case (TXIM enabled, no TX IRQ pending, FIFO empty). +// +// Signalling: +// - txNotify is coalesced and used both for “writable” (space/progress) and +// a final “drained” edge (SW empty and TXFE==1). Callers must re-check state. +type UART struct { + // RX + Buffer *RingBuffer // software RX ring (compatible with TinyGo semantics) + Bus *rp.UART0_Type // PL011 register block + + // TX + TxBuffer *RingBuffer // software TX ring drained by the ISR + txNotify chan struct{} // coalesced TX readiness/drain notifications + + Interrupt interrupt.Interrupt + notify chan struct{} // coalesced RX readiness notifications + + baud uint32 // last configured baud (for diagnostics, not used by HW) + + rxDrops volatile.Register32 + rxOverruns volatile.Register32 + rxBreaks volatile.Register32 + rxParity volatile.Register32 + rxFraming volatile.Register32 +} + +// Configure sets up the PL011, its pins and interrupts. It leaves RXIM/RTIM +// enabled and TXIM masked (enabled on demand by attemptSend). +func (uart *UART) Configure(cfg machine.UARTConfig) error { + initUART(uart) + + if cfg.BaudRate == 0 { + cfg.BaudRate = 115200 + } + uart.baud = cfg.BaudRate + + if cfg.TX == machine.NoPin && cfg.RX == machine.NoPin { + cfg.TX = machine.UART_TX_PIN + cfg.RX = machine.UART_RX_PIN + } + + // 1) Disable UART while configuring (PL011 CR). + uart.Bus.UARTCR.ClearBits(rp.UART0_UARTCR_UARTEN | rp.UART0_UARTCR_RXE | rp.UART0_UARTCR_TXE) + + // 2) Mux pins before touching baud/format. + if cfg.TX != machine.NoPin { + cfg.TX.Configure(machine.PinConfig{Mode: machine.PinUART}) + } + if cfg.RX != machine.NoPin { + cfg.RX.Configure(machine.PinConfig{Mode: machine.PinUART}) + } + if cfg.RTS != machine.NoPin { + cfg.RTS.Configure(machine.PinConfig{Mode: machine.PinUART}) + } + if cfg.CTS != machine.NoPin { + cfg.CTS.Configure(machine.PinConfig{Mode: machine.PinUART}) + } + + // 3) Baud and format. SetFormat does a full LCR_H write including FEN. + uart.SetBaudRate(cfg.BaudRate) // includes the “dummy” LCR_H write required by PL011 + _ = uart.SetFormat(8, 1, ParityNone) // default 8N1 with FIFOs enabled + + // 4) Clear any pending IRQs and purge RX FIFO (read until RXFE). + uart.Bus.UARTICR.Set(0x7FF) // clear all PL011 interrupts + for !uart.Bus.UARTFR.HasBits(rp.UART0_UARTFR_RXFE) { + _ = uart.Bus.UARTDR.Get() + } + // Clear sticky RX errors (ECR share-address via RSR). + uart.Bus.UARTRSR.Set(0) + + // 5) Enable UART and optional flow control (only if both pins valid). + settings := uint32(rp.UART0_UARTCR_UARTEN | rp.UART0_UARTCR_RXE | rp.UART0_UARTCR_TXE) + if cfg.RTS != machine.NoPin && cfg.CTS != machine.NoPin { + settings |= rp.UART0_UARTCR_RTSEN | rp.UART0_UARTCR_CTSEN + } + uart.Bus.UARTCR.Set(settings) + + // 6) IRQ configuration: leave TXIM masked; enable RXIM and RTIM. + uart.Interrupt.SetPriority(0x80) + uart.Interrupt.Enable() + // IFLS=0 sets RX/TX thresholds to 1/8. That minimises latency and increases + // IRQ rate. Consider RX=1/2, TX=1/8 if you prefer fewer IRQs during RX bursts. + uart.Bus.UARTIFLS.Set(0) + uart.Bus.UARTIMSC.Set(rp.UART0_UARTIMSC_RXIM | rp.UART0_UARTIMSC_RTIM) // TXIM is enabled by attemptSend + + // 7) Prime initial “writable” notification (FIFO starts empty). + select { + case uart.txNotify <- struct{}{}: + default: + } + + return nil +} + +// SetBaudRate programs the PL011 integer and fractional divisors and performs +// the “dummy” LCR_H write required to latch them. +func (uart *UART) SetBaudRate(br uint32) { + uart.baud = br + div := 8 * machine.CPUFrequency() / br + + ibrd := div >> 7 + var fbrd uint32 + switch { + case ibrd == 0: + ibrd = 1 + fbrd = 0 + case ibrd >= 65535: + ibrd = 65535 + fbrd = 0 + default: + fbrd = ((div & 0x7f) + 1) / 2 + } + + uart.Bus.UARTIBRD.Set(ibrd) + uart.Bus.UARTFBRD.Set(fbrd) + + // PL011 requires an LCR_H write after changing divisors. + uart.Bus.UARTLCR_H.Set(uart.Bus.UARTLCR_H.Get()) +} + +// SetFormat sets data bits, stop bits and parity, and enables the FIFOs. +// It writes the full LCR_H value (not OR-ing). +func (uart *UART) SetFormat(databits, stopbits uint8, parity UARTParity) error { + if databits < 5 || databits > 8 { + return errors.New("invalid databits") + } + if stopbits != 1 && stopbits != 2 { + return errors.New("invalid stopbits") + } + + var pen, pev uint32 + if parity != ParityNone { + pen = rp.UART0_UARTLCR_H_PEN + if parity == ParityEven { + pev = rp.UART0_UARTLCR_H_EPS + } + } + const fen = rp.UART0_UARTLCR_H_FEN + + val := uint32((databits-5)<HW when required, sourcing bytes from the SW ring. + + // Case A: TXIM masked — we own TX start; seed FIFO directly from SW ring. + if !uart.Bus.UARTIMSC.HasBits(rp.UART0_UARTIMSC_TXIM) { + for !uart.Bus.UARTFR.HasBits(rp.UART0_UARTFR_TXFF) { + b, ok := uart.TxBuffer.Get() + if !ok { + break + } + uart.Bus.UARTDR.Set(uint32(b)) + } + // Arm TX interrupts; ISR takes over steady-state. + uart.Bus.UARTIMSC.SetBits(rp.UART0_UARTIMSC_TXIM) + return n + } + + // Case B: only kick if we can actually write ≥1 byte now. + if uart.Bus.UARTIMSC.HasBits(rp.UART0_UARTIMSC_TXIM) && + !uart.Bus.UARTMIS.HasBits(rp.UART0_UARTMIS_TXMIS) && + !uart.Bus.UARTFR.HasBits(rp.UART0_UARTFR_TXFF) && // FIFO has space + (uart.TxBuffer.Used() > 0) { // something to seed + + // Prevent ISR interleave while we start TX. + uart.Bus.UARTIMSC.ClearBits(rp.UART0_UARTIMSC_TXIM) + + // Seed from SW ring until FIFO becomes full or SW empties. + for !uart.Bus.UARTFR.HasBits(rp.UART0_UARTFR_TXFF) { + b, ok := uart.TxBuffer.Get() + if !ok { + break + } + uart.Bus.UARTDR.Set(uint32(b)) + } + + // Re-enable TX level interrupts. Next drop to/under IFLS will raise TX IRQ. + uart.Bus.UARTIMSC.SetBits(rp.UART0_UARTIMSC_TXIM) + + // Could perform a userland nudge here - better for latency but untested. + // select { case uart.txNotify <- struct{}{}: default: } + } + + return n +} + +// enableTxIRQ ensures TX level interrupts are unmasked. +func (uart *UART) enableTxIRQ() { + uart.Bus.UARTIMSC.SetBits(rp.UART0_UARTIMSC_TXIM) +} + +// txFifoEmpty reports TXFE (FIFO empty). Note: PL011 does not raise an +// interrupt for TXFE alone; callers must poll if they require the exact edge. +func (uart *UART) txFifoEmpty() bool { + return uart.Bus.UARTFR.HasBits(rp.UART0_UARTFR_TXFE) +} + +// txLineIdle reports FR.BUSY==0 (shifter idle). This bit also does not raise +// an interrupt; it is only polled where precise “all bits out” is required. +func (uart *UART) txLineIdle() bool { + return !uart.Bus.UARTFR.HasBits(rp.UART0_UARTFR_BUSY) +} + +// tryWriteHW opportunistically pushes into the HW FIFO until TXFF. +func (uart *UART) tryWriteHW(p []byte) int { + i := 0 + for i < len(p) && !uart.Bus.UARTFR.HasBits(rp.UART0_UARTFR_TXFF) { + uart.Bus.UARTDR.Set(uint32(p[i])) + i++ + } + return i +} + +// enqueueTX inserts into the software TX ring until full. +func (uart *UART) enqueueTX(p []byte) int { + i := 0 + for i < len(p) { + if ok := uart.TxBuffer.Put(p[i]); !ok { + break + } + i++ + } + return i +} + +// --- RX/TX ISR --- + +// handleInterrupt services RX level/timeout and TX level interrupts. +// RX: drain DR until RXFE, dropping errored bytes (read clears per-byte flags), +// +// clear RXIC/RTIC, clear sticky errors (RSR/ECR), coalesce a Readable() wake. +// +// TX: while !TXFF, move SW→HW; coalesce a Writable() wake; when SW empty and TXFE, +// +// coalesce a final “drained” wake and mask TXIM; clear TXIC. +func (uart *UART) handleInterrupt(interrupt.Interrupt) { + mis := uart.Bus.UARTMIS.Get() + + // RX path (RX level or RX timeout). + if (mis & (rp.UART0_UARTMIS_RXMIS | rp.UART0_UARTMIS_RTMIS)) != 0 { + + // In the ISR, only notify if at least one byte was enqueued. + enq := 0 + for !uart.Bus.UARTFR.HasBits(rp.UART0_UARTFR_RXFE) { + r := uart.Bus.UARTDR.Get() + errs := r & (rp.UART0_UARTDR_OE | rp.UART0_UARTDR_BE | + rp.UART0_UARTDR_PE | rp.UART0_UARTDR_FE) + if errs != 0 { + uart.noteRXErrors(errs) + // Drop errored byte; reading DR clears the per-byte error flags. + continue + } + if uart.Buffer.Put(byte(r & 0xFF)) { + enq++ + } else { + incrementRegister32(&uart.rxDrops) + } + } + + // Clear RX level and RX timeout sources, then sticky errors. + uart.Bus.UARTICR.Set(rp.UART0_UARTICR_RXIC | rp.UART0_UARTICR_RTIC) + uart.Bus.UARTRSR.Set(0) + + // Coalesce a Readable notification. + if enq > 0 { + select { + case uart.notify <- struct{}{}: + default: + } + } + } + + // TX path (TX level). + if mis&rp.UART0_UARTMIS_TXMIS != 0 { + + // Move bytes from SW buffer to HW FIFO. + for !uart.Bus.UARTFR.HasBits(rp.UART0_UARTFR_TXFF) { + b, ok := uart.TxBuffer.Get() + if !ok { + break + } + uart.Bus.UARTDR.Set(uint32(b)) + } + + // Coalesce a Writable notification (space/progress). + select { + case uart.txNotify <- struct{}{}: + default: + } + + // If SW buffer empty, manage the tail. + if uart.TxBuffer.Used() == 0 { + // When FIFO is now empty, emit a final “drained” notification and mask TXIM. + if uart.txFifoEmpty() { + select { + case uart.txNotify <- struct{}{}: + default: + } + uart.Bus.UARTIMSC.ClearBits(rp.UART0_UARTIMSC_TXIM) + } + } + + // Clear TX interrupt. + uart.Bus.UARTICR.Set(rp.UART0_UARTICR_TXIC) + } +} + +func (uart *UART) noteRXErrors(errs uint32) { + if errs&rp.UART0_UARTDR_OE != 0 { + incrementRegister32(&uart.rxOverruns) + } + if errs&rp.UART0_UARTDR_BE != 0 { + incrementRegister32(&uart.rxBreaks) + } + if errs&rp.UART0_UARTDR_PE != 0 { + incrementRegister32(&uart.rxParity) + } + if errs&rp.UART0_UARTDR_FE != 0 { + incrementRegister32(&uart.rxFraming) + } +} + +func incrementRegister32(r *volatile.Register32) { + r.Set(r.Get() + 1) +} diff --git a/third_party/tinygo-uartx/uartx/uartx.go b/third_party/tinygo-uartx/uartx/uartx.go new file mode 100644 index 0000000..9757304 --- /dev/null +++ b/third_party/tinygo-uartx/uartx/uartx.go @@ -0,0 +1,186 @@ +// uartx/uartx.go + +//go:build atmega || esp || nrf || sam || sifive || stm32 || k210 || nxp || rp2040 || rp2350 + +// Package uartx provides an interrupt-driven UART driver with blocking io.Reader/io.Writer +// semantics and explicit non-blocking and flush operations. Write blocks until data is +// accepted by the driver (software TX buffer and/or hardware FIFO). Flush provides an +// explicit “on the wire” completion. +package uartx + +import ( + "errors" + "time" +) + +// Flusher is implemented by types that can flush buffered output to the underlying device. +type Flusher interface{ Flush() error } + +var errUARTBufferEmpty = errors.New("UART buffer empty") + +// UARTParity defines the parity setting used for UART communication. +type UARTParity uint8 + +const ( + // ParityNone disables parity generation and checking (the most common setting). + ParityNone UARTParity = iota + // ParityEven sets even parity (total number of 1 bits is even). + ParityEven + // ParityOdd sets odd parity (total number of 1 bits is odd). + ParityOdd +) + +// Readable returns a coalesced notification for RX readiness. +// A receive interrupt that enqueues one or more bytes will send on this channel. +// The channel is level-coalesced; callers must re-check state after waking. +func (uart *UART) Readable() <-chan struct{} { return uart.notify } + +// Writable returns a coalesced notification for TX progress or space. +// The driver sends on this channel when it moves bytes from software to hardware +// or when space becomes available. The channel is level-coalesced; callers must +// re-check state after waking. +func (uart *UART) Writable() <-chan struct{} { return uart.txNotify } + +// TryRead returns immediately with up to len(p) bytes copied from the RX buffer. +// It never blocks and never returns an error. A return value of 0 means “no data now”. +func (uart *UART) TryRead(p []byte) int { + if len(p) == 0 { + return 0 + } + n := 0 + // Read at most len(p) bytes; stop on first empty. + for n < len(p) { + b, err := uart.ReadByte() + if err != nil { + break + } + p[n] = b + n++ + } + return n +} + +// Read implements io.Reader. It blocks until at least one byte is available, +// then returns n>0, nil. It does not return io.EOF for an idle UART. +func (uart *UART) Read(p []byte) (int, error) { + if len(p) == 0 { + return 0, nil + } + // Fast path: anything already buffered? + if n := uart.TryRead(p); n > 0 { + return n, nil + } + // Block until notified, then retry. + for { + select { + case <-uart.Readable(): // coalesced wake-up; must re-check + if n := uart.TryRead(p); n > 0 { + return n, nil + } + // Spurious or lost race; loop back to wait. + } + } +} + +// ReadByte reads a single byte from the software RX buffer. +// If there is no data available, it returns errUARTBufferEmpty. +func (uart *UART) ReadByte() (byte, error) { + buf, ok := uart.Buffer.Get() + if !ok { + return 0, errUARTBufferEmpty + } + return buf, nil +} + +// TryWrite returns immediately with 0..len(p) bytes accepted into the hardware FIFO +// and/or the software TX buffer. It never blocks and never returns an error. +// A return value of 0 means “no space now”. +func (uart *UART) TryWrite(p []byte) int { + return uart.attemptSend(p) // non-blocking enqueue path (defined elsewhere) +} + +// WriteByte writes a single byte. It blocks until the byte is accepted by the driver +// (queued to software TX and/or hardware FIFO). It does not wait for the UART to drain. +func (uart *UART) WriteByte(c byte) error { + _, err := uart.Write([]byte{c}) + return err +} + +// Writev writes the provided buffers in sequence with the same blocking behaviour as Write. +// It stops on the first error and returns the total number of bytes accepted up to that point. +func (u *UART) Writev(bufs ...[]byte) (int, error) { + sent := 0 + for _, p := range bufs { + n, err := u.Write(p) + sent += n + if err != nil { + return sent, err + } + } + return sent, nil +} + +// Write implements io.Writer. It blocks until all bytes in p have been accepted by +// the driver (queued to software TX and/or hardware FIFO). Write does not wait for +// the UART to drain; use Flush for on-the-wire completion. +func (uart *UART) Write(p []byte) (int, error) { + sent := 0 + for sent < len(p) { + n := uart.TryWrite(p[sent:]) + if n > 0 { + sent += n + continue + } + // Wait for TX progress (space created or drain) then retry. + <-uart.txNotify + } + return sent, nil +} + +// Flush blocks until all queued bytes have left the PL011: the software TX buffer is empty, +// the TX FIFO is empty, and the line is not busy. Because PL011 does not interrupt on BUSY +// deassertion, Flush uses a short timed poll in addition to txNotify wakes. +func (uart *UART) Flush() error { + // A small polling interval. 40–60 µs works well across common bauds. + tick := uart.drainTick() + for { + if uart.TxBuffer.Used() == 0 && uart.txFifoEmpty() && uart.txLineIdle() { + return nil + } + // Wake promptly if the ISR made progress; otherwise fall back to a short tick. + select { + case <-uart.txNotify: + // Progress likely occurred; loop and re-check. + case <-time.After(tick): + } + } +} + +// drainTick returns a short polling interval for Flush based on the configured baud. +// The value is approximately two character times at 8N1, with a lower bound to avoid zero. +func (uart *UART) drainTick() time.Duration { + if uart.baud == 0 { + return 50 * time.Microsecond + } + // ~2 character times at 8N1 (10 bits/char). + perBit := time.Second / time.Duration(uart.baud) + t := 2 * 10 * perBit + if t < 20*time.Microsecond { + t = 20 * time.Microsecond + } + return t +} + +// Buffered returns the number of bytes currently stored in the software RX buffer. +func (uart *UART) Buffered() int { + return int(uart.Buffer.Used()) +} + +// TxFree returns the remaining space in the software TX buffer in bytes. +func (u *UART) TxFree() int { return int(u.TxBuffer.Size() - u.TxBuffer.Used()) } + +// Receive inserts one byte into the software RX buffer. It is intended to be called +// by the UART interrupt handler when a new byte has been read from the hardware FIFO. +func (uart *UART) Receive(data byte) { + uart.Buffer.Put(data) +} From 58b119928494949a311503abc65c9674922989d0 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 4 Jun 2026 11:03:25 +0000 Subject: [PATCH 6/6] otadiag: gate verbose OTA breadcrumbs --- services/otadiag/otadiag.go | 83 ++++++++++++++++++++++++++++ services/otadiag/otadiag_test.go | 94 ++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+) create mode 100644 services/otadiag/otadiag_test.go diff --git a/services/otadiag/otadiag.go b/services/otadiag/otadiag.go index 2c0e3b5..b31999b 100644 --- a/services/otadiag/otadiag.go +++ b/services/otadiag/otadiag.go @@ -18,6 +18,7 @@ type Field struct { var ( startedAt = time.Now() nextSeq atomic.Uint64 + verbose atomic.Bool sinkMu sync.Mutex sink func(string) @@ -40,6 +41,9 @@ func KV(key string, value any) Field { } func Event(prefix, event, xferID string, fields ...Field) { + if !allowEvent(prefix, event, fields) { + return + } if xferID == "" { xferID = XferNone } @@ -71,15 +75,26 @@ func Event(prefix, event, xferID string, fields ...Field) { func SetSinkForTest(fn func(string)) func() { sinkMu.Lock() prev := sink + prevVerbose := verbose.Load() sink = fn + verbose.Store(true) sinkMu.Unlock() return func() { sinkMu.Lock() sink = prev + verbose.Store(prevVerbose) sinkMu.Unlock() } } +func SetVerboseForTest(enabled bool) func() { + prev := verbose.Load() + verbose.Store(enabled) + return func() { + verbose.Store(prev) + } +} + func SetUpdaterSnapshot(s StageSnapshot) { if s.XferID == "" { s.XferID = XferNone @@ -198,6 +213,74 @@ func emit(line string) { println(line) } +func allowEvent(prefix, event string, fields []Field) bool { + if verbose.Load() { + return true + } + switch prefix { + case "[serial-raw]", "[fabric-rx]", "[fabric-rpc]", "[fabric-handshake]": + return true + case "[mcu-ota]": + return event == "heartbeat_start" || event == "heartbeat_stop" + case "[fabric-xfer]": + return allowFabricXferEvent(event, fields) + case "[updater-stream]": + return allowUpdaterStreamEvent(event) + default: + return true + } +} + +func allowFabricXferEvent(event string, fields []Field) bool { + if strings.HasPrefix(event, "begin_") { + return true + } + switch event { + case "abort_local", "abort_tx", "done_tx", "ready_tx", "malformed_retry", "transfer_mem_sample": + return true + case "need_tx": + next := fieldValue(fields, "next") + return next == "0" || + fieldValue(fields, "ok") == "false" || + fieldValue(fields, "duplicate") == "true" || + fieldValue(fields, "retry") == "true" || + fieldValue(fields, "skipped") != "" + case "chunk_decode_done", "chunk_digest_done": + return fieldValue(fields, "ok") == "false" + case "chunk_stale_offset", "chunk_future_offset", "chunk_size_overflow", + "sink_write_error", "chunk_write_error": + return true + default: + return strings.HasSuffix(event, "_error") || + strings.Contains(event, "reject") || + strings.Contains(event, "abort") + } +} + +func allowUpdaterStreamEvent(event string) bool { + if strings.HasPrefix(event, "prepare_") { + return true + } + switch event { + case "begin_entry", "lease_ok", "begin_exit", + "flash_erase_start", "flash_erase_done", "flash_erase_error", + "flash_program_error", "program_page_error": + return true + default: + return strings.HasSuffix(event, "_error") || + strings.Contains(event, "reject") + } +} + +func fieldValue(fields []Field, key string) string { + for _, f := range fields { + if f.Key == key { + return f.Value + } + } + return "" +} + func blank(v string) string { if v == "" { return XferNone diff --git a/services/otadiag/otadiag_test.go b/services/otadiag/otadiag_test.go new file mode 100644 index 0000000..590d93f --- /dev/null +++ b/services/otadiag/otadiag_test.go @@ -0,0 +1,94 @@ +package otadiag + +import ( + "strings" + "testing" +) + +func TestDefaultFilterSuppressesHighVolumeEvents(t *testing.T) { + lines, restore := captureDefaultFilteredEvents() + defer restore() + + Event("[mcu-ota]", "heartbeat", "xfer-1", KV("state", "receiving")) + Event("[fabric-xfer]", "chunk_rx", "xfer-1", KV("offset", 0), KV("expected", 0)) + Event("[fabric-xfer]", "chunk_decode_done", "xfer-1", KV("ok", true), KV("raw_len", 512)) + Event("[fabric-xfer]", "chunk_digest_done", "xfer-1", KV("ok", true)) + Event("[fabric-xfer]", "sink_write_start", "xfer-1", KV("offset", 0)) + Event("[fabric-xfer]", "sink_write_done", "xfer-1", KV("next", 512)) + Event("[fabric-xfer]", "gc_done", "xfer-1", KV("next", 512)) + Event("[updater-stream]", "stream_write_start", "xfer-1", KV("len", 512)) + Event("[updater-stream]", "image_signature_verify_start", "xfer-1", KV("manifest_len", 192)) + Event("[updater-stream]", "flash_program_start", "xfer-1", KV("offset", 0)) + + if got := strings.Join(*lines, "\n"); got != "" { + t.Fatalf("default filter emitted high-volume diagnostics:\n%s", got) + } +} + +func TestDefaultFilterKeepsActionableEvents(t *testing.T) { + lines, restore := captureDefaultFilteredEvents() + defer restore() + + Event("[serial-raw]", "rx_driver_pressure", XferNone, KV("uart", "uart0")) + Event("[fabric-rx]", "read_line", XferNone, KV("type", "ping")) + Event("[fabric-rpc]", "sent", XferNone, KV("call_id", "call-1")) + Event("[mcu-ota]", "heartbeat_start", "xfer-1", KV("reason", "prepare")) + Event("[mcu-ota]", "heartbeat_stop", "xfer-1", KV("reason", "done")) + Event("[fabric-xfer]", "begin_rx", "xfer-1", KV("target", "updater/main")) + Event("[fabric-xfer]", "ready_tx", "xfer-1", KV("ok", true)) + Event("[fabric-xfer]", "need_tx", "xfer-1", KV("next", 0), KV("ok", true)) + Event("[fabric-xfer]", "chunk_digest_done", "xfer-1", KV("ok", false), KV("reason", "chunk_digest_mismatch")) + Event("[fabric-xfer]", "sink_write_error", "xfer-1", KV("reason", "write_boom")) + Event("[updater-stream]", "prepare_rx", XferNone, KV("job_id", "job-1")) + Event("[updater-stream]", "flash_erase_start", "xfer-1", KV("offset", 0)) + Event("[updater-stream]", "image_signature_verify_error", "xfer-1", KV("reason", "bad_signature")) + + got := strings.Join(*lines, "\n") + for _, want := range []string{ + "[serial-raw]", "[fabric-rx]", "[fabric-rpc]", + "ev heartbeat_start", "ev heartbeat_stop", + "ev begin_rx", "ev ready_tx", "ev need_tx", + "ev chunk_digest_done", "ev sink_write_error", + "ev prepare_rx", "ev flash_erase_start", "ev image_signature_verify_error", + } { + if !strings.Contains(got, want) { + t.Fatalf("default filter output missing %q:\n%s", want, got) + } + } +} + +func TestSinkForTestEnablesVerboseDiagnostics(t *testing.T) { + var lines []string + restore := SetSinkForTest(func(line string) { + lines = append(lines, line) + }) + defer restore() + + Event("[fabric-xfer]", "chunk_rx", "xfer-1", KV("offset", 0), KV("expected", 0)) + Event("[updater-stream]", "image_signature_verify_start", "xfer-1", KV("manifest_len", 192)) + + got := strings.Join(lines, "\n") + for _, want := range []string{"ev chunk_rx", "ev image_signature_verify_start"} { + if !strings.Contains(got, want) { + t.Fatalf("verbose test sink output missing %q:\n%s", want, got) + } + } +} + +func captureDefaultFilteredEvents() (*[]string, func()) { + lines := []string{} + sinkMu.Lock() + prevSink := sink + prevVerbose := verbose.Load() + sink = func(line string) { + lines = append(lines, line) + } + verbose.Store(false) + sinkMu.Unlock() + return &lines, func() { + sinkMu.Lock() + sink = prevSink + verbose.Store(prevVerbose) + sinkMu.Unlock() + } +}