diff --git a/README.md b/README.md index 5d77520..a94e2d6 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ tinygo flash -stack-size=3KB -monitor -scheduler tasks -target=pico -tags "pico_bb_proto_1" main.go ## Flashing ISOC Power Board via USB port on Pico2 -tinygo flash -stack-size=8KB -monitor -scheduler tasks -target=pico2 -tags "pico_bb_proto_1" main.go +tinygo flash -stack-size=3KB -monitor -scheduler tasks -target=pico2 -tags "pico_bb_proto_1" main.go ------------------- diff --git a/bus/bus.go b/bus/bus.go index c45894a..4d1c027 100644 --- a/bus/bus.go +++ b/bus/bus.go @@ -165,6 +165,7 @@ type Subscription struct { ch chan *Message bus *Bus conn *Connection + set *SubscriptionSet } func (s *Subscription) Topic() Topic { return s.topic } @@ -176,6 +177,137 @@ func (s *Subscription) Reply(to *Message, payload any, retained bool) { s.conn.Reply(to, payload, retained) } +// ----------------------------------------------------------------------------- +// SubscriptionSet +// ----------------------------------------------------------------------------- + +// SubscriptionSet lets a reactor wait on one readiness channel for several +// subscriptions without spawning a goroutine per subscription. Readiness is +// coalesced: once Ready() is signalled, callers should drain the subscriptions +// they own until no immediate work remains. 
+type SubscriptionSet struct { + conn *Connection + ready chan struct{} + mu sync.Mutex + subs []*Subscription + closed bool +} + +func (c *Connection) NewSubscriptionSet() *SubscriptionSet { + return &SubscriptionSet{ + conn: c, + ready: make(chan struct{}, 1), + } +} + +func (ss *SubscriptionSet) Ready() <-chan struct{} { + if ss == nil { + return nil + } + return ss.ready +} + +func (ss *SubscriptionSet) Subscribe(tp Topic) *Subscription { + if ss == nil || ss.conn == nil { + return nil + } + ss.mu.Lock() + if ss.closed { + ss.mu.Unlock() + return nil + } + ss.mu.Unlock() + ct := toConcrete(tp) + sub := &Subscription{topic: ct, ch: make(chan *Message, ss.conn.bus.qLen), bus: ss.conn.bus, conn: ss.conn, set: ss} + ss.conn.bus.addSubscription(ct, sub) + ss.conn.mu.Lock() + ss.conn.subs = append(ss.conn.subs, sub) + ss.conn.mu.Unlock() + ss.mu.Lock() + if !ss.closed { + ss.subs = append(ss.subs, sub) + } else { + sub.set = nil + } + ss.mu.Unlock() + if sub.set == nil { + ss.conn.Unsubscribe(sub) + return nil + } + return sub +} + +func (ss *SubscriptionSet) Request(msg *Message) *Subscription { + if ss == nil || ss.conn == nil { + return nil + } + if topicLen(msg.ReplyTo) == 0 { + msg.ReplyTo = TNoIntern("_rr", ss.conn.rrCtr.Add(1)) + } + sub := ss.Subscribe(msg.ReplyTo) + ss.conn.Publish(msg) + return sub +} + +func (ss *SubscriptionSet) Unsubscribe(sub *Subscription) { + if ss == nil || sub == nil || ss.conn == nil { + return + } + ss.conn.Unsubscribe(sub) +} + +func (ss *SubscriptionSet) Close() { + if ss == nil || ss.conn == nil { + return + } + ss.mu.Lock() + if ss.closed { + ss.mu.Unlock() + return + } + subs := append([]*Subscription(nil), ss.subs...) 
+ ss.subs = nil + ss.closed = true + close(ss.ready) + ss.mu.Unlock() + for _, sub := range subs { + ss.conn.Unsubscribe(sub) + } +} + +func (ss *SubscriptionSet) remove(sub *Subscription) { + if ss == nil || sub == nil { + return + } + ss.mu.Lock() + ss.subs = removeSub(ss.subs, sub) + ss.mu.Unlock() +} + +func (ss *SubscriptionSet) signal() { + if ss == nil { + return + } + ss.mu.Lock() + ready := ss.ready + closed := ss.closed + ss.mu.Unlock() + if closed || ready == nil { + return + } + defer func() { _ = recover() }() + select { + case ready <- struct{}{}: + default: + } +} + +func (s *Subscription) signalReady() { + if s != nil && s.set != nil { + s.set.signal() + } +} + // ----------------------------------------------------------------------------- // Trie node (shared for subscribers and retained messages) // ----------------------------------------------------------------------------- @@ -308,10 +440,12 @@ func drainOne(ch chan *Message) { func (b *Bus) tryDeliver(sub *Subscription, msg *Message) { defer func() { _ = recover() }() // channel may be closed; best-effort if trySend(sub.ch, msg) { + sub.signalReady() return } drainOne(sub.ch) _ = trySend(sub.ch, msg) + sub.signalReady() } // ----------------------------------------------------------------------------- @@ -472,6 +606,16 @@ func (b *Bus) NewConnection(id string) *Connection { return &Connection{bus: b, id: id} } +// NewChildConnection creates a separate connection on the same bus. +// Services should use separate connections so subscriptions, request-reply +// counters, and Disconnect lifetimes remain locally owned. 
+func (c *Connection) NewChildConnection(id string) *Connection { + if c == nil || c.bus == nil { + return nil + } + return c.bus.NewConnection(id) +} + func (c *Connection) NewMessage(tp Topic, payload any, retained bool) *Message { return c.bus.NewMessage(tp, payload, retained) } @@ -489,10 +633,19 @@ func (c *Connection) Subscribe(tp Topic) *Subscription { } func (c *Connection) Unsubscribe(sub *Subscription) { + if sub == nil { + return + } c.bus.unsubscribe(sub.topic, sub) c.mu.Lock() c.subs = removeSub(c.subs, sub) c.mu.Unlock() + if sub.set != nil { + set := sub.set + sub.set = nil + set.remove(sub) + } + defer func() { _ = recover() }() close(sub.ch) } @@ -504,7 +657,15 @@ func (c *Connection) Disconnect() { for _, sub := range subs { c.bus.unsubscribe(sub.topic, sub) - close(sub.ch) + if sub.set != nil { + set := sub.set + sub.set = nil + set.remove(sub) + } + func(ch chan *Message) { + defer func() { _ = recover() }() + close(ch) + }(sub.ch) } } diff --git a/bus/bus_test.go b/bus/bus_test.go index f1de6e4..f8415d1 100644 --- a/bus/bus_test.go +++ b/bus/bus_test.go @@ -347,3 +347,55 @@ func TestTopic_InvalidTokenPanics(t *testing.T) { // []byte is not comparable, so T should panic _ = T([]byte{1, 2, 3}) } + +func TestSubscriptionSetSignalsForAnyMember(t *testing.T) { + b := NewBus(4, "+", "#") + c := b.NewConnection("test") + ss := c.NewSubscriptionSet() + defer ss.Close() + + a := ss.Subscribe(T("a")) + bb := ss.Subscribe(T("b")) + + c.Publish(c.NewMessage(T("b"), "bee", false)) + select { + case <-ss.Ready(): + case <-time.After(100 * time.Millisecond): + t.Fatal("timed out waiting for subscription set readiness") + } + + select { + case m := <-bb.Channel(): + if m.Payload != "bee" { + t.Fatalf("unexpected b payload: %#v", m.Payload) + } + default: + t.Fatal("b subscription was not ready after set signal") + } + + select { + case m := <-a.Channel(): + t.Fatalf("unexpected a message: %#v", m) + default: + } +} + +func 
TestSubscriptionSetCoalescesReadiness(t *testing.T) { + b := NewBus(4, "+", "#") + c := b.NewConnection("test") + ss := c.NewSubscriptionSet() + defer ss.Close() + + sub := ss.Subscribe(T("a")) + c.Publish(c.NewMessage(T("a"), "one", false)) + c.Publish(c.NewMessage(T("a"), "two", false)) + + select { + case <-ss.Ready(): + case <-time.After(100 * time.Millisecond): + t.Fatal("timed out waiting for first readiness") + } + + got := drainPayloads(t, sub, 2) + assertUnorderedEqual(t, got, []string{"one", "two"}) +} diff --git a/cmd/fabric-selftest/README.md b/cmd/fabric-selftest/README.md new file mode 100644 index 0000000..6f89016 --- /dev/null +++ b/cmd/fabric-selftest/README.md @@ -0,0 +1,27 @@ +# Fabric self-test firmware + +This is a narrow board-level protocol test image. It does not start the main +appliance Reactor, HAL polling, Telemetry, or the normal UART sessions. + +It starts only: + +- an in-memory bus; +- the Updater service with the `fabric_uart_hwtest` staging backend; +- one MCU-side Fabric session; +- a tiny in-process CM5 peer cross-wired through shmring UART-shaped transports. + +It then performs `hello`, `prepare-update`, `xfer_begin`, `xfer_chunk*`, +`xfer_commit`, and waits for `xfer_done`. It does not call `commit-update` and it +does not exercise the production A/B flash writer. + +Example Pico 2 run: + +```sh +tinygo flash -stack-size=3KB -monitor -scheduler tasks \ + -target=pico2 -tags "pico_bb_proto_1 fabric_uart_hwtest fabric_uart_selftest" \ + ./cmd/fabric-selftest +``` + +If this image needs more than 3 KB stack, the active Fabric transfer hot path +itself needs further stack reduction before production transfer is enabled at the +3 KB appliance gate. 
diff --git a/cmd/fabric-selftest/main.go b/cmd/fabric-selftest/main.go new file mode 100644 index 0000000..b1253a5 --- /dev/null +++ b/cmd/fabric-selftest/main.go @@ -0,0 +1,97 @@ +package main + +import ( + "context" + "runtime" + "time" + + "devicecode-go/bus" + "devicecode-go/services/fabric" + "devicecode-go/services/updater" +) + +const ( + payloadSize = 1024 + chunkSize = 256 +) + +func main() { + // Give the USB/monitor path a short settle window, matching the appliance + // firmware's behaviour. This image is intentionally not the appliance: it is + // a narrow board-level Fabric protocol gate. + time.Sleep(3 * time.Second) + println("0.000 [fabric-selftest-fw] bootstrapping bus") + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + b := bus.NewBus(3, "+", "#") + mainConn := b.NewConnection("fabric-selftest-main") + updaterConn := b.NewConnection("updater") + fabricConn := b.NewConnection("fabric-selftest") + + readySub := mainConn.Subscribe(updater.TopicUpdaterFact) + defer mainConn.Unsubscribe(readySub) + + updater.GenerateBootID() + updaterSvc := updater.New(updater.Options{ + Conn: updaterConn, + Identity: updater.Identity{ + Version: "fabric-selftest", + Build: "standalone", + ImageID: "fabric-selftest-image", + }, + }) + go updaterSvc.Run(ctx) + println("0.000 [fabric-selftest-fw] updater started") + + if !waitUpdaterReady(ctx, readySub, 2*time.Second) { + println("0.000 [fabric-selftest-fw] updater not ready") + for { + time.Sleep(2 * time.Second) + } + } + + println("0.000 [fabric-selftest-fw] starting fabric transfer self-test") + res, err := fabric.RunUARTSelfTest(ctx, fabric.UARTSelfTestOptions{ + Conn: fabricConn, + StageController: updaterSvc, + PayloadSize: payloadSize, + ChunkSize: chunkSize, + Timeout: 10 * time.Second, + }) + if err != nil { + println("0.000 [fabric-selftest-fw] failed", err.Error()) + } else { + println("0.000 [fabric-selftest-fw] ok xfer=", res.XferID, "bytes=", int(res.PayloadSize), "chunk=", 
int(res.ChunkSize), "digest=", res.Digest) + } + + // Stop active service goroutines after the test. Keep the image alive so the + // monitor remains connected and the heap profile can be observed. + cancel() + for { + printMem() + time.Sleep(3 * time.Second) + } +} + +func waitUpdaterReady(ctx context.Context, sub *bus.Subscription, d time.Duration) bool { + ctx2, cancel := context.WithTimeout(ctx, d) + defer cancel() + for { + select { + case m := <-sub.Channel(): + if m != nil { + return true + } + case <-ctx2.Done(): + return false + } + } +} + +func printMem() { + var m runtime.MemStats + runtime.ReadMemStats(&m) + println("0.000 [fabric-selftest-fw] mem alloc:", int(m.Alloc), "heapSys:", int(m.HeapSys), "mallocs:", int(m.Mallocs), "frees:", int(m.Frees)) +} diff --git a/cmd/mcu-devhost-pty/applier.go b/cmd/mcu-devhost-pty/applier.go new file mode 100644 index 0000000..1b67e62 --- /dev/null +++ b/cmd/mcu-devhost-pty/applier.go @@ -0,0 +1,56 @@ +//go:build !tinygo + +package main + +import ( + "encoding/json" + "errors" + "fmt" + "os" + + "devicecode-go/services/updater" +) + +type devhostApplier struct { + store *stateStore + exitCode int + exit func(int) +} + +func (a devhostApplier) CanApply(d updater.StagedDescriptor) error { + if d.ImageID == "" { + return errors.New("staged_image_id_required") + } + if d.Length == 0 { + return errors.New("staged_length_required") + } + if d.PayloadSHA256 == "" { + return errors.New("staged_payload_sha256_required") + } + return nil +} + +func (a devhostApplier) ArmReboot(d updater.StagedDescriptor) error { + if a.store == nil { + return errors.New("devhost_state_store_required") + } + if err := a.store.MarkRunningFromStaged(d); err != nil { + return err + } + logJSON(map[string]any{"event": "rebooting", "image_id": d.ImageID, "version": d.Version, "length": d.Length}) + if a.exit != nil { + a.exit(a.exitCode) + return nil + } + os.Exit(a.exitCode) + return nil +} + +func logJSON(v any) { + b, err := json.Marshal(v) + if 
err != nil { + fmt.Printf("{\"event\":\"log_error\",\"err\":%q}\n", err.Error()) + return + } + fmt.Println(string(b)) +} diff --git a/cmd/mcu-devhost-pty/dcmcu.go b/cmd/mcu-devhost-pty/dcmcu.go new file mode 100644 index 0000000..13d51b3 --- /dev/null +++ b/cmd/mcu-devhost-pty/dcmcu.go @@ -0,0 +1,132 @@ +//go:build !tinygo + +package main + +import ( + "crypto/sha256" + "encoding/binary" + "encoding/hex" + "encoding/json" + "errors" + "fmt" + "io" + + "devicecode-go/services/updater" +) + +const ( + dcmcuMagic = "DCMCUIMG" + dcmcuFormatVersion = uint16(1) + dcmcuMinHeaderLen = 16 + dcmcuMaxHeaderLen = 4096 + dcmcuMaxManifestLen = 1024 * 1024 +) + +type dcmcuManifest struct { + Schema int `json:"schema"` + Component string `json:"component"` + Build struct { + Version string `json:"version"` + BuildID string `json:"build_id"` + ImageID string `json:"image_id"` + } `json:"build"` + Payload struct { + Length uint32 `json:"length"` + SHA256 string `json:"sha256"` + } `json:"payload"` +} + +type devhostDCMCUVerifier struct{} + +func (devhostDCMCUVerifier) Verify(r io.Reader, sink updater.SlotSink) (updater.Manifest, error) { + if sink == nil { + return updater.Manifest{}, errors.New("devhost_dcmcu: nil sink") + } + manifest, payload, err := readDCMCU(r) + if err != nil { + _ = sink.Abort() + return updater.Manifest{}, err + } + if _, err := sink.Write(payload); err != nil { + _ = sink.Abort() + return updater.Manifest{}, err + } + if err := sink.Commit(); err != nil { + _ = sink.Abort() + return updater.Manifest{}, err + } + return updater.Manifest{ + Version: manifest.Build.Version, + BuildID: manifest.Build.BuildID, + ImageID: manifest.Build.ImageID, + PayloadSHA256: manifest.Payload.SHA256, + PayloadLength: manifest.Payload.Length, + }, nil +} + +func readDCMCU(r io.Reader) (dcmcuManifest, []byte, error) { + var zero dcmcuManifest + header16 := make([]byte, dcmcuMinHeaderLen) + if _, err := io.ReadFull(r, header16); err != nil { + return zero, nil, 
fmt.Errorf("dcmcu_header: %w", err) + } + if string(header16[:len(dcmcuMagic)]) != dcmcuMagic { + return zero, nil, errors.New("dcmcu_bad_magic") + } + version := binary.LittleEndian.Uint16(header16[8:10]) + headerLen := binary.LittleEndian.Uint16(header16[10:12]) + manifestLen := binary.LittleEndian.Uint32(header16[12:16]) + if version != dcmcuFormatVersion { + return zero, nil, errors.New("dcmcu_version_unsupported") + } + if headerLen < dcmcuMinHeaderLen || headerLen > dcmcuMaxHeaderLen { + return zero, nil, errors.New("dcmcu_header_len_invalid") + } + if manifestLen == 0 || manifestLen > dcmcuMaxManifestLen { + return zero, nil, errors.New("dcmcu_manifest_len_invalid") + } + if extra := int(headerLen) - len(header16); extra > 0 { + if _, err := io.CopyN(io.Discard, r, int64(extra)); err != nil { + return zero, nil, fmt.Errorf("dcmcu_header_rest: %w", err) + } + } + manifestRaw := make([]byte, manifestLen) + if _, err := io.ReadFull(r, manifestRaw); err != nil { + return zero, nil, fmt.Errorf("dcmcu_manifest: %w", err) + } + var manifest dcmcuManifest + if err := json.Unmarshal(manifestRaw, &manifest); err != nil { + return zero, nil, fmt.Errorf("dcmcu_manifest_json_invalid: %w", err) + } + if err := validateDCMCUManifest(manifest); err != nil { + return zero, nil, err + } + payload := make([]byte, manifest.Payload.Length) + if _, err := io.ReadFull(r, payload); err != nil { + return zero, nil, fmt.Errorf("dcmcu_payload: %w", err) + } + sum := sha256.Sum256(payload) + if got := hex.EncodeToString(sum[:]); got != manifest.Payload.SHA256 { + return zero, nil, fmt.Errorf("dcmcu_payload_sha256_mismatch: got %s want %s", got, manifest.Payload.SHA256) + } + return manifest, payload, nil +} + +func validateDCMCUManifest(m dcmcuManifest) error { + if m.Schema != 1 { + return errors.New("dcmcu_manifest_schema_unsupported") + } + if m.Component != "mcu" { + return errors.New("dcmcu_component_not_mcu") + } + if m.Build.ImageID == "" { + return 
errors.New("dcmcu_image_id_required") + } + if len(m.Payload.SHA256) != 64 { + return errors.New("dcmcu_payload_sha256_invalid") + } + if _, err := hex.DecodeString(m.Payload.SHA256); err != nil { + return errors.New("dcmcu_payload_sha256_invalid") + } + return nil +} diff --git a/cmd/mcu-devhost-pty/dcmcu_test.go b/cmd/mcu-devhost-pty/dcmcu_test.go new file mode 100644 index 0000000..559f1f4 --- /dev/null +++ b/cmd/mcu-devhost-pty/dcmcu_test.go @@ -0,0 +1,89 @@ +//go:build !tinygo + +package main + +import ( + "bytes" + "crypto/sha256" + "encoding/binary" + "encoding/hex" + "encoding/json" + "strings" + "testing" +) + +func makeDCMCUForTest(t *testing.T, imageID string, payload []byte) []byte { + t.Helper() + sum := sha256.Sum256(payload) + manifest, err := json.Marshal(map[string]any{ + "schema": 1, + "component": "mcu", + "build": map[string]any{ + "version": "15.0", + "build_id": "test-build", + "image_id": imageID, + }, + "payload": map[string]any{ + "length": len(payload), + "sha256": hex.EncodeToString(sum[:]), + }, + }) + if err != nil { + t.Fatal(err) + } + headerLen := uint16(32) + header := make([]byte, headerLen) + copy(header, []byte(dcmcuMagic)) + binary.LittleEndian.PutUint16(header[8:10], dcmcuFormatVersion) + binary.LittleEndian.PutUint16(header[10:12], headerLen) + binary.LittleEndian.PutUint32(header[12:16], uint32(len(manifest))) + out := append(header, manifest...) + out = append(out, payload...) 
+ return out +} + +func TestReadDCMCUExtractsManifestAndVerifiesPayload(t *testing.T) { + payload := []byte(strings.Repeat("payload-", 64)) + m, got, err := readDCMCU(bytes.NewReader(makeDCMCUForTest(t, "mcu-dev-15.0", payload))) + if err != nil { + t.Fatalf("readDCMCU: %v", err) + } + if m.Build.ImageID != "mcu-dev-15.0" || m.Build.Version != "15.0" || m.Build.BuildID != "test-build" { + t.Fatalf("manifest build = %+v", m.Build) + } + if !bytes.Equal(got, payload) { + t.Fatalf("payload mismatch") + } +} + +func TestReadDCMCURejectsPayloadHashMismatch(t *testing.T) { + blob := makeDCMCUForTest(t, "mcu-dev-15.0", []byte("payload")) + blob[len(blob)-1] ^= 0xff + _, _, err := readDCMCU(bytes.NewReader(blob)) + if err == nil || !strings.Contains(err.Error(), "dcmcu_payload_sha256_mismatch") { + t.Fatalf("err = %v, want payload hash mismatch", err) + } +} + +type recordingSink struct { + bytes.Buffer + committed, aborted bool +} + +func (s *recordingSink) Commit() error { s.committed = true; return nil } +func (s *recordingSink) Abort() error { s.aborted = true; return nil } + +func TestDevhostVerifierStreamsPayloadIntoSink(t *testing.T) { + payload := []byte("verified-payload") + sink := &recordingSink{} + manifest, err := (devhostDCMCUVerifier{}).Verify(bytes.NewReader(makeDCMCUForTest(t, "mcu-dev-16.0", payload)), sink) + if err != nil { + t.Fatalf("Verify: %v", err) + } + if manifest.ImageID != "mcu-dev-16.0" || manifest.PayloadLength != uint32(len(payload)) { + t.Fatalf("manifest = %+v", manifest) + } + if !sink.committed || sink.aborted || sink.String() != string(payload) { + t.Fatalf("sink committed=%v aborted=%v payload=%q", sink.committed, sink.aborted, sink.String()) + } +} diff --git a/cmd/mcu-devhost-pty/main.go b/cmd/mcu-devhost-pty/main.go new file mode 100644 index 0000000..be9328d --- /dev/null +++ b/cmd/mcu-devhost-pty/main.go @@ -0,0 +1,87 @@ +//go:build !tinygo + +package main + +import ( + "context" + "errors" + "flag" + "fmt" + "os" + "os/signal" + 
"runtime" + "syscall" + + "devicecode-go/bus" + "devicecode-go/services/fabric" + "devicecode-go/services/updater" +) + +func main() { + if err := run(); err != nil { + fmt.Fprintf(os.Stderr, "mcu-devhost-pty: %v\n", err) + os.Exit(2) + } +} + +func run() error { + var cfg struct { + uart string + stateDir string + node string + peer string + initialImageID string + initialVersion string + initialBuildID string + rebootExitCode int + } + flag.StringVar(&cfg.uart, "uart", "", "POSIX tty path connected to the CM5-side PTY") + flag.StringVar(&cfg.stateDir, "state-dir", "", "directory for devhost MCU state.json") + flag.StringVar(&cfg.node, "node", "mcu", "local Fabric node id") + flag.StringVar(&cfg.peer, "peer", "bigbox-cm5", "expected peer Fabric node id") + flag.StringVar(&cfg.initialImageID, "initial-image-id", "mcu-dev-10.0", "initial running image id when state-dir is empty") + flag.StringVar(&cfg.initialVersion, "initial-version", "10.0", "initial running version when state-dir is empty") + flag.StringVar(&cfg.initialBuildID, "initial-build-id", "devhost-initial", "initial running build id when state-dir is empty") + flag.IntVar(&cfg.rebootExitCode, "reboot-exit-code", 42, "exit code used to model MCU reboot after commit") + flag.Parse() + + if cfg.uart == "" { + return errors.New("--uart is required") + } + store, err := openStateStore(cfg.stateDir, imageState{ImageID: cfg.initialImageID, Version: cfg.initialVersion, BuildID: cfg.initialBuildID}) + if err != nil { + return err + } + tr, err := openLineTransport(cfg.uart) + if err != nil { + return err + } + defer tr.Close() + + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer stop() + + _ = updater.GenerateBootID() + identity := store.identity() + logJSON(map[string]any{ + "event": "ready", "node": cfg.node, "peer": cfg.peer, + "image_id": identity.ImageID, "version": identity.Version, "build_id": identity.Build, + "gomaxprocs": runtime.GOMAXPROCS(0), + }) + + b := 
bus.NewBus(64, "+", "#") + updaterConn := b.NewConnection("updater") + fabricConn := b.NewConnection("fabric") + + svc := updater.New(updater.Options{ + Conn: updaterConn, + Verifier: devhostDCMCUVerifier{}, + Applier: devhostApplier{store: store, exitCode: cfg.rebootExitCode}, + Metadata: store, + MetadataWrite: store, + Identity: identity, + }) + go svc.Run(ctx) + fabric.RunWithOptions(ctx, tr, fabricConn, cfg.node, cfg.peer, fabric.DefaultLinkConfig(), fabric.RunOptions{StageController: svc}) + return nil +} diff --git a/cmd/mcu-devhost-pty/state.go b/cmd/mcu-devhost-pty/state.go new file mode 100644 index 0000000..379c29d --- /dev/null +++ b/cmd/mcu-devhost-pty/state.go @@ -0,0 +1,124 @@ +//go:build !tinygo + +package main + +import ( + "encoding/json" + "errors" + "os" + "path/filepath" + "sync" + + "devicecode-go/services/updater" +) + +type imageState struct { + ImageID string `json:"image_id"` + Version string `json:"version"` + BuildID string `json:"build_id"` + PayloadSHA256 string `json:"payload_sha256,omitempty"` +} + +type devhostStateFile struct { + BootSeq int `json:"boot_seq"` + Running imageState `json:"running"` + Staged *updater.StagedDescriptor `json:"staged,omitempty"` +} + +type stateStore struct { + mu sync.Mutex + dir string + path string + data devhostStateFile +} + +func openStateStore(dir string, initial imageState) (*stateStore, error) { + if dir == "" { + return nil, errors.New("state-dir required") + } + if initial.ImageID == "" { + return nil, errors.New("initial image id required") + } + if initial.Version == "" { + initial.Version = "0.0.0-devhost" + } + if initial.BuildID == "" { + initial.BuildID = "devhost-initial" + } + if err := os.MkdirAll(dir, 0o755); err != nil { + return nil, err + } + s := &stateStore{dir: dir, path: filepath.Join(dir, "state.json")} + b, err := os.ReadFile(s.path) + if err == nil { + if err := json.Unmarshal(b, &s.data); err != nil { + return nil, err + } + if s.data.Running.ImageID == "" { + return nil, 
errors.New("state running image_id missing") + } + return s, nil + } + if !errors.Is(err, os.ErrNotExist) { + return nil, err + } + s.data = devhostStateFile{BootSeq: 1, Running: initial} + return s, s.saveLocked() +} + +func (s *stateStore) identity() updater.Identity { + s.mu.Lock() + defer s.mu.Unlock() + return updater.Identity{Version: s.data.Running.Version, Build: s.data.Running.BuildID, ImageID: s.data.Running.ImageID} +} + +func (s *stateStore) PayloadSHA256() string { + s.mu.Lock() + defer s.mu.Unlock() + return s.data.Running.PayloadSHA256 +} + +func (s *stateStore) StagedDescriptor() (updater.StagedDescriptor, bool) { + s.mu.Lock() + defer s.mu.Unlock() + if s.data.Staged == nil { + return updater.StagedDescriptor{}, false + } + return *s.data.Staged, true +} + +func (s *stateStore) WriteStagedDescriptor(d updater.StagedDescriptor) error { + s.mu.Lock() + defer s.mu.Unlock() + s.data.Staged = &d + return s.saveLocked() +} + +func (s *stateStore) ClearStagedDescriptor() error { + s.mu.Lock() + defer s.mu.Unlock() + s.data.Staged = nil + return s.saveLocked() +} + +func (s *stateStore) MarkRunningFromStaged(d updater.StagedDescriptor) error { + s.mu.Lock() + defer s.mu.Unlock() + s.data.BootSeq++ + s.data.Running = imageState{ImageID: d.ImageID, Version: d.Version, BuildID: d.BuildID, PayloadSHA256: d.PayloadSHA256} + s.data.Staged = nil + return s.saveLocked() +} + +func (s *stateStore) saveLocked() error { + b, err := json.MarshalIndent(s.data, "", " ") + if err != nil { + return err + } + b = append(b, '\n') + tmp := s.path + ".tmp" + if err := os.WriteFile(tmp, b, 0o644); err != nil { + return err + } + return os.Rename(tmp, s.path) +} diff --git a/cmd/mcu-devhost-pty/state_test.go b/cmd/mcu-devhost-pty/state_test.go new file mode 100644 index 0000000..0ee952d --- /dev/null +++ b/cmd/mcu-devhost-pty/state_test.go @@ -0,0 +1,63 @@ +//go:build !tinygo + +package main + +import ( + "testing" + + "devicecode-go/services/updater" +) + +func 
TestStateStorePersistsRunningAndStagedDescriptor(t *testing.T) { + dir := t.TempDir() + store, err := openStateStore(dir, imageState{ImageID: "mcu-dev-10.0", Version: "10.0", BuildID: "initial"}) + if err != nil { + t.Fatalf("openStateStore: %v", err) + } + if id := store.identity(); id.ImageID != "mcu-dev-10.0" || id.Version != "10.0" { + t.Fatalf("identity = %+v", id) + } + desc := updater.StagedDescriptor{ImageID: "mcu-dev-15.0", Version: "15.0", BuildID: "build-15", Length: 1234, PayloadSHA256: "abc"} + if err := store.WriteStagedDescriptor(desc); err != nil { + t.Fatalf("WriteStagedDescriptor: %v", err) + } + got, ok := store.StagedDescriptor() + if !ok || got.ImageID != desc.ImageID { + t.Fatalf("staged = %+v ok=%v", got, ok) + } + + reopened, err := openStateStore(dir, imageState{ImageID: "ignored", Version: "ignored"}) + if err != nil { + t.Fatalf("reopen: %v", err) + } + got, ok = reopened.StagedDescriptor() + if !ok || got.ImageID != desc.ImageID { + t.Fatalf("reopened staged = %+v ok=%v", got, ok) + } +} + +func TestDevhostApplierMarksRunningAndExits(t *testing.T) { + store, err := openStateStore(t.TempDir(), imageState{ImageID: "mcu-dev-10.0", Version: "10.0"}) + if err != nil { + t.Fatal(err) + } + desc := updater.StagedDescriptor{ImageID: "mcu-dev-15.0", Version: "15.0", BuildID: "build-15", Length: 1234, PayloadSHA256: "sha"} + var exitCode int + applier := devhostApplier{store: store, exitCode: 42, exit: func(code int) { exitCode = code }} + if err := applier.CanApply(desc); err != nil { + t.Fatalf("CanApply: %v", err) + } + if err := applier.ArmReboot(desc); err != nil { + t.Fatalf("ArmReboot: %v", err) + } + if exitCode != 42 { + t.Fatalf("exitCode=%d", exitCode) + } + id := store.identity() + if id.ImageID != "mcu-dev-15.0" || id.Version != "15.0" || id.Build != "build-15" { + t.Fatalf("running identity = %+v", id) + } + if _, ok := store.StagedDescriptor(); ok { + t.Fatalf("staged descriptor was not cleared") + } +} diff --git 
a/cmd/mcu-devhost-pty/transport.go b/cmd/mcu-devhost-pty/transport.go new file mode 100644 index 0000000..bab96b0 --- /dev/null +++ b/cmd/mcu-devhost-pty/transport.go @@ -0,0 +1,81 @@ +//go:build !tinygo + +package main + +import ( + "bufio" + "errors" + "fmt" + "io" + "os" + "sync" +) + +const devhostMaxLineLen = 64 * 1024 + +// lineTransport adapts a host POSIX tty, pipe, or socket to Fabric's JSONL +// transport interface. It deliberately does not inject noise; the Lua test rig +// owns stream pressure and fragmentation so there is one place to reason about +// line conditions. +type lineTransport struct { + rwc io.ReadWriteCloser + r *bufio.Reader + mu sync.Mutex +} + +func openLineTransport(path string) (*lineTransport, error) { + if path == "" { + return nil, errors.New("uart path required") + } + f, err := os.OpenFile(path, os.O_RDWR, 0) + if err != nil { + return nil, err + } + return newLineTransport(f), nil +} + +func newLineTransport(rwc io.ReadWriteCloser) *lineTransport { + return &lineTransport{rwc: rwc, r: bufio.NewReaderSize(rwc, devhostMaxLineLen)} +} + +func (t *lineTransport) ReadLine() ([]byte, error) { + if t == nil || t.r == nil { + return nil, errors.New("transport closed") + } + line, err := t.r.ReadBytes('\n') + if err != nil { + return nil, err + } + if len(line) > 0 && line[len(line)-1] == '\n' { + line = line[:len(line)-1] + } + if len(line) > 0 && line[len(line)-1] == '\r' { + line = line[:len(line)-1] + } + if len(line) > devhostMaxLineLen { + return nil, fmt.Errorf("line too long: %d", len(line)) + } + out := make([]byte, len(line)) + copy(out, line) + return out, nil +} + +func (t *lineTransport) WriteLine(data []byte) error { + if len(data) > devhostMaxLineLen { + return fmt.Errorf("line too long: %d", len(data)) + } + t.mu.Lock() + defer t.mu.Unlock() + if _, err := t.rwc.Write(data); err != nil { + return err + } + _, err := t.rwc.Write([]byte{'\n'}) + return err +} + +func (t *lineTransport) Close() error { + if t == nil || t.rwc 
== nil { + return nil + } + return t.rwc.Close() +} diff --git a/cmd/mcu-devhost-pty/transport_test.go b/cmd/mcu-devhost-pty/transport_test.go new file mode 100644 index 0000000..ede85dc --- /dev/null +++ b/cmd/mcu-devhost-pty/transport_test.go @@ -0,0 +1,46 @@ +//go:build !tinygo + +package main + +import ( + "net" + "testing" + "time" +) + +func TestLineTransportReadsAndWritesJSONLines(t *testing.T) { + a, b := net.Pipe() + defer a.Close() + defer b.Close() + left := newLineTransport(a) + defer left.Close() + + go func() { + _, _ = b.Write([]byte("{\"type\":\"hello\"}\n")) + }() + line, err := left.ReadLine() + if err != nil { + t.Fatalf("ReadLine: %v", err) + } + if string(line) != `{"type":"hello"}` { + t.Fatalf("line = %q", string(line)) + } + + read := make(chan string, 1) + go func() { + buf := make([]byte, 64) + n, _ := b.Read(buf) + read <- string(buf[:n]) + }() + if err := left.WriteLine([]byte(`{"type":"pong"}`)); err != nil { + t.Fatalf("WriteLine: %v", err) + } + select { + case got := <-read: + if got != `{"type":"pong"}`+"\n" { + t.Fatalf("write = %q", got) + } + case <-time.After(time.Second): + t.Fatal("timeout waiting for write") + } +} diff --git a/cmd/pico-cm5-emulator/README.md b/cmd/pico-cm5-emulator/README.md new file mode 100644 index 0000000..57f55e4 --- /dev/null +++ b/cmd/pico-cm5-emulator/README.md @@ -0,0 +1,109 @@ +# Pico CM5 emulator + +This command builds a very small Pico 1 firmware that behaves like a CM5-side +Fabric peer for hardware bring-up. It is intended for the two-Pico setup: + +```text +Pico 1 UART0 TX GP0 -> Pico 2 UART1 RX GP5 +Pico 1 UART0 RX GP1 <- Pico 2 UART1 TX GP4 +Pico 1 GND <-> Pico 2 GND +``` + +Do not connect 3V3 or VSYS between the boards unless you are deliberately +powering one board from the other. 
+ +## Pico 2 under test + +For the first physical UART protocol test, flash the Pico 2 with the hwtest +staging backend: + +```sh +tinygo flash -stack-size=3KB -monitor -scheduler tasks \ + -target=pico2 -tags "pico_bb_proto_1 fabric_uart_hwtest" main.go +``` + +This keeps production flash/apply disabled and stages into the safe digest/count +backend. + +## Pico 1 emulator + +Flash the Pico 1 with: + +```sh +tinygo flash -stack-size=3KB -monitor -scheduler tasks \ + -target=pico -tags "pico_cm5_emulator" ./cmd/pico-cm5-emulator +``` + +The emulator opens UART0 at 115200 baud, sends a Fabric hello, calls +`cap/self/updater/main/rpc/prepare-update`, transfers a deterministic 1024-byte +blob to `updater/main`, commits the transfer, and waits for `xfer_done`. + +When monitored over USB it prints each major phase. When running headless, the +onboard LED gives status: + +```text +fast blink failure +mostly-on blink pass / alive +``` + +This test does not send `commit-update` and cannot reboot the Pico 2. + +### Timing note + +The emulator uses a 180 second end-to-end script timeout. On the full Pico 2 +appliance image the first `prepare-update` can be delayed by HAL and retained +state publication, so a shorter timeout can expire just as the transfer phase +begins. The emulator logs `prepare-update sent`, `xfer_begin sent`, +`xfer_ready received`, and `xfer_need next=0 received` to make it clear which +phase is blocking. + +### JSONL reader note + +The emulator reader treats UART as a byte stream, not as one read per line. It +only releases bytes from the RX ring up to the newline that completed the line. +If the same UART read span also contains the start of the next JSONL frame, +those bytes remain in the ring and are consumed by the next `readLine` call. +This is important because the reactive UART path can legitimately deliver +`...\n{` in one readable span when the peer is sending frames back-to-back. 
+ +### Chunk-size tags + +By default the emulator uses 2048-byte chunks, matching the current advertised +`max_chunk_size` used by the MCU updater prepare response. This is the normal +large-transfer test shape: + +```sh +tinygo flash -stack-size=8KB -monitor -scheduler tasks \ + -target=pico -tags "pico_cm5_emulator pico_cm5_payload_200k" \ + ./cmd/pico-cm5-emulator +``` + +Use 1024-byte chunks for an intermediate setting: + +```sh +tinygo flash -stack-size=8KB -monitor -scheduler tasks \ + -target=pico -tags "pico_cm5_emulator pico_cm5_payload_200k pico_cm5_chunk_1024" \ + ./cmd/pico-cm5-emulator +``` + +Use 256-byte chunks as a stop-and-wait stress test: + +```sh +tinygo flash -stack-size=8KB -monitor -scheduler tasks \ + -target=pico -tags "pico_cm5_emulator pico_cm5_payload_200k pico_cm5_chunk_256" \ + ./cmd/pico-cm5-emulator +``` + +### MCU transfer probe + +For a targeted MCU-side transfer trace without the full `fabric_trace` frame dump, +flash the Pico 2 with `fabric_xfer_probe`: + +```sh +tinygo flash -stack-size=8KB -monitor -scheduler tasks \ + -target=pico2 -tags "pico_bb_proto_1 fabric_uart_hwtest fabric_xfer_probe" main.go +``` + +The probe logs chunk receive offsets, write start/done, receiver retries, +digest/decode errors, stale/future chunks and commit start/done. It is intended +to explain receiver-driven retries during large transfers. 
diff --git a/cmd/pico-cm5-emulator/chunk_1024.go b/cmd/pico-cm5-emulator/chunk_1024.go new file mode 100644 index 0000000..95be6bf --- /dev/null +++ b/cmd/pico-cm5-emulator/chunk_1024.go @@ -0,0 +1,8 @@ +//go:build pico_cm5_chunk_1024 && !pico_cm5_chunk_256 + +package main + +const ( + chunkSize = 1024 + chunkBase64Max = 1536 +) diff --git a/cmd/pico-cm5-emulator/chunk_2048.go b/cmd/pico-cm5-emulator/chunk_2048.go new file mode 100644 index 0000000..7ad655c --- /dev/null +++ b/cmd/pico-cm5-emulator/chunk_2048.go @@ -0,0 +1,6 @@ +//go:build ignore + +package main + +// The emulator's default chunk size is now 2048. This file is intentionally +// ignored and kept only to make older references to pico_cm5_chunk_2048 obvious. diff --git a/cmd/pico-cm5-emulator/chunk_256.go b/cmd/pico-cm5-emulator/chunk_256.go new file mode 100644 index 0000000..e6160fc --- /dev/null +++ b/cmd/pico-cm5-emulator/chunk_256.go @@ -0,0 +1,8 @@ +//go:build pico_cm5_chunk_256 + +package main + +const ( + chunkSize = 256 + chunkBase64Max = 512 +) diff --git a/cmd/pico-cm5-emulator/chunk_default.go b/cmd/pico-cm5-emulator/chunk_default.go new file mode 100644 index 0000000..ad494fb --- /dev/null +++ b/cmd/pico-cm5-emulator/chunk_default.go @@ -0,0 +1,8 @@ +//go:build !pico_cm5_chunk_256 && !pico_cm5_chunk_1024 + +package main + +const ( + chunkSize = 2048 + chunkBase64Max = 3072 +) diff --git a/cmd/pico-cm5-emulator/led_rp.go b/cmd/pico-cm5-emulator/led_rp.go new file mode 100644 index 0000000..7b39133 --- /dev/null +++ b/cmd/pico-cm5-emulator/led_rp.go @@ -0,0 +1,36 @@ +//go:build tinygo && (rp2040 || rp2350) + +package main + +import ( + "machine" + "time" +) + +var statusLED = machine.LED + +func ledInit() { + statusLED.Configure(machine.PinConfig{Mode: machine.PinOutput}) + statusLED.Low() +} + +func ledOn() { statusLED.High() } +func ledOff() { statusLED.Low() } + +func ledPassLoop() { + for { + statusLED.High() + time.Sleep(1800 * time.Millisecond) + statusLED.Low() + time.Sleep(200 * 
time.Millisecond) + } +} + +func ledFailLoop() { + for { + statusLED.High() + time.Sleep(120 * time.Millisecond) + statusLED.Low() + time.Sleep(120 * time.Millisecond) + } +} diff --git a/cmd/pico-cm5-emulator/led_stub.go b/cmd/pico-cm5-emulator/led_stub.go new file mode 100644 index 0000000..9dcce88 --- /dev/null +++ b/cmd/pico-cm5-emulator/led_stub.go @@ -0,0 +1,9 @@ +//go:build !tinygo || !(rp2040 || rp2350) + +package main + +func ledInit() {} +func ledOn() {} +func ledOff() {} +func ledPassLoop() { select {} } +func ledFailLoop() { select {} } diff --git a/cmd/pico-cm5-emulator/main.go b/cmd/pico-cm5-emulator/main.go new file mode 100644 index 0000000..19f1f98 --- /dev/null +++ b/cmd/pico-cm5-emulator/main.go @@ -0,0 +1,701 @@ +package main + +import ( + "context" + "encoding/base64" + "errors" + "runtime" + "strconv" + "time" + + "devicecode-go/bus" + "devicecode-go/services/hal" + "devicecode-go/types" + "devicecode-go/x/shmring" + "devicecode-go/x/xxhash" +) + +const ( + linkUART = "uart0" + testTimeout = 180 * time.Second + chunkAckTimeout = 750 * time.Millisecond + maxChunkResends = 32 + cm5SIDPrefix = "pico-cm5-emulator" +) + +var chunkScratch [chunkSize]byte +var lineScratch [4096]byte +var b64Scratch [chunkBase64Max]byte + +type peer struct { + rx *shmring.Ring + tx *shmring.Ring + n int + sid string + prepareID string + jobID string + xferID string +} + +func main() { + ledInit() + ledOff() + time.Sleep(3 * time.Second) + println("0.000 [pico-cm5] bootstrapping bus + HAL") + + ctx := context.Background() + b := bus.NewBus(4, "+", "#") + halConn := b.NewConnection("hal") + ctlConn := b.NewConnection("pico-cm5") + go hal.Run(ctx, halConn) + + time.Sleep(250 * time.Millisecond) + opened, err := openSerial(ctx, ctlConn, linkUART, 512, 512) + if err != nil { + fail("serial open failed", err) + } + rx := shmring.Get(shmring.Handle(opened.RXHandle)) + tx := shmring.Get(shmring.Handle(opened.TXHandle)) + if rx == nil || tx == nil { + fail("serial ring 
resolution failed", errors.New("nil_ring")) + } + println("0.000 [pico-cm5] uart0 opened; starting Fabric CM5-emulator script") + + ledOn() + runID := makeRunID() + p := &peer{ + rx: rx, + tx: tx, + sid: cm5SIDPrefix + "-sid-" + runID, + prepareID: "pico-cm5-prepare-" + runID, + jobID: "pico-cm5-job-" + runID, + xferID: "pico-cm5-xfer-" + runID, + } + println("0.000 [pico-cm5] session sid=", p.sid, "xfer=", p.xferID) + if err := p.run(ctx); err != nil { + fail("fabric script failed", err) + } + println("0.000 [pico-cm5] PASS: Fabric prepare + transfer completed") + for { + ledOn() + printMem() + time.Sleep(1800 * time.Millisecond) + ledOff() + time.Sleep(200 * time.Millisecond) + } +} + +func fail(msg string, err error) { + println("0.000 [pico-cm5] FAIL:", msg, err.Error()) + for { + ledOn() + time.Sleep(120 * time.Millisecond) + ledOff() + time.Sleep(120 * time.Millisecond) + } +} + +func (p *peer) run(parent context.Context) error { + ctx, cancel := context.WithTimeout(parent, testTimeout) + defer cancel() + + digest := payloadDigest(payloadSize) + println("0.000 [pico-cm5] payload bytes=", payloadSize, "chunk=", chunkSize, "digest=", digest) + + if err := p.writeHello(ctx); err != nil { + return err + } + println("0.000 [pico-cm5] hello sent") + if _, err := p.waitType(ctx, "hello_ack", ""); err != nil { + return err + } + println("0.000 [pico-cm5] hello_ack received") + + if err := p.writePrepare(ctx, p.prepareID); err != nil { + return err + } + println("0.000 [pico-cm5] prepare-update sent") + if err := p.waitReplyOK(ctx, p.prepareID); err != nil { + return err + } + println("0.000 [pico-cm5] prepare-update ok") + + cm5TraceEvent("phase_xfer_begin_write") + if err := p.writeXferBegin(ctx, p.xferID, uint32(payloadSize), digest); err != nil { + return err + } + println("0.000 [pico-cm5] xfer_begin sent") + cm5TraceEvent("phase_wait_xfer_ready") + if _, err := p.waitType(ctx, "xfer_ready", p.xferID); err != nil { + return err + } + println("0.000 [pico-cm5] 
xfer_ready received") + cm5TraceEvent("phase_wait_xfer_need_0") + if err := p.transferPayload(ctx, p.xferID); err != nil { + return err + } + cm5TraceEvent("phase_commit_write") + if err := p.writeXferCommit(ctx, p.xferID, uint32(payloadSize), digest); err != nil { + return err + } + if _, err := p.waitType(ctx, "xfer_done", p.xferID); err != nil { + return err + } + println("0.000 [pico-cm5] xfer_done") + return nil +} + +func (p *peer) transferPayload(ctx context.Context, id string) error { + maxSentEnd := uint32(0) + lastAck := uint32(0) + lastSentOff := uint32(0) + haveOutstanding := false + resendsWithoutProgress := 0 + + for { + need, err := p.waitNeedWithTimeout(ctx, id, chunkAckTimeout) + if err != nil { + if errors.Is(err, context.DeadlineExceeded) && haveOutstanding { + resendsWithoutProgress++ + println("0.000 [pico-cm5] ack timeout resend offset=", int(lastSentOff), "retry=", resendsWithoutProgress) + if resendsWithoutProgress > maxChunkResends { + return errors.New("too_many_ack_timeouts_at_offset:" + strconv.FormatUint(uint64(lastSentOff), 10)) + } + if err := p.sendPayloadChunk(ctx, id, lastSentOff); err != nil { + return err + } + continue + } + return err + } + if need > uint32(payloadSize) { + return errors.New("bad_xfer_need_next_too_large:" + strconv.FormatUint(uint64(need), 10)) + } + if need == uint32(payloadSize) { + println("0.000 [pico-cm5] chunk ack next=", int(need)) + return nil + } + if need > maxSentEnd { + return errors.New("bad_xfer_need_future:" + strconv.FormatUint(uint64(need), 10) + ":max_sent=" + strconv.FormatUint(uint64(maxSentEnd), 10)) + } + + if need > lastAck { + lastAck = need + resendsWithoutProgress = 0 + haveOutstanding = false + if shouldPrintAck(need) { + println("0.000 [pico-cm5] chunk ack next=", int(need)) + } + } else if need < maxSentEnd { + resendsWithoutProgress++ + println("0.000 [pico-cm5] retry need next=", int(need), "max_sent=", int(maxSentEnd), "retry=", resendsWithoutProgress) + if 
resendsWithoutProgress > maxChunkResends { + return errors.New("too_many_retries_at_older_offset:" + strconv.FormatUint(uint64(need), 10)) + } + } + + if err := p.sendPayloadChunk(ctx, id, need); err != nil { + return err + } + lastSentOff = need + haveOutstanding = true + end := need + uint32(chunkSize) + if end > uint32(payloadSize) { + end = uint32(payloadSize) + } + if end > maxSentEnd { + maxSentEnd = end + } + } +} + +func (p *peer) sendPayloadChunk(ctx context.Context, id string, off uint32) error { + end := int(off) + chunkSize + if end > payloadSize { + end = payloadSize + } + chunk := makePayloadChunk(int(off), chunkScratch[:end-int(off)]) + cm5TraceEventKV("phase_chunk_write", "offset", strconv.Itoa(int(off))) + return p.writeXferChunk(ctx, id, off, chunk) +} + +func shouldPrintAck(next uint32) bool { + if payloadSize <= 4096 { + return true + } + return next != 0 && (next%4096 == 0 || next == uint32(payloadSize)) +} + +func makeRunID() string { + // The SID is a session identifier, not a stable node identity. Make it + // change across emulator resets so the MCU can distinguish a fresh CM5 + // session from a duplicate hello in an existing session, and abort any + // old in-flight transfer state accordingly. 
+ n := uint64(time.Now().UnixNano()) + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + n ^= uint64(ms.Mallocs) << 17 + n ^= uint64(ms.Alloc) << 1 + if n == 0 { + n = 1 + } + return strconv.FormatUint(n, 16) +} + +func openSerial(ctx context.Context, conn *bus.Connection, name string, rxSize, txSize int) (types.SerialSessionOpened, error) { + evT := bus.T("hal", "cap", "io", "serial", name, "event", "session_opened") + sub := conn.Subscribe(evT) + defer conn.Unsubscribe(sub) + + ctrlT := bus.T("hal", "cap", "io", "serial", name, "control", "session_open") + reqCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + defer cancel() + if _, err := conn.RequestWait(reqCtx, conn.NewMessage(ctrlT, types.SerialSessionOpen{RXSize: rxSize, TXSize: txSize}, false)); err != nil { + return types.SerialSessionOpened{}, err + } + for { + select { + case m := <-sub.Channel(): + if rep, ok := m.Payload.(types.SerialSessionOpened); ok { + return rep, nil + } + case <-reqCtx.Done(): + return types.SerialSessionOpened{}, reqCtx.Err() + } + } +} + +func (p *peer) writeHello(ctx context.Context) error { + b := make([]byte, 0, 96) + b = append(b, `{"type":"hello","proto":"fabric-jsonl/1","sid":"`...) + b = appendJSONString(b, p.sid) + b = append(b, `","node":"bigbox-cm5"}`...) + return p.writeLine(ctx, b) +} + +func (p *peer) writePrepare(ctx context.Context, id string) error { + b := make([]byte, 0, 192) + b = append(b, `{"type":"call","id":"`...) + b = appendJSONString(b, id) + b = append(b, `","topic":["cap","self","updater","main","rpc","prepare-update"],"payload":{"job_id":"`...) + b = appendJSONString(b, p.jobID) + b = append(b, `","target":"mcu","expected_image_id":"pico-cm5-hwtest-image"}}`...) + return p.writeLine(ctx, b) +} + +func (p *peer) writeXferBegin(ctx context.Context, id string, size uint32, digest string) error { + b := make([]byte, 0, 192) + b = append(b, `{"type":"xfer_begin","xfer_id":"`...) 
+ b = appendJSONString(b, id) + b = append(b, `","target":"updater/main","size":`...) + b = strconv.AppendUint(b, uint64(size), 10) + b = append(b, `,"digest_alg":"xxhash32","digest":"`...) + b = appendJSONString(b, digest) + b = append(b, `","meta":{"source":"pico-cm5-emulator"}}`...) + return p.writeLine(ctx, b) +} + +func (p *peer) writeXferChunk(ctx context.Context, id string, off uint32, chunk []byte) error { + n := base64.RawURLEncoding.EncodedLen(len(chunk)) + if n > len(b64Scratch) { + return errors.New("chunk_too_large") + } + base64.RawURLEncoding.Encode(b64Scratch[:n], chunk) + chunkDigest := hex8(xxhash.Sum32(chunk, 0)) + b := make([]byte, 0, 160+n) + b = append(b, `{"type":"xfer_chunk","xfer_id":"`...) + b = appendJSONString(b, id) + b = append(b, `","offset":`...) + b = strconv.AppendUint(b, uint64(off), 10) + b = append(b, `,"data":"`...) + b = append(b, b64Scratch[:n]...) + b = append(b, `","chunk_digest":"`...) + b = appendJSONString(b, chunkDigest) + b = append(b, `"}`...) + return p.writeLine(ctx, b) +} + +func (p *peer) writeXferCommit(ctx context.Context, id string, size uint32, digest string) error { + b := make([]byte, 0, 144) + b = append(b, `{"type":"xfer_commit","xfer_id":"`...) + b = appendJSONString(b, id) + b = append(b, `","size":`...) + b = strconv.AppendUint(b, uint64(size), 10) + b = append(b, `,"digest_alg":"xxhash32","digest":"`...) + b = appendJSONString(b, digest) + b = append(b, `"}`...) + return p.writeLine(ctx, b) +} + +func (p *peer) writePong(ctx context.Context, sid string) error { + b := make([]byte, 0, 64) + b = append(b, `{"type":"pong","sid":"`...) + b = appendJSONString(b, sid) + b = append(b, `"}`...) 
+ return p.writeLine(ctx, b) +} + +func (p *peer) waitReplyOK(ctx context.Context, id string) error { + for { + line, err := p.readLine(ctx) + if err != nil { + return err + } + t := topString(line, "type") + switch t { + case "reply": + if topString(line, "id") != id { + continue + } + ok, seen := topBool(line, "ok") + if seen && ok { + return nil + } + return errors.New("reply_error:" + topString(line, "err")) + case "ping": + _ = p.writePong(ctx, topString(line, "sid")) + case "pub": + logPub(line) + default: + if t != "" { + println("0.000 [pico-cm5] rx", t) + } + } + } +} + +func (p *peer) waitType(ctx context.Context, wantType, wantXfer string) ([]byte, error) { + for { + line, err := p.readLine(ctx) + if err != nil { + return nil, err + } + t := topString(line, "type") + switch t { + case wantType: + if wantXfer == "" || topString(line, "xfer_id") == wantXfer { + return line, nil + } + case "xfer_abort": + if wantXfer == "" || topString(line, "xfer_id") == wantXfer { + return nil, errors.New("xfer_abort:" + topString(line, "err")) + } + case "reply": + if errText := topString(line, "err"); errText != "" { + println("0.000 [pico-cm5] stray reply err=", errText) + } + case "ping": + _ = p.writePong(ctx, topString(line, "sid")) + case "pub": + logPub(line) + default: + if t != "" { + println("0.000 [pico-cm5] rx", t) + } + } + } +} + +func (p *peer) waitNeed(ctx context.Context, id string) (uint32, error) { + for { + line, err := p.waitType(ctx, "xfer_need", id) + if err != nil { + return 0, err + } + got, ok := topUint(line, "next") + if ok { + return got, nil + } + return 0, errors.New("bad_xfer_need_missing_next:" + cm5TracePreview(line)) + } +} + +func (p *peer) waitNeedWithTimeout(ctx context.Context, id string, d time.Duration) (uint32, error) { + waitCtx, cancel := context.WithTimeout(ctx, d) + defer cancel() + return p.waitNeed(waitCtx, id) +} + +func (p *peer) writeLine(ctx context.Context, b []byte) error { + if len(b) == 0 || b[len(b)-1] != '\n' { + 
b = append(b, '\n') + } + cm5TraceFrame("tx", b) + off := 0 + for off < len(b) { + if n := p.tx.TryWriteFrom(b[off:]); n > 0 { + off += n + continue + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-p.tx.Writable(): + } + } + return nil +} + +func (p *peer) readLine(ctx context.Context) ([]byte, error) { + for { + span, _ := p.rx.ReadAcquire() + if len(span) > 0 { + line, consumed, ok, err := p.consumeLineSpan(span) + if consumed > 0 { + p.rx.ReadRelease(consumed) + } + if err != nil { + return nil, err + } + if ok { + return line, nil + } + continue + } + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-p.rx.Readable(): + } + } +} + +func (p *peer) consumeLineSpan(span []byte) (line []byte, consumed int, ok bool, err error) { + for i, c := range span { + consumed = i + 1 + if c == '\r' { + continue + } + if c == '\n' { + if p.n == 0 { + continue + } + line := lineScratch[:p.n] + cm5TraceFrame("rx", line) + p.n = 0 + return line, consumed, true, nil + } + if p.n >= len(lineScratch) { + p.n = 0 + return nil, consumed, false, errors.New("line_too_long") + } + lineScratch[p.n] = c + p.n++ + } + return nil, consumed, false, nil +} + +func payloadByteAt(i int) byte { + return byte((i*37 + 11) & 0xff) +} + +func makePayloadChunk(off int, dst []byte) []byte { + for i := range dst { + dst[i] = payloadByteAt(off + i) + } + return dst +} + +func payloadDigest(size int) string { + h := xxhash.New(0) + var buf [chunkSize]byte + for off := 0; off < size; off += chunkSize { + end := off + chunkSize + if end > size { + end = size + } + _, _ = h.Write(makePayloadChunk(off, buf[:end-off])) + } + return hex8(h.Sum32()) +} + +func hex8(v uint32) string { + const h = "0123456789abcdef" + var b [8]byte + for i := 7; i >= 0; i-- { + b[i] = h[v&0xf] + v >>= 4 + } + return string(b[:]) +} + +func appendJSONString(b []byte, s string) []byte { + for i := 0; i < len(s); i++ { + c := s[i] + switch c { + case '\\', '"': + b = append(b, '\\', c) + case '\n': + b = 
append(b, '\\', 'n') + case '\r': + b = append(b, '\\', 'r') + case '\t': + b = append(b, '\\', 't') + default: + b = append(b, c) + } + } + return b +} + +func findKey(line []byte, key string) int { + // Locate a top-level-ish JSON field by string pattern. The Fabric frames used + // by this emulator are compact and do not contain the same field names inside + // escaped strings, which is sufficient for this smoke firmware. + patLen := len(key) + 2 + for i := 0; i+patLen < len(line); i++ { + if line[i] != '"' { + continue + } + if string(line[i+1:i+1+len(key)]) != key || line[i+1+len(key)] != '"' { + continue + } + j := i + patLen + for j < len(line) && (line[j] == ' ' || line[j] == '\t') { + j++ + } + if j < len(line) && line[j] == ':' { + j++ + for j < len(line) && (line[j] == ' ' || line[j] == '\t') { + j++ + } + return j + } + } + return -1 +} + +func topString(line []byte, key string) string { + i := findKey(line, key) + if i < 0 || i >= len(line) || line[i] != '"' { + return "" + } + i++ + start := i + for i < len(line) { + if line[i] == '\\' { + i += 2 + continue + } + if line[i] == '"' { + return string(line[start:i]) + } + i++ + } + return "" +} + +func topBool(line []byte, key string) (bool, bool) { + i := findKey(line, key) + if i < 0 || i >= len(line) { + return false, false + } + if i+4 <= len(line) && string(line[i:i+4]) == "true" { + return true, true + } + if i+5 <= len(line) && string(line[i:i+5]) == "false" { + return false, true + } + return false, false +} + +func topUint(line []byte, key string) (uint32, bool) { + i := findKey(line, key) + if i < 0 || i >= len(line) || line[i] < '0' || line[i] > '9' { + return 0, false + } + var v uint32 + for i < len(line) && line[i] >= '0' && line[i] <= '9' { + v = v*10 + uint32(line[i]-'0') + i++ + } + return v, true +} + +func logPub(line []byte) { + // Keep pub logging light. Detailed frame logs are intentionally omitted so the + // emulator remains closer to the 3 KB stack target. 
+ if topic := topString(line, "topic"); topic != "" { + println("0.000 [pico-cm5] pub", topic) + } +} + +func cm5TraceFrame(dir string, b []byte) { + if !picoCM5TraceEnabled { + return + } + line := b + if len(line) > 0 && line[len(line)-1] == '\n' { + line = line[:len(line)-1] + } + println( + "0.000 [pico-cm5-trace]", dir, + "type", topString(line, "type"), + "xfer", topString(line, "xfer_id"), + "id", topString(line, "id"), + "next", traceUint(line, "next"), + "len", len(line), + "line", cm5TracePreview(line), + ) +} + +func cm5TraceEvent(event string) { + if !picoCM5TraceEnabled { + return + } + println("0.000 [pico-cm5-trace]", event) +} + +func cm5TraceEventKV(event, key, value string) { + if !picoCM5TraceEnabled { + return + } + println("0.000 [pico-cm5-trace]", event, key, value) +} + +func cm5TracePreview(data []byte) string { + const max = 220 + if len(data) > max { + data = data[:max] + } + out := make([]byte, 0, len(data)*2+3) + for _, c := range data { + switch c { + case '\n': + out = append(out, '\\', 'n') + case '\r': + out = append(out, '\\', 'r') + case '\t': + out = append(out, '\\', 't') + default: + if c < 0x20 || c > 0x7e { + out = append(out, '\\', 'x', hexNibble(c>>4), hexNibble(c)) + } else { + out = append(out, c) + } + } + } + if len(data) == max { + out = append(out, '.', '.', '.') + } + return string(out) +} + +func traceUint(line []byte, key string) uint32 { + v, _ := topUint(line, key) + return v +} + +func hexNibble(v byte) byte { + v &= 0x0f + if v < 10 { + return '0' + v + } + return 'a' + (v - 10) +} + +func printMem() { + var m runtime.MemStats + runtime.ReadMemStats(&m) + println("0.000 [pico-cm5] mem alloc:", int(m.Alloc), "heapSys:", int(m.HeapSys), "mallocs:", int(m.Mallocs), "frees:", int(m.Frees)) +} diff --git a/cmd/pico-cm5-emulator/main_test.go b/cmd/pico-cm5-emulator/main_test.go new file mode 100644 index 0000000..bf4ba97 --- /dev/null +++ b/cmd/pico-cm5-emulator/main_test.go @@ -0,0 +1,101 @@ +package main + +import ( 
+ "context" + "testing" + "time" + + "devicecode-go/x/shmring" + "devicecode-go/x/xxhash" +) + +func TestReadLinePreservesBytesAfterNewline(t *testing.T) { + rx := shmring.New(1024) + p := &peer{rx: rx} + input := []byte("{\"type\":\"pub\"}\n{\"type\":\"xfer_need\",\"xfer_id\":\"x\",\"next\":0}\n") + if n := rx.TryWriteFrom(input); n != len(input) { + t.Fatalf("write = %d, want %d", n, len(input)) + } + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + line, err := p.readLine(ctx) + if err != nil { + t.Fatalf("read first line: %v", err) + } + if got, want := string(line), "{\"type\":\"pub\"}"; got != want { + t.Fatalf("first line = %q, want %q", got, want) + } + + line, err = p.readLine(ctx) + if err != nil { + t.Fatalf("read second line: %v", err) + } + want := "{\"type\":\"xfer_need\",\"xfer_id\":\"x\",\"next\":0}" + if got := string(line); got != want { + t.Fatalf("second line = %q, want %q", got, want) + } +} + +func TestReadLinePreservesBytesAfterNewlineAcrossRingWrap(t *testing.T) { + rx := shmring.New(64) + p := &peer{rx: rx} + pad := []byte("012345678901234567890123456789012345678901234567") + if n := rx.TryWriteFrom(pad); n != len(pad) { + t.Fatalf("pad write = %d", n) + } + var discard [64]byte + if n := rx.TryReadInto(discard[:len(pad)]); n != len(pad) { + t.Fatalf("pad read = %d", n) + } + input := []byte("{\"type\":\"a\"}\n{\"type\":\"b\"}\n") + if n := rx.TryWriteFrom(input); n != len(input) { + t.Fatalf("write = %d, want %d", n, len(input)) + } + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + line, err := p.readLine(ctx) + if err != nil { + t.Fatalf("read first line: %v", err) + } + if got, want := string(line), "{\"type\":\"a\"}"; got != want { + t.Fatalf("first line = %q, want %q", got, want) + } + line, err = p.readLine(ctx) + if err != nil { + t.Fatalf("read second line: %v", err) + } + if got, want := string(line), "{\"type\":\"b\"}"; got != want { + 
t.Fatalf("second line = %q, want %q", got, want) + } +} + +func TestPayloadDigestMatchesMaterialisedPayload(t *testing.T) { + for _, size := range []int{0, 1, 15, 16, 17, 255, 256, 257, 1024, 4097} { + buf := make([]byte, size) + for off := 0; off < size; off += chunkSize { + end := off + chunkSize + if end > size { + end = size + } + makePayloadChunk(off, buf[off:end]) + } + got := payloadDigest(size) + want := hex8(xxhash.Sum32(buf, 0)) + if got != want { + t.Fatalf("size %d digest = %s, want %s", size, got, want) + } + } +} + +func TestPayloadChunkMatchesGeneratorOffset(t *testing.T) { + var buf [17]byte + chunk := makePayloadChunk(251, buf[:]) + for i, got := range chunk { + want := payloadByteAt(251 + i) + if got != want { + t.Fatalf("byte %d = %d, want %d", i, got, want) + } + } +} diff --git a/cmd/pico-cm5-emulator/payload_200k.go b/cmd/pico-cm5-emulator/payload_200k.go new file mode 100644 index 0000000..5ac074a --- /dev/null +++ b/cmd/pico-cm5-emulator/payload_200k.go @@ -0,0 +1,5 @@ +//go:build pico_cm5_payload_200k + +package main + +const payloadSize = 200 * 1024 diff --git a/cmd/pico-cm5-emulator/payload_default.go b/cmd/pico-cm5-emulator/payload_default.go new file mode 100644 index 0000000..517ae6d --- /dev/null +++ b/cmd/pico-cm5-emulator/payload_default.go @@ -0,0 +1,5 @@ +//go:build !pico_cm5_payload_200k + +package main + +const payloadSize = 1024 diff --git a/cmd/pico-cm5-emulator/trace_disabled.go b/cmd/pico-cm5-emulator/trace_disabled.go new file mode 100644 index 0000000..e698ba4 --- /dev/null +++ b/cmd/pico-cm5-emulator/trace_disabled.go @@ -0,0 +1,5 @@ +//go:build !pico_cm5_trace + +package main + +const picoCM5TraceEnabled = false diff --git a/cmd/pico-cm5-emulator/trace_enabled.go b/cmd/pico-cm5-emulator/trace_enabled.go new file mode 100644 index 0000000..4a72b13 --- /dev/null +++ b/cmd/pico-cm5-emulator/trace_enabled.go @@ -0,0 +1,5 @@ +//go:build pico_cm5_trace + +package main + +const picoCM5TraceEnabled = true diff --git 
a/docs/gate2-hello-diagnostic.md b/docs/gate2-hello-diagnostic.md new file mode 100644 index 0000000..6d52c02 --- /dev/null +++ b/docs/gate2-hello-diagnostic.md @@ -0,0 +1,81 @@ +# Gate 2 hello/hello_ack diagnostic build + +Use this build only when the CM5 can open `uart0` and Fabric is running, but the +CM5 link remains in `state=hello` and does not observe the MCU peer. + +This is a safe Gate 2 diagnostic build. It uses the hwtest updater backend: + +```text +pico_bb_proto_1 fabric_uart_hwtest fabric_trace uartx_probe +``` + +It must not enable production flash staging or apply/reboot. Do not add +`fabric_stage_enabled` or `fabric_apply_enabled` to this diagnostic run. + +## Flash command + +```sh +tinygo flash -stack-size=8KB -monitor -scheduler tasks \ + -target=pico2 \ + -tags "pico_bb_proto_1 fabric_uart_hwtest fabric_trace uartx_probe" \ + main.go +``` + +The same command is available as: + +```sh +./scripts/flash-mcu-gate2-hello-diag.sh +``` + +## Expected startup markers + +The correct hardware role image should log: + +```text +[uart1] log session opened +[uart0] fabric session opening node=mcu peer=bigbox-cm5 link=mcu-uart0 transfer=stage-controller:hwtest +[uart0] fabric session opened node=mcu peer=bigbox-cm5 link=mcu-uart0 transfer=stage-controller:hwtest +``` + +## What to look for + +When the CM5 current session reports a local SID, for example: + +```text +local_sid=a53d3e09-... +``` + +look in the MCU diagnostic output for the same SID. + +Good receive path: + +```text +[fabric-trace] rx len ... line {"type":"hello",..."sid":"a53d3e09-...","node":"bigbox-cm5"...} +[fabric] sid mcu-sid-... rx_frame type hello ... +``` + +Good reply path: + +```text +[fabric] sid mcu-sid-... tx_frame lane control type hello_ack ... +[fabric-trace] tx len ... line {"type":"hello_ack",..."node":"mcu"...} +[uartx-probe] uart0 reason tx ... tx_acc ... tx_hw ... sess_tx_avail ... 
+``` + +## Interpretation + +If there is no `rx` line for the CM5's current SID, the current CM5 hello is not +reaching the MCU. Check single process ownership of `/dev/ttyAMA0`, wiring, +shared ground, and that the CM5 TX pin reaches MCU UART0 RX GP1. + +If the MCU logs `rx hello` and `tx hello_ack`, but `uartx-probe` does not show +TX progress on `uart0`, the fault is between Fabric and the Go HAL serial TX +path. + +If the MCU logs `rx hello`, `tx hello_ack`, and UARTX TX progress, but the CM5 +still stays in `hello`, the fault is on the reverse physical path: +MCU UART0 TX GP0 to CM5 RX, or in the CM5 tty receive configuration. + +If the MCU logs a `peer_sid` that does not match the CM5's current `local_sid`, +repeat the run after stopping all old CM5 processes, confirming `/dev/ttyAMA0` +has one owner, power-cycling the MCU, and starting the CM5 process again. diff --git a/docs/test-plan.md b/docs/test-plan.md new file mode 100644 index 0000000..ea2e24c --- /dev/null +++ b/docs/test-plan.md @@ -0,0 +1,344 @@ +# Big Box MCU Fabric/update hardware test guide + +This guide is for testing the Pico 2 MCU firmware on real Big Box hardware, with the CM5/Lua Devicecode side acting as the Fabric sender. + +## Current baseline + +The current MCU firmware has passed the following local gates: + +* idle appliance firmware at 3 KB stack; +* two-Pico Fabric transfer testing with a Pico 1 CM5 emulator; +* 200 KiB streamed transfer at 2,048-byte chunks; +* transfer completion through `xfer_done`; +* receiver-driven retry of damaged chunks; +* 512/512 HAL serial session rings; +* yield-free `serial_raw` bounded pump; +* stable large transfer at 6 KB stack. + +Important stack baseline: + +```text +3 KB: idle/non-transfer firmware only +5 KB: known to overflow during transfer +6 KB: tested successfully for transfer +8 KB: diagnostic/probe headroom only +``` + +Use **6 KB** for real Big Box transfer, staging and commit tests. + +## Hardware assumptions + +* MCU target: Pico 2. 
+* TinyGo scheduler: `tasks`. +* USB monitor is connected to the Pico 2 for MCU logs. +* CM5/Lua side is the real Fabric peer and update sender. +* MCU Fabric link is on `uart1`. +* Fabric protocol is JSONL over UART. +* CM5 should see the MCU as node `mcu`. +* The updater target is `updater/main`. +* Normal appliance telemetry continues during the test. + +## General test rules + +Do not start with the commit/reboot build. + +Proceed in this order: + +1. idle boot; +2. transfer-safe hwtest; +3. real flash staging with commit disabled; +4. commit/reboot only after staging succeeds. + +For each MCU build, record: + +* exact TinyGo command; +* full MCU USB monitor log; +* CM5/Lua update job id; +* CM5-side transfer result; +* final MCU software/update state observed by Device service; +* whether the MCU rebooted; +* whether `boot_id` changed after commit. + +Stop and capture the full logs if any of these occur: + +```text +panic: +goroutine stack overflow +repeated Fabric session open/close +allocation grows monotonically over several samples +Fabric transfer does not complete or retry +unexpected reboot before commit +``` + +## Gate 1: idle appliance boot + +Purpose: confirm the normal appliance firmware boots and remains stable. + +```sh +tinygo flash -stack-size=3KB -monitor -scheduler tasks \ + -target=pico2 -tags "pico_bb_proto_1" main.go +``` + +Expected MCU log: + +```text +[updater] policy safe-defaults:apply-disabled +[uart1] fabric session opening node=mcu peer=bigbox-cm5 link=mcu-uart0 transfer=stage-disabled +[uart1] fabric session opened node=mcu peer=bigbox-cm5 link=mcu-uart0 transfer=stage-disabled +``` + +Expected behaviour: + +* no update transfer is accepted; +* `xfer_begin` should be refused or ignored as staging disabled; +* temperature/power logs continue; +* memory remains in a stable band. + +Run for at least 60 seconds. + +## Gate 2: transfer protocol hwtest + +Purpose: test real CM5/Lua Fabric transfer without writing to production flash. 
+ +```sh +tinygo flash -stack-size=6KB -monitor -scheduler tasks \ + -target=pico2 -tags "pico_bb_proto_1 fabric_uart_hwtest" main.go +``` + +Expected MCU log: + +```text +[updater] policy safe-defaults:apply-disabled +[uart1] fabric session opening node=mcu peer=bigbox-cm5 link=mcu-uart0 transfer=stage-controller:hwtest +[uart1] fabric session opened node=mcu peer=bigbox-cm5 link=mcu-uart0 transfer=stage-controller:hwtest +``` + +Expected CM5/Lua behaviour: + +* CM5 sends Fabric `hello`; +* MCU replies `hello_ack`; +* CM5 calls `prepare-update`; +* MCU returns target `updater/main`; +* CM5 streams the update body using `xfer_begin`, `xfer_chunk`, `xfer_commit`; +* MCU returns `xfer_ready`, `xfer_need`, and finally `xfer_done`. + +This gate uses the updater-owned stage-controller path, but the backend is the safe digest/count hwtest sink. It must not reboot and must not write a real staged image. + +Receiver retries are acceptable if the transfer completes. A retry means the MCU detected a bad chunk digest, kept the same offset, and requested that offset again. It is a recovery mechanism, not a failed test. + +Pass criteria: + +```text +prepare-update succeeds +xfer_done is observed +MCU does not reboot +no panic or stack overflow +heartbeat_stop reason transfer_done appears, if OTA diagnostics are enabled +``` + +## Optional Gate 2a: quiet transfer probe + +Use this only if the CM5/Lua transfer does not complete, or if retry behaviour needs explanation. + +```sh +tinygo flash -stack-size=6KB -monitor -scheduler tasks \ + -target=pico2 \ + -tags "pico_bb_proto_1 fabric_uart_hwtest fabric_xfer_probe" \ + main.go +``` + +The probe is intentionally narrower than full Fabric trace. It records: + +```text +begin / begin_ok +chunk_digest_error +corrupt_retry +idle_retry +chunk_stale / chunk_future +commit_rx / commit_start / commit_done +``` + +Avoid `fabric_trace`, `updater_trace`, `ota_trace`, and `uartx_probe` unless debugging a specific low-level fault. 
They materially perturb timing and should not be used for the normal handover test. + +## Gate 3: real flash staging, commit disabled + +Purpose: test production staging of a valid signed `.dcmcu` image without allowing reboot. + +```sh +tinygo flash -stack-size=6KB -monitor -scheduler tasks \ + -target=pico2 -tags "pico_bb_proto_1 fabric_stage_enabled" main.go +``` + +Expected MCU log: + +```text +[updater] policy safe-defaults:apply-disabled +[uart1] fabric session opening node=mcu peer=bigbox-cm5 link=mcu-uart0 transfer=stage-controller:flash-stage +[uart1] fabric session opened node=mcu peer=bigbox-cm5 link=mcu-uart0 transfer=stage-controller:flash-stage +``` + +Expected CM5/Lua behaviour: + +* `prepare-update` succeeds; +* transfer target is `updater/main`; +* CM5 streams a valid signed `.dcmcu` artefact; +* MCU stages the image through the production flash staging path; +* transfer reaches `xfer_done`; +* `commit-update` is refused because apply is disabled. + +Pass criteria: + +```text +xfer_done observed +staging state is visible to CM5/Device service +commit-update is refused safely +MCU does not reboot +no panic or stack overflow +``` + +Do not include `fabric_uart_hwtest` in this gate. That tag deliberately selects the safe digest/count backend instead of production flash staging. + +## Gate 4: real commit and reboot + +Purpose: test the full production update path, including commit and reboot. + +Only run this after Gate 3 has passed on the same hardware, wiring and CM5/Lua sender. 
+ +```sh +tinygo flash -stack-size=6KB -monitor -scheduler tasks \ + -target=pico2 -tags "pico_bb_proto_1 fabric_stage_enabled fabric_apply_enabled" main.go +``` + +Expected MCU log: + +```text +[updater] policy production-applier:commit-reboots +[uart1] fabric session opening node=mcu peer=bigbox-cm5 link=mcu-uart0 transfer=stage-controller:flash-stage +[uart1] fabric session opened node=mcu peer=bigbox-cm5 link=mcu-uart0 transfer=stage-controller:flash-stage +``` + +Expected CM5/Lua behaviour: + +* update is prepared; +* signed `.dcmcu` image is transferred and staged; +* CM5 calls `commit-update`; +* MCU accepts commit; +* MCU reboots; +* after reconnect, CM5 observes the expected new image identity and a changed `boot_id`; +* CM5 update job resolves to succeeded. + +Pass criteria: + +```text +commit-update succeeds +MCU reboots intentionally +new boot_id observed +expected image_id observed +CM5 update job reaches succeeded +``` + +## Recommended CM5/Lua sender settings + +Use the MCU-advertised `max_chunk_size` from `prepare-update`. + +Current expected value: + +```text +max_chunk_size: 2048 +``` + +The sender should treat `xfer_need.next` as authoritative. If the MCU re-requests an earlier offset, resend from that offset. Do not assume monotonically increasing acknowledgements on a UART link. + +Expected sender behaviour: + +```text +send xfer_begin +wait for xfer_ready +wait for xfer_need next=N +send xfer_chunk offset=N +repeat until next == size +send xfer_commit +wait for xfer_done +``` + +A correct sender must tolerate: + +```text +duplicate xfer_need +same-offset retry +ack timeout and resend +session restart with a new peer sid +``` + +## Notes on buffers and retries + +The current MCU transport deliberately uses bounded serial session rings rather than full-frame buffering. + +Current target constraint: + +```text +HAL serial session RX/TX: 512/512 +Fabric chunk size: 2048 +``` + +This means Fabric must behave as a streaming protocol. 
It must not require the HAL serial session ring to hold a complete JSONL transfer frame. + +Occasional chunk retries are acceptable. The important property is that the MCU detects corrupted chunks, does not advance the offset, and requests the same offset again. + +A retry is suspicious only if: + +```text +the same offset repeats many times +transfer never reaches xfer_done +the MCU panics or overflows stack +CM5 sees impossible future offsets +commit occurs without a completed transfer +``` + +## Known current limitations + +* 5 KB stack overflows during transfer. +* 6 KB stack is the current tested transfer baseline. +* Diagnostic probes can perturb UART timing. +* Full `fabric_trace` is too heavy for normal transfer testing. +* `uartx_probe` is for low-level attribution only, not routine gate testing. +* Gate 4 can reboot the MCU and should only be run with a known-good signed image and a planned recovery path. + +## Summary of commands + +Idle only: + +```sh +tinygo flash -stack-size=3KB -monitor -scheduler tasks \ + -target=pico2 -tags "pico_bb_proto_1" main.go +``` + +Safe transfer hwtest: + +```sh +tinygo flash -stack-size=6KB -monitor -scheduler tasks \ + -target=pico2 -tags "pico_bb_proto_1 fabric_uart_hwtest" main.go +``` + +Safe transfer hwtest with quiet probe: + +```sh +tinygo flash -stack-size=6KB -monitor -scheduler tasks \ + -target=pico2 \ + -tags "pico_bb_proto_1 fabric_uart_hwtest fabric_xfer_probe" \ + main.go +``` + +Real staging, no reboot: + +```sh +tinygo flash -stack-size=6KB -monitor -scheduler tasks \ + -target=pico2 -tags "pico_bb_proto_1 fabric_stage_enabled" main.go +``` + +Real staging, commit and reboot: + +```sh +tinygo flash -stack-size=6KB -monitor -scheduler tasks \ + -target=pico2 -tags "pico_bb_proto_1 fabric_stage_enabled fabric_apply_enabled" main.go +``` diff --git a/docs/uartx-probe.md b/docs/uartx-probe.md new file mode 100644 index 0000000..6abc756 --- /dev/null +++ b/docs/uartx-probe.md @@ -0,0 +1,54 @@ +# UARTX probe 
build + +This tree vendors a local copy of `github.com/jangala-dev/tinygo-uartx` under +`third_party/tinygo-uartx` so that the firmware can expose loss-attribution +counters while we debug Fabric transfers over real UART. + +Normal builds use the same UART API. To enable the extra serial diagnostics, +add the `uartx_probe` build tag to the Pico 2 build: + +```sh +tinygo flash -stack-size=8KB -monitor -scheduler tasks \ + -target=pico2 \ + -tags "pico_bb_proto_1 fabric_uart_hwtest fabric_xfer_probe uartx_probe" \ + main.go +``` + +The probe prints compact lines from the HAL `serial_raw` session worker: + +```text +[uartx-probe] uart1 reason periodic rx_hw ... rx_drop ... rx_oe ... rx_fe ... sess_rx_avail ... +``` + +The most useful fields are: + +- `rx_hw`: bytes read from the PL011 data register by the UARTX ISR. +- `rx_enq`: bytes successfully enqueued into UARTX's ISR RX ring. +- `rx_read`: bytes drained from UARTX by the HAL serial session worker. +- `rx_drop`: bytes dropped because the UARTX ISR RX ring was full. +- `rx_oe`, `rx_fe`, `rx_pe`, `rx_be`: PL011 overrun, framing, parity and break errors. +- `rx_max`: maximum observed UARTX ISR RX ring occupancy. +- `sess_rx_avail` / `sess_rx_space`: readable bytes / free space in the HAL session shmring from UART to Fabric. +- `sess_tx_avail` / `sess_tx_space`: readable bytes / free space in the HAL session shmring from Fabric to UART. + +When Fabric logs `chunk_digest_error` with a shortened `encoded_len`, compare the +nearest `[uartx-probe]` lines. If `rx_drop` or `rx_oe` increments, the loss is at +or below the UARTX ISR ring. If those counters remain flat while the HAL session +ring is full, the loss is likely at the session boundary. If all counters remain +flat, look higher in the line assembly / Fabric parser path. + + +## Current bounded-session test + +The Pico 2 board setups now use symmetrical 512-byte HAL serial session rings +for both raw UART devices. The Pico 1 CM5 emulator opens its UART session with +the same 512/512 constraint. 
This is deliberately smaller than a Fabric transfer +line; Fabric must rely on streaming, flow control and retry rather than requiring +the HAL session ring to hold an entire JSONL frame. + +The old 32-byte RX rings came from the earlier raw-JSON telemetry shape and are +now too small for bidirectional Fabric traffic. The 512-byte setting is intended +as a bounded engineering test, not as a hidden full-frame buffer. + +The local UARTX copy should be treated as an instrumentation branch. Once the +cause is confirmed, port only the relevant counter or behavioural fix upstream. diff --git a/go.mod b/go.mod index bbf004f..4452920 100644 --- a/go.mod +++ b/go.mod @@ -3,9 +3,9 @@ module devicecode-go go 1.25.1 require ( - pico2-a-b v0.0.0 github.com/jangala-dev/tinygo-uartx v0.0.0-20251028085354-58b6258234b3 golang.org/x/exp v0.0.0-20251002181428-27f1f14c8bb9 + pico2-a-b v0.0.0 tinygo.org/x/drivers v0.33.0 ) diff --git a/go.sum b/go.sum index a00618c..796115a 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,5 @@ github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4= github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ= -github.com/jangala-dev/tinygo-uartx v0.0.0-20251028085354-58b6258234b3 h1:b6mCDQEeeICoGpsbKyh/kfIRnr2DMK/wACLLi0t8uoU= -github.com/jangala-dev/tinygo-uartx v0.0.0-20251028085354-58b6258234b3/go.mod h1:e3HxjGzBZBIsn/oYvWr707ug3IbkglEyivyYVxHRph4= golang.org/x/exp v0.0.0-20251002181428-27f1f14c8bb9 h1:TQwNpfvNkxAVlItJf6Cr5JTsVZoC/Sj7K3OZv2Pc14A= golang.org/x/exp v0.0.0-20251002181428-27f1f14c8bb9/go.mod h1:TwQYMMnGpvZyc+JpB/UAuTNIsVJifOlSkrZkhcvpVUk= tinygo.org/x/drivers v0.33.0 h1:5r8Ab0IxjWQi7LzYLNWpya6U4nedo9ZtxeMaAzrJTG8= diff --git a/main.go b/main.go index 182def0..1a77591 100644 --- a/main.go +++ b/main.go @@ -7,26 +7,14 @@ import ( "devicecode-go/bus" "devicecode-go/services/hal" "devicecode-go/services/reactor" - 
"devicecode-go/services/updater" "devicecode-go/types" "devicecode-go/utilities" - "pico2-a-b/abupdate" ) // HAL const halTimeout = 5 * time.Second - var halReadiness = bus.T("hal", "state") -// Firmware identity is set by host build tooling before main runs. The e2e -// harness generates a same-package init file because TinyGo's -X support is -// narrower than the standard Go linker's support. -var ( - FirmwareVersion = "0.0.0-dev" - FirmwareBuild = "local" - FirmwareImageID = "img-dev" -) - // ----------------------------------------------------------------------------- // Main // ----------------------------------------------------------------------------- @@ -36,20 +24,10 @@ func main() { time.Sleep(3 * time.Second) log.SetStart(time.Now()) - bootBuyRC := abupdate.CheckAndBuy() - if bootBuyRC != 0 { - log.Println("[main] abupdate CheckAndBuy rc =", bootBuyRC) - } - ctx := context.Background() log.Println("[main] bootstrapping bus …") - // Queue length must cover the retained replay burst when fabric - // subscribes to wildcard export patterns (hal/cap/env/#, - // hal/cap/power/#). Each capability publishes retained info + - // status + value; pico_bb_proto_1 has ~26 retained topics across - // env and power domains. 32 provides margin for growth. - b := bus.NewBus(32, "+", "#") + b := bus.NewBus(3, "+", "#") halConn := b.NewConnection("hal") uiConn := b.NewConnection("ui") @@ -64,17 +42,8 @@ func main() { } } - // boot_id: generate AFTER HAL ready and BEFORE the reactor opens - // fabric. RAM-only — never persisted. 
- bootID := updater.GenerateBootID() - log.Println("[main] boot_id =", bootID) - - reactor.FirmwareVersion = FirmwareVersion - reactor.FirmwareBuild = FirmwareBuild - reactor.FirmwareImageID = FirmwareImageID - // Reactor - r := reactor.NewReactorWithOptions(b, uiConn, reactor.Options{BootBuyRC: bootBuyRC}) + r := reactor.NewReactor(uiConn) r.Run(ctx) } @@ -102,4 +71,4 @@ func waitHALReady(ctx context.Context, c *bus.Connection, d time.Duration) bool } // Global logger instance -var log = utilities.Logger{LineStart: true} +var log = utilities.Logger{LineStart: true} \ No newline at end of file diff --git a/services/fabric/buffers.go b/services/fabric/buffers.go new file mode 100644 index 0000000..e355614 --- /dev/null +++ b/services/fabric/buffers.go @@ -0,0 +1,47 @@ +package fabric + +// MaxAcceptedChunkSize is the v1 MCU receive-side raw-byte limit for one +// xfer_chunk. The sender chooses the actual chunk size; the MCU only enforces +// this upper bound. The fabric-jsonl/1 release contract requires accepting at +// least 2048 raw bytes per chunk. +const MaxAcceptedChunkSize = 2048 + +// maxChunkBase64Len is base64.RawURLEncoding.EncodedLen(MaxAcceptedChunkSize). +// Kept as a constant so FabricBuffers can be fully static on TinyGo. +const maxChunkBase64Len = 2731 + +// FabricBuffers owns all fixed-size scratch storage used by one MCU Fabric +// session. Allocate this once at the top level, or once per session in host +// tests. Transfer-sized buffers must not be constructed in the per-frame or +// per-chunk hot path. +type FabricBuffers struct { + // TransportLine is used by shmring transport while assembling one line from + // the UART RX ring. The session read loop copies completed lines into a free + // RXLines slot before handing them to the reactor goroutine. + TransportLine [maxLineLen]byte + + // RXLines backs the bounded reader queue between the blocking transport + // reader goroutine and the session reactor. 
Ownership of each slot is + // explicit via the free-slot channel in session.run. + RXLines [lineQueueSize][maxLineLen]byte + + // ChunkRaw receives the decoded raw bytes for one inbound xfer_chunk. + ChunkRaw [MaxAcceptedChunkSize]byte + + // ChunkB64 is available to future sender-side tests or MCU-originated bulk + // frames without allocating a per-chunk base64 buffer. + ChunkB64 [maxChunkBase64Len]byte +} + +func NewFabricBuffers() *FabricBuffers { return &FabricBuffers{} } + +func ensureFabricBuffers(b *FabricBuffers) *FabricBuffers { + if b != nil { + return b + } + return NewFabricBuffers() +} + +type boundedLineTransport interface { + ReadLineInto(dst []byte) (int, error) +} diff --git a/services/fabric/buffers_alloc_test.go b/services/fabric/buffers_alloc_test.go new file mode 100644 index 0000000..53e5d9e --- /dev/null +++ b/services/fabric/buffers_alloc_test.go @@ -0,0 +1,86 @@ +package fabric + +import ( + "bytes" + "encoding/base64" + "testing" + + "devicecode-go/x/shmring" +) + +func TestDecodeChunkDataUsesFixedBuffersAcrossManyChunks(t *testing.T) { + cfg := DefaultLinkConfig() + cfg.applyDefaults() + s := session{cfg: cfg, buffers: NewFabricBuffers()} + raw := bytes.Repeat([]byte{0x5a}, int(cfg.MaxAcceptedChunkSize)) + encoded := base64.RawURLEncoding.EncodeToString(raw) + + allocs := testing.AllocsPerRun(100, func() { + for i := 0; i < 64; i++ { + got, errStr := s.decodeChunkData(encoded) + if errStr != "" { + t.Fatalf("decodeChunkData error = %s", errStr) + } + if len(got) != len(raw) || got[0] != raw[0] || got[len(got)-1] != raw[len(raw)-1] { + t.Fatalf("decoded chunk mismatch") + } + } + }) + if allocs > 0 { + t.Fatalf("decodeChunkData allocations per 64 chunks = %.2f, want zero", allocs) + } +} + +func TestDecodeChunkDataRejectsOversizeBeforeDecode(t *testing.T) { + cfg := DefaultLinkConfig() + cfg.applyDefaults() + s := session{cfg: cfg, buffers: NewFabricBuffers()} + raw := bytes.Repeat([]byte{0xa5}, int(cfg.MaxAcceptedChunkSize)+1) + encoded 
:= base64.RawURLEncoding.EncodeToString(raw) + + got, errStr := s.decodeChunkData(encoded) + if got != nil || errStr != "chunk_too_large" { + t.Fatalf("decodeChunkData oversize = (%v, %q), want chunk_too_large", got, errStr) + } +} + +func TestShmringTransportReadLineIntoUsesCallerBuffer(t *testing.T) { + rx := shmringForFabricTest(t, 256) + tx := shmringForFabricTest(t, 256) + tr := NewShmringTransportWithBuffers(rx, tx, NewFabricBuffers()) + defer tr.Close() + + line := []byte(`{"type":"ping","sid":"s"}`) + writeRingForFabricTest(t, rx, append(line, '\n')) + + var dst [maxLineLen]byte + n, err := tr.ReadLineInto(dst[:]) + if err != nil { + t.Fatalf("ReadLineInto error = %v", err) + } + if string(dst[:n]) != string(line) { + t.Fatalf("ReadLineInto = %q, want %q", string(dst[:n]), string(line)) + } +} + +func shmringForFabricTest(t *testing.T, size int) *shmring.Ring { + t.Helper() + return shmring.New(size) +} + +func writeRingForFabricTest(t *testing.T, r *shmring.Ring, data []byte) { + t.Helper() + written := 0 + for written < len(data) { + p1, p2 := r.WriteAcquire() + if len(p1)+len(p2) == 0 { + t.Fatalf("ring full while writing test data") + } + n := copy(p1, data[written:]) + if n < len(data)-written && len(p2) > 0 { + n += copy(p2, data[written+n:]) + } + r.WriteCommit(n) + written += n + } +} diff --git a/services/fabric/counters.go b/services/fabric/counters.go new file mode 100644 index 0000000..8f13c25 --- /dev/null +++ b/services/fabric/counters.go @@ -0,0 +1,21 @@ +package fabric + +// FabricCounters is the compact normal-build diagnostic surface for the MCU +// Fabric link. Counters are updated by the session reactor and published with +// retained link state; they replace per-frame/per-chunk logging in release +// builds. 
+type FabricCounters struct { + RXLines uint64 `json:"rx_lines"` + RXLineTooLong uint64 `json:"rx_line_too_long"` + RXBadJSON uint64 `json:"rx_bad_json"` + RXFrames uint64 `json:"rx_frames"` + TXFrames uint64 `json:"tx_frames"` + TransferBegins uint64 `json:"transfer_begins"` + TransferChunks uint64 `json:"transfer_chunks"` + TransferBytes uint64 `json:"transfer_bytes"` + TransferDecodeErrors uint64 `json:"transfer_decode_errors"` + TransferDigestErrors uint64 `json:"transfer_digest_errors"` + TransferOffsetRetries uint64 `json:"transfer_offset_retries"` + TransferAborts uint64 `json:"transfer_aborts"` + TransferCompletions uint64 `json:"transfer_completions"` +} diff --git a/services/fabric/fabric.go b/services/fabric/fabric.go index 3a62b7e..1a40d32 100644 --- a/services/fabric/fabric.go +++ b/services/fabric/fabric.go @@ -25,10 +25,10 @@ const defaultLinkID = "mcu-uart0" // MCU-facing link. Missing fields fall back to release defaults via // applyDefaults so callers can pass `LinkConfig{}` to mean "release". type LinkConfig struct { - // ChunkSize is the expected raw-byte payload per xfer_chunk. The MCU - // is receive-only for transfers, so this is informational/validation - // only on the Go side. Release: 2048 bytes. - ChunkSize uint32 + // MaxAcceptedChunkSize is the receive-side upper bound for the raw-byte + // payload in one xfer_chunk. The sender owns the actual chunk size; the MCU + // must accept at least 2048 bytes for fabric-jsonl/1 v1. Release: 2048 bytes. + MaxAcceptedChunkSize uint32 // PhaseTimeout is the idle-chunk watchdog: an active inbound transfer // is aborted with reason="timeout" if no xfer_chunk arrives within // this window. Mirrors transfer_mgr.lua's `phase_timeout`. @@ -43,6 +43,11 @@ type LinkConfig struct { // this window once established. Mirrors session_ctl.lua's // liveness_timeout_s. Release: 30s. 
LivenessTimeout time.Duration + // TargetCallTimeout is the local updater/main stage RPC deadline after + // xfer_commit has verified the wire transfer. The Fabric session owns this + // as pending operation state; it must not block the reactor loop. + // Release: 5s. + TargetCallTimeout time.Duration // MaxInboundHelpers caps the number of in-flight inbound RPC calls. // Excess inbound calls reply `{ok=false, err="busy"}` per // rpc_bridge.lua's `spawn_local_call_helper`. Lua default is 64 @@ -57,20 +62,21 @@ type LinkConfig struct { func DefaultLinkConfig() LinkConfig { return LinkConfig{ - ChunkSize: 2048, - PhaseTimeout: 15 * time.Second, - PingInterval: 10 * time.Second, - LivenessTimeout: 30 * time.Second, - MaxInboundHelpers: 64, - RPCQuantum: 4, - BulkQuantum: 1, + MaxAcceptedChunkSize: MaxAcceptedChunkSize, + PhaseTimeout: 15 * time.Second, + PingInterval: 10 * time.Second, + LivenessTimeout: 30 * time.Second, + TargetCallTimeout: 5 * time.Second, + MaxInboundHelpers: 64, + RPCQuantum: 4, + BulkQuantum: 1, } } func (c *LinkConfig) applyDefaults() { d := DefaultLinkConfig() - if c.ChunkSize == 0 { - c.ChunkSize = d.ChunkSize + if c.MaxAcceptedChunkSize == 0 { + c.MaxAcceptedChunkSize = d.MaxAcceptedChunkSize } if c.PhaseTimeout == 0 { c.PhaseTimeout = d.PhaseTimeout @@ -81,6 +87,9 @@ func (c *LinkConfig) applyDefaults() { if c.LivenessTimeout == 0 { c.LivenessTimeout = d.LivenessTimeout } + if c.TargetCallTimeout == 0 { + c.TargetCallTimeout = d.TargetCallTimeout + } if c.MaxInboundHelpers == 0 { c.MaxInboundHelpers = d.MaxInboundHelpers } @@ -102,6 +111,26 @@ func newLocalSID() string { return "mcu-sid-" + bootID + "-" + strconvx.Utoa64(nextSessionID.Add(1)) } +// StageController is Fabric's narrow boundary to an updater/main staging +// owner. Fabric submits transfer bytes and observes command results; it does +// not own updater state or flash/verifier work. 
+type StageController interface { + BeginStreamedStage(xferID string, size uint32) (uint64, error) + WriteStreamedStage(xferID string, generation uint64, data []byte) error + CommitStreamedStage(xferID string, generation uint64) (uint32, error) + AbortStreamedStage(xferID string, generation uint64, reason string) + CancelStreamedStage(xferID string, generation uint64, reason string) +} + +// RunOptions carries optional dependencies that do not belong in the wire +// LinkConfig. Keeping the updater staging controller here makes the local +// Fabric-to-Updater boundary explicit; Fabric no longer locates the updater +// service through package-global state. +type RunOptions struct { + Buffers *FabricBuffers + StageController StageController +} + // Run starts the fabric session. Blocks until ctx is cancelled or the // transport returns an unrecoverable error. The MCU is a hello // responder (CM5 always initiates hello/hello_ack), but otherwise @@ -110,14 +139,24 @@ func newLocalSID() string { // arrives within LivenessTimeout. Mirrors session_ctl.lua at // devicecode-lua@2c88090. 
func Run(ctx context.Context, tr Transport, conn *bus.Connection, nodeID, peerID string, cfg LinkConfig) { + RunWithBuffers(ctx, tr, conn, nodeID, peerID, cfg, nil) +} + +func RunWithBuffers(ctx context.Context, tr Transport, conn *bus.Connection, nodeID, peerID string, cfg LinkConfig, buffers *FabricBuffers) { + RunWithOptions(ctx, tr, conn, nodeID, peerID, cfg, RunOptions{Buffers: buffers}) +} + +func RunWithOptions(ctx context.Context, tr Transport, conn *bus.Connection, nodeID, peerID string, cfg LinkConfig, opts RunOptions) { s := session{ - linkID: defaultLinkID, - nodeID: nodeID, - peerID: peerID, - localSID: newLocalSID(), - tr: tr, - conn: conn, - cfg: cfg, + linkID: defaultLinkID, + nodeID: nodeID, + peerID: peerID, + localSID: newLocalSID(), + tr: tr, + conn: conn, + cfg: cfg, + stageController: opts.StageController, + buffers: ensureFabricBuffers(opts.Buffers), } s.run(ctx) } diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index b326e4c..d914771 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -153,6 +153,31 @@ func TestCodecAllTypes(t *testing.T) { } } +func TestFastStringDecoderAcceptsEscapedSlashProtocol(t *testing.T) { + line := []byte(`{"node":"bigbox-cm5","type":"hello","identity":{"id":"bigbox-cm5","role":"controller"},"proto":"fabric-jsonl\/1","sid":"s1"}`) + if got := protoType(line); got != msgHello { + t.Fatalf("protoType = %q", got) + } + msg, ok := decodeHelloFast(line) + if !ok { + t.Fatalf("decodeHelloFast rejected escaped slash proto") + } + if msg.Proto != protocolName || msg.SID != "s1" || msg.Node != "bigbox-cm5" { + t.Fatalf("bad hello: %+v", msg) + } +} + +func TestFastStringArrayDecoderAcceptsEscapedValues(t *testing.T) { + line := []byte(`{"type":"call","id":"c1","topic":["cap","update-manager","main","rpc","commit-job\/test"],"payload":{}}`) + msg, ok := decodeCallFast(line) + if !ok { + t.Fatalf("decodeCallFast rejected escaped topic") + } + if len(msg.Topic) != 5 
|| msg.Topic[4] != "commit-job/test" { + t.Fatalf("topic = %#v", msg.Topic) + } +} + func TestWireTypeBadInput(t *testing.T) { for _, b := range [][]byte{[]byte("not json"), []byte(`{"no_type":true}`), nil} { if got := protoType(b); got != "" { @@ -229,7 +254,7 @@ func TestOversizeLineRecovery(t *testing.T) { } func TestReleaseTransferChunkFitsLineLimit(t *testing.T) { - raw := bytes.Repeat([]byte{'x'}, int(DefaultLinkConfig().ChunkSize)) + raw := bytes.Repeat([]byte{'x'}, int(DefaultLinkConfig().MaxAcceptedChunkSize)) line := marshal(protoXferChunk{ Type: msgXferChunk, XferID: "xfer-line-limit", @@ -426,8 +451,8 @@ func TestDuplicateSameSIDHelloRefreshesWithoutReset(t *testing.T) { peerSID: "s1", peerNode: "bigbox-cm5", incomingTransfer: &incomingTransfer{ - meta: transferMeta{ID: "xfer-1"}, - sink: sink, + meta: transferMeta{ID: "xfer-1"}, + worker: newTransferSinkWorker("xfer-1", sink), }, } @@ -463,8 +488,8 @@ func TestDuplicateSameSIDHelloAckRefreshesWithoutReset(t *testing.T) { peerSID: "s1", peerNode: "bigbox-cm5", incomingTransfer: &incomingTransfer{ - meta: transferMeta{ID: "xfer-1"}, - sink: sink, + meta: transferMeta{ID: "xfer-1"}, + worker: newTransferSinkWorker("xfer-1", sink), }, } @@ -943,11 +968,10 @@ func TestInboundCallBusyAtCapacity(t *testing.T) { // First call holds the only helper slot. The bus has no handler, so // the call sits as a pending request until timeout. sendMsg(t, cm5, protoCall{ - Type: msgCall, - ID: "c1", - Topic: []string{"rpc", "test", "noop"}, - Payload: json.RawMessage(`{}`), - TimeoutMs: 5000, + Type: msgCall, + ID: "c1", + Topic: []string{"rpc", "test", "noop"}, + Payload: json.RawMessage(`{}`), }) // Second call arrives while the helper is full → busy reply. 
@@ -1251,7 +1275,7 @@ func TestDrainExportsWaitsForStartupHoldoff(t *testing.T) { } } -func TestDrainExportsPausesDuringIncomingTransfer(t *testing.T) { +func TestDrainExportsContinuesDuringIncomingTransfer(t *testing.T) { b := newBus() fabricConn := b.NewConnection("fabric") pubConn := b.NewConnection("publisher") @@ -1274,19 +1298,12 @@ func TestDrainExportsPausesDuringIncomingTransfer(t *testing.T) { )) s.drainExports() - if len(tr.writes) != 0 { - t.Fatalf("writes during transfer = %d, want 0", len(tr.writes)) - } - - s.incomingTransfer = nil - s.drainExports() - if len(tr.writes) != 1 { - t.Fatalf("writes after transfer = %d, want 1", len(tr.writes)) + t.Fatalf("writes during transfer = %d, want 1", len(tr.writes)) } } -func TestDrainExportsPausesAfterPrepareCall(t *testing.T) { +func TestDrainExportsContinuesAfterPrepareCall(t *testing.T) { b := newBus() fabricConn := b.NewConnection("fabric") pubConn := b.NewConnection("publisher") @@ -1332,32 +1349,22 @@ func TestDrainExportsPausesAfterPrepareCall(t *testing.T) { )) s.drainExports() - if len(tr.writes) != 0 { - t.Fatalf("writes during prepare quiet = %d, want 0", len(tr.writes)) - } - - s.transferQuietUntil = time.Time{} - s.transferQuietReason = "" - s.drainExports() - if len(tr.writes) != 1 { - t.Fatalf("writes after prepare quiet = %d, want 1", len(tr.writes)) + t.Fatalf("writes after prepare call = %d, want 1", len(tr.writes)) } } -func TestDrainExportsAllowsOnlyCriticalFactsDuringPostTransferQuiet(t *testing.T) { +func TestDrainExportsDoesNotUsePostTransferQuietWindow(t *testing.T) { b := bus.NewBus(16, "+", "#") fabricConn := b.NewConnection("fabric") pubConn := b.NewConnection("publisher") tr := &captureTransport{} s := session{ - conn: fabricConn, - tr: tr, - link: linkUp, - exportsEnabled: true, - exportReadyAt: time.Now().Add(-time.Second), - transferQuietUntil: time.Now().Add(time.Second), - transferQuietReason: "xfer_done", + conn: fabricConn, + tr: tr, + link: linkUp, + exportsEnabled: true, + 
exportReadyAt: time.Now().Add(-time.Second), } s.setupExports() @@ -1387,8 +1394,8 @@ func TestDrainExportsAllowsOnlyCriticalFactsDuringPostTransferQuiet(t *testing.T for i := 0; i < len(criticalExportTopics)+4; i++ { s.drainExports() } - if len(tr.writes) != len(criticalExportTopics) { - t.Fatalf("writes during post-transfer quiet = %d, want %d critical facts", + if len(tr.writes) < len(criticalExportTopics) { + t.Fatalf("writes after transfer = %d, want at least %d critical facts", len(tr.writes), len(criticalExportTopics)) } want := [][]string{ @@ -1527,6 +1534,51 @@ func TestDrainCriticalExportsCoalescesLatestRetainedFact(t *testing.T) { } } +func TestDrainExportsCoalescesQueuedRetainedTelemetry(t *testing.T) { + b := bus.NewBus(16, "+", "#") + fabricConn := b.NewConnection("fabric") + pubConn := b.NewConnection("publisher") + tr := &captureTransport{} + sub := fabricConn.Subscribe(bus.T("state", "self", "runtime", "#")) + defer fabricConn.Unsubscribe(sub) + + s := session{ + conn: fabricConn, + tr: tr, + link: linkUp, + exportsEnabled: true, + exportReadyAt: time.Now().Add(-time.Second), + exportSubs: []*bus.Subscription{sub}, + } + + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "runtime", "memory"), + map[string]int{"seq": 1}, + true, + )) + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "runtime", "memory"), + map[string]int{"seq": 2}, + true, + )) + + s.drainExports() + if len(tr.writes) != 1 { + t.Fatalf("writes = %d, want one coalesced retained export", len(tr.writes)) + } + pub := decodePubWrite(t, tr.writes[0]) + if !slicesEqual(pub.Topic, []string{"state", "self", "runtime", "memory"}) { + t.Fatalf("topic = %v, want state/self/runtime/memory", pub.Topic) + } + var payload map[string]int + if err := json.Unmarshal(pub.Payload, &payload); err != nil { + t.Fatalf("payload unmarshal: %v", err) + } + if payload["seq"] != 2 { + t.Fatalf("payload seq = %d, want latest seq 2", payload["seq"]) + } +} + func 
TestReadyWaitsForQueuedCriticalReplayAdmission(t *testing.T) { b := bus.NewBus(16, "+", "#") fabricConn := b.NewConnection("fabric") @@ -1837,21 +1889,19 @@ func TestPongAllowedDuringIncomingTransfer(t *testing.T) { } } -func TestPongAllowedDuringPrepareQuietForEstablishedPeer(t *testing.T) { +func TestPongAllowedForEstablishedPeerWithoutQuietWindow(t *testing.T) { tr := &captureTransport{} s := session{ - tr: tr, - link: linkUp, - localSID: "mcu-sid-test", - peerSID: "cm5-sid", - transferQuietUntil: time.Now().Add(time.Second), - transferQuietReason: "prepare_call_rx", + tr: tr, + link: linkUp, + localSID: "mcu-sid-test", + peerSID: "cm5-sid", } s.onPing(&protoPing{Type: msgPing, SID: "cm5-sid"}) if len(tr.writes) != 1 { - t.Fatalf("pong writes during prepare quiet = %d, want 1", len(tr.writes)) + t.Fatalf("pong writes = %d, want 1", len(tr.writes)) } var pong protoPong if err := json.Unmarshal(tr.writes[0], &pong); err != nil { @@ -1862,15 +1912,13 @@ func TestPongAllowedDuringPrepareQuietForEstablishedPeer(t *testing.T) { } } -func TestPongRejectsWrongSIDDuringPrepareQuiet(t *testing.T) { +func TestPongRejectsWrongSIDWithoutQuietWindow(t *testing.T) { tr := &captureTransport{} s := session{ - tr: tr, - link: linkUp, - localSID: "mcu-sid-test", - peerSID: "cm5-sid", - transferQuietUntil: time.Now().Add(time.Second), - transferQuietReason: "prepare_call_rx", + tr: tr, + link: linkUp, + localSID: "mcu-sid-test", + peerSID: "cm5-sid", } s.onPing(&protoPing{Type: msgPing, SID: "other-sid"}) @@ -1910,15 +1958,13 @@ func TestWrongSIDPingPongDoNotRefreshLiveness(t *testing.T) { } } -func TestPongRejectsSelfSIDDuringPrepareQuiet(t *testing.T) { +func TestPongRejectsSelfSIDWithoutQuietWindow(t *testing.T) { tr := &captureTransport{} s := session{ - tr: tr, - link: linkUp, - localSID: "mcu-sid-test", - peerSID: "mcu-sid-test", - transferQuietUntil: time.Now().Add(time.Second), - transferQuietReason: "prepare_call_rx", + tr: tr, + link: linkUp, + localSID: "mcu-sid-test", + 
peerSID: "mcu-sid-test", } s.onPing(&protoPing{Type: msgPing, SID: "mcu-sid-test"}) @@ -1999,7 +2045,7 @@ func TestCallIgnoredBeforeHandshake(t *testing.T) { sendMsg(t, cm5, protoCall{ Type: "call", ID: "pre-hello-1", Topic: []string{"rpc", "hal", "dump"}, - Payload: json.RawMessage(`{}`), TimeoutMs: 5000, + Payload: json.RawMessage(`{}`), }) select { @@ -2031,7 +2077,7 @@ func TestCallImport(t *testing.T) { sendMsg(t, cm5, protoCall{ Type: "call", ID: "test-corr-1", Topic: []string{"cap", "self", "updater", "main", "rpc", "prepare-update"}, - Payload: json.RawMessage(`{"job_id":"job-prepare","expected_image_id":"mcu-dev-15.3"}`), TimeoutMs: 5000, + Payload: json.RawMessage(`{"job_id":"job-prepare","expected_image_id":"mcu-dev-15.3"}`), }) reply := readMsg[protoReply](t, cm5) @@ -2044,7 +2090,7 @@ func TestCallImport(t *testing.T) { lines := diag.snapshot() assertDiagContains(t, lines, "[fabric-rpc]", "ev call_rx", "call_id test-corr-1", "job_id job-prepare", "expected_image_id mcu-dev-15.3") assertDiagContains(t, lines, "[fabric-rpc]", "ev call_route_ok", "local_topic rpc/updater/prepare") - assertDiagContains(t, lines, "[fabric-rpc]", "ev call_dispatch_start", "timeout_ms 5000") + assertDiagContains(t, lines, "[fabric-rpc]", "ev call_dispatch_start") waitDiagContains(t, diag, "[fabric-rpc]", "ev call_reply_tx", "ok true", "sent true") } @@ -2058,7 +2104,7 @@ func TestCallNoRoute(t *testing.T) { sendMsg(t, cm5, protoCall{ Type: "call", ID: "no-route-1", Topic: []string{"unknown", "endpoint"}, - Payload: json.RawMessage(`{}`), TimeoutMs: 1000, + Payload: json.RawMessage(`{}`), }) reply := readMsg[protoReply](t, cm5) diff --git a/services/fabric/mcu_update_flow_test.go b/services/fabric/mcu_update_flow_test.go new file mode 100644 index 0000000..a6e06fc --- /dev/null +++ b/services/fabric/mcu_update_flow_test.go @@ -0,0 +1,333 @@ +package fabric + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "io" + "strings" + "sync" + "testing" + "time" + + 
"devicecode-go/services/updater" +) + +type integrationVerifier struct { + mu sync.Mutex + want []byte + got []byte + manifest updater.Manifest + err error +} + +func (v *integrationVerifier) Verify(r io.Reader, sink updater.SlotSink) (updater.Manifest, error) { + data, err := io.ReadAll(r) + if err != nil { + if sink != nil { + _ = sink.Abort() + } + return updater.Manifest{}, err + } + v.mu.Lock() + v.got = append([]byte(nil), data...) + want := append([]byte(nil), v.want...) + verr := v.err + v.mu.Unlock() + if verr != nil { + if sink != nil { + _ = sink.Abort() + } + return updater.Manifest{}, verr + } + if want != nil && !bytes.Equal(data, want) { + if sink != nil { + _ = sink.Abort() + } + return updater.Manifest{}, errors.New("artefact_bytes_mismatch") + } + if sink != nil { + if _, err := sink.Write(data); err != nil { + return updater.Manifest{}, err + } + if err := sink.Commit(); err != nil { + return updater.Manifest{}, err + } + } + return v.manifest, nil +} + +func (v *integrationVerifier) bytesSeen() []byte { + v.mu.Lock() + defer v.mu.Unlock() + return append([]byte(nil), v.got...) 
+} + +type integrationApplier struct { + mu sync.Mutex + canCalls []updater.StagedDescriptor + rebootCalls []updater.StagedDescriptor + rebootCh chan updater.StagedDescriptor +} + +func (a *integrationApplier) CanApply(d updater.StagedDescriptor) error { + a.mu.Lock() + defer a.mu.Unlock() + a.canCalls = append(a.canCalls, d) + return nil +} + +func (a *integrationApplier) ArmReboot(d updater.StagedDescriptor) error { + a.mu.Lock() + a.rebootCalls = append(a.rebootCalls, d) + ch := a.rebootCh + a.mu.Unlock() + if ch != nil { + select { + case ch <- d: + default: + } + } + return nil +} + +func (a *integrationApplier) counts() (int, int) { + a.mu.Lock() + defer a.mu.Unlock() + return len(a.canCalls), len(a.rebootCalls) +} + +func waitForMcuUpdateDone(t *testing.T, tr Transport, id string) { + t.Helper() + deadline := time.Now().Add(2 * time.Second) + for { + if time.Now().After(deadline) { + t.Fatalf("timed out waiting for xfer_done id=%s", id) + } + line, err := tr.ReadLine() + if err != nil { + t.Fatalf("ReadLine: %v", err) + } + var probe struct { + Type string `json:"type"` + XferID string `json:"xfer_id"` + Err string `json:"err"` + } + if err := json.Unmarshal(line, &probe); err != nil { + t.Fatalf("Unmarshal %q: %v", line, err) + } + switch probe.Type { + case msgXferDone: + if probe.XferID != id { + t.Fatalf("xfer_done id = %q, want %q", probe.XferID, id) + } + return + case msgXferAbort: + t.Fatalf("transfer aborted while waiting for done: %+v", probe) + } + } +} + +func sendMcuUpdateArtefact(t *testing.T, tr Transport, id string, payload []byte, chunkSizes ...int) { + t.Helper() + sendMsg(t, tr, xferBegin(id, payload, nil)) + readTransferReady(t, tr, id, 0) + writeRawLine(t, tr, `{"type":"unknown_noise","ignored":true}`) + off := 0 + for len(payload[off:]) > 0 { + n := len(payload) - off + if len(chunkSizes) > 0 { + n = chunkSizes[0] + chunkSizes = chunkSizes[1:] + if n > len(payload)-off { + n = len(payload) - off + } + } + part := payload[off : off+n] + 
sendMsg(t, tr, xferChunk(id, uint32(off), part)) + off += n + readTransferNeed(t, tr, id, uint32(off)) + } + sendMsg(t, tr, xferCommit(id, payload)) + waitForMcuUpdateDone(t, tr, id) +} + +func TestMCUUpdateFullWirePathStagesAndCommitsReboot(t *testing.T) { + b := newBus() + caller := b.NewConnection("caller") + observer := b.NewConnection("observer") + upSub := observer.Subscribe(updater.TopicUpdaterFact) + defer observer.Unsubscribe(upSub) + + payload := []byte("signed-envelope-and-payload-for-mcu") + manifest := updater.Manifest{ + Version: "2.0.0", + BuildID: "build-2.0.0", + ImageID: "mcu-image-new", + PayloadSHA256: strings.Repeat("c", 64), + PayloadLength: uint32(len(payload)), + } + verif := &integrationVerifier{want: payload, manifest: manifest} + memMD := updater.NewMemoryMetadata() + app := &integrationApplier{rebootCh: make(chan updater.StagedDescriptor, 1)} + cancelUpdater, updaterSvc := runUpdaterForFabricTest(t, b, updater.Options{ + Verifier: verif, + Applier: app, + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancelUpdater() + prepareUpdaterForFabricTest(t, caller) + + cm5, mcu := pipePair() + ctx, cancelFabric := context.WithCancel(context.Background()) + defer cancelFabric() + go RunWithOptions(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig(), RunOptions{StageController: updaterSvc}) + bringUp(t, cm5) + + sendMcuUpdateArtefact(t, cm5, "xfer-full-path", payload, 7, 5) + waitUpdaterFactForFabricTest(t, upSub, func(f updater.UpdaterFact) bool { return f.State == updater.StateStaged }) + if got := verif.bytesSeen(); !bytes.Equal(got, payload) { + t.Fatalf("verifier saw %q, want %q", got, payload) + } + desc, ok := memMD.StagedDescriptor() + if !ok { + t.Fatal("staged descriptor not persisted") + } + if desc.Version != manifest.Version || desc.ImageID != manifest.ImageID || desc.PayloadSHA256 != manifest.PayloadSHA256 || desc.Length != manifest.PayloadLength { + t.Fatalf("staged descriptor = %+v, want manifest 
%+v", desc, manifest) + } + + payloadReply := requestUpdaterForFabricTest(t, caller, updater.TopicCommitRPC, updater.CommitRequest{}) + commit, ok := payloadReply.(updater.CommitReply) + if !ok || !commit.Accepted || !commit.RebootRequired { + t.Fatalf("commit reply = %#v, want accepted reboot_required", payloadReply) + } + select { + case rebootDesc := <-app.rebootCh: + if rebootDesc.ImageID != manifest.ImageID || rebootDesc.Version != manifest.Version { + t.Fatalf("reboot descriptor = %+v, want image %s version %s", rebootDesc, manifest.ImageID, manifest.Version) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for ArmReboot") + } + waitUpdaterFactForFabricTest(t, upSub, func(f updater.UpdaterFact) bool { return f.State == updater.StateRebooting }) + can, reboot := app.counts() + if can != 1 || reboot != 1 { + t.Fatalf("applier calls: CanApply=%d ArmReboot=%d, want 1 and 1", can, reboot) + } +} + +func TestMCUUpdateFullWirePathCommitRejectsExpectedImageMismatch(t *testing.T) { + b := newBus() + caller := b.NewConnection("caller") + payload := []byte("firmware-bytes") + manifest := updater.Manifest{ + Version: "2.1.0", + BuildID: "build-2.1.0", + ImageID: "mcu-image-real", + PayloadSHA256: strings.Repeat("d", 64), + PayloadLength: uint32(len(payload)), + } + memMD := updater.NewMemoryMetadata() + app := &integrationApplier{rebootCh: make(chan updater.StagedDescriptor, 1)} + cancelUpdater, updaterSvc := runUpdaterForFabricTest(t, b, updater.Options{ + Verifier: &integrationVerifier{want: payload, manifest: manifest}, + Applier: app, + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancelUpdater() + prepareUpdaterForFabricTest(t, caller) + + cm5, mcu := pipePair() + ctx, cancelFabric := context.WithCancel(context.Background()) + defer cancelFabric() + go RunWithOptions(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig(), RunOptions{StageController: updaterSvc}) + bringUp(t, cm5) + + sendMcuUpdateArtefact(t, cm5, 
"xfer-mismatch", payload, 4) + if _, ok := memMD.StagedDescriptor(); !ok { + t.Fatal("staged descriptor not persisted before mismatch commit") + } + + payloadReply := requestUpdaterForFabricTest(t, caller, updater.TopicCommitRPC, updater.CommitRequest{ExpectedImageID: "mcu-image-other"}) + reply, ok := payloadReply.(updater.Reply) + if !ok || reply.OK || reply.Error != updater.ErrImageIDMismatch { + t.Fatalf("commit mismatch reply = %#v, want image_id_mismatch", payloadReply) + } + can, reboot := app.counts() + if can != 0 || reboot != 0 { + t.Fatalf("applier called despite image mismatch: CanApply=%d ArmReboot=%d", can, reboot) + } + select { + case d := <-app.rebootCh: + t.Fatalf("unexpected reboot after mismatch: %+v", d) + default: + } +} + +func TestMCUUpdateWireDigestMismatchCancelsLeaseAndLeavesNoStagedImage(t *testing.T) { + b := newBus() + caller := b.NewConnection("caller") + observer := b.NewConnection("observer") + upSub := observer.Subscribe(updater.TopicUpdaterFact) + defer observer.Unsubscribe(upSub) + memMD := updater.NewMemoryMetadata() + cancelUpdater, updaterSvc := runUpdaterForFabricTest(t, b, updater.Options{ + Verifier: &integrationVerifier{}, + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancelUpdater() + prepareUpdaterForFabricTest(t, caller) + + cm5, mcu := pipePair() + ctx, cancelFabric := context.WithCancel(context.Background()) + defer cancelFabric() + go RunWithOptions(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig(), RunOptions{StageController: updaterSvc}) + bringUp(t, cm5) + + payload := []byte("abcd") + bogusDigest := strings.Repeat("0", 8) + sendMsg(t, cm5, protoXferBegin{ + Type: msgXferBegin, + XferID: "xfer-digest-mismatch-real", + Target: updater.TargetUpdaterMain, + Size: uint32(len(payload)), + DigestAlg: updater.DigestAlgXXHash32, + Digest: bogusDigest, + }) + readTransferReady(t, cm5, "xfer-digest-mismatch-real", 0) + sendMsg(t, cm5, xferChunk("xfer-digest-mismatch-real", 0, payload)) 
+ readTransferNeed(t, cm5, "xfer-digest-mismatch-real", uint32(len(payload))) + sendMsg(t, cm5, protoXferCommit{ + Type: msgXferCommit, + XferID: "xfer-digest-mismatch-real", + Size: uint32(len(payload)), + DigestAlg: updater.DigestAlgXXHash32, + Digest: bogusDigest, + }) + readTransferAbort(t, cm5, "xfer-digest-mismatch-real", "digest_mismatch") + + failed := waitUpdaterFactForFabricTest(t, upSub, func(f updater.UpdaterFact) bool { return f.State == updater.StateFailed }) + if got := strValueFabric(failed.LastError); got != "digest_mismatch" { + t.Fatalf("last_error = %q, want digest_mismatch", got) + } + if _, ok := memMD.StagedDescriptor(); ok { + t.Fatal("digest mismatch left a staged descriptor") + } + payloadReply := requestUpdaterForFabricTest(t, caller, updater.TopicCommitRPC, updater.CommitRequest{}) + reply, ok := payloadReply.(updater.Reply) + if !ok || reply.OK || reply.Error != updater.ErrNoStagedImage { + t.Fatalf("commit after digest mismatch = %#v, want no_staged_image", payloadReply) + } +} + +func strValueFabric(p *string) string { + if p == nil { + return "" + } + return *p +} diff --git a/services/fabric/protocol.go b/services/fabric/protocol.go index cf2b5a5..6629e5b 100644 --- a/services/fabric/protocol.go +++ b/services/fabric/protocol.go @@ -1,6 +1,9 @@ package fabric -import "encoding/json" +import ( + "encoding/json" + "strconv" +) // ---- Wire message type identifiers ---- // @@ -73,11 +76,10 @@ type protoUnretain struct { } type protoCall struct { - Type string `json:"type"` - ID string `json:"id"` - Topic []string `json:"topic"` - Payload json.RawMessage `json:"payload"` - TimeoutMs int `json:"timeout_ms"` + Type string `json:"type"` + ID string `json:"id"` + Topic []string `json:"topic"` + Payload json.RawMessage `json:"payload"` } // protoReply mirrors Lua's reply frame: {type, id, ok, payload, err}. 
The Go @@ -118,6 +120,7 @@ type protoXferChunk struct { Offset uint32 `json:"offset"` Data string `json:"data"` ChunkDigest string `json:"chunk_digest"` + LineLen int `json:"-"` } // protoXferNeed (control) acks the MCU's expected next byte offset. @@ -163,6 +166,366 @@ func marshal(v any) []byte { return append(b, '\n') } +// marshalHelloAck returns a compact hello_ack frame without using reflection. +func marshalHelloAck(sid, node string) []byte { + b := make([]byte, 0, 96) + b = append(b, `{"type":"hello_ack","proto":"fabric-jsonl/1","sid":"`...) + b = appendJSONString(b, sid) + b = append(b, `","node":"`...) + b = appendJSONString(b, node) + b = append(b, `"}`...) + return append(b, '\n') +} + +func marshalPing(sid string) []byte { return marshalSIDControl(msgPing, sid) } +func marshalPong(sid string) []byte { return marshalSIDControl(msgPong, sid) } + +func marshalSIDControl(typ, sid string) []byte { + b := make([]byte, 0, 48+len(sid)) + b = append(b, `{"type":"`...) + b = appendJSONString(b, typ) + b = append(b, `","sid":"`...) + b = appendJSONString(b, sid) + b = append(b, `"}`...) + return append(b, '\n') +} + +func marshalReplyErr(id, errText string) []byte { + b := make([]byte, 0, 64+len(id)+len(errText)) + b = append(b, `{"type":"reply","id":"`...) + b = appendJSONString(b, id) + b = append(b, `","ok":false,"err":"`...) + b = appendJSONString(b, errText) + b = append(b, `"}`...) + return append(b, '\n') +} + +func marshalReplyOKRaw(id string, payload json.RawMessage) []byte { + b := make([]byte, 0, 48+len(id)+len(payload)) + b = append(b, `{"type":"reply","id":"`...) + b = appendJSONString(b, id) + b = append(b, `","ok":true`...) + if len(payload) > 0 { + b = append(b, `,"payload":`...) + b = append(b, payload...) + } + b = append(b, `}`...) 
+ return append(b, '\n') +} + +func marshalXferReady(id string) []byte { return marshalXferControl(msgXferReady, id, 0, false, "") } +func marshalXferNeed(id string, next uint32) []byte { + return marshalXferControl(msgXferNeed, id, next, true, "") +} +func marshalXferDone(id string) []byte { return marshalXferControl(msgXferDone, id, 0, false, "") } +func marshalXferAbort(id, reason string) []byte { + return marshalXferControl(msgXferAbort, id, 0, false, reason) +} + +func marshalXferControl(typ, id string, next uint32, hasNext bool, errText string) []byte { + b := make([]byte, 0, 80+len(id)+len(errText)) + b = append(b, `{"type":"`...) + b = appendJSONString(b, typ) + b = append(b, `","xfer_id":"`...) + b = appendJSONString(b, id) + b = append(b, `"`...) + if hasNext { + b = append(b, `,"next":`...) + b = strconv.AppendUint(b, uint64(next), 10) + } + if errText != "" { + b = append(b, `,"err":"`...) + b = appendJSONString(b, errText) + b = append(b, `"`...) + } + b = append(b, `}`...) + return append(b, '\n') +} + +func appendJSONString(dst []byte, s string) []byte { + for i := 0; i < len(s); i++ { + c := s[i] + if c == '\\' || c == '"' { + dst = append(dst, '\\') + } + dst = append(dst, c) + } + return dst +} + +// protoTopRaw returns the complete top-level JSON value for field. 
+func protoTopRaw(line []byte, field string) (json.RawMessage, bool) { + i, ok := findTopJSONValue(line, field) + if !ok { + return nil, false + } + end, ok := skipJSONValue(line, i) + if !ok || end < i || end > len(line) { + return nil, false + } + out := make(json.RawMessage, end-i) + copy(out, line[i:end]) + return out, true +} + +func protoTopUint32(line []byte, field string) (uint32, bool) { + i, ok := findTopJSONValue(line, field) + if !ok || i >= len(line) || line[i] < '0' || line[i] > '9' { + return 0, false + } + var v uint32 + for i < len(line) && line[i] >= '0' && line[i] <= '9' { + d := uint32(line[i] - '0') + if v > (1<<32-1-d)/10 { + return 0, false + } + v = v*10 + d + i++ + } + return v, true +} + +func findTopJSONValue(line []byte, field string) (int, bool) { + n := len(line) + i := skipJSONSpace(line, 0) + if i >= n || line[i] != '{' { + return 0, false + } + i++ + for { + i = skipJSONSpace(line, i) + if i >= n { + return 0, false + } + switch line[i] { + case '}': + return 0, false + case ',': + i++ + continue + } + if line[i] != '"' { + return 0, false + } + keyStart := i + 1 + keyEnd, ok := scanJSONString(line, i) + if !ok { + return 0, false + } + i = keyEnd + i = skipJSONSpace(line, i) + if i >= n || line[i] != ':' { + return 0, false + } + i++ + i = skipJSONSpace(line, i) + if i >= n { + return 0, false + } + if jsonKeyEquals(line[keyStart:keyEnd-1], field) { + return i, true + } + i, ok = skipJSONValue(line, i) + if !ok { + return 0, false + } + } +} + +func topFieldsAllowed(line []byte, allowed ...string) bool { + n := len(line) + i := skipJSONSpace(line, 0) + if i >= n || line[i] != '{' { + return false + } + i++ + for { + i = skipJSONSpace(line, i) + if i >= n { + return false + } + if line[i] == '}' { + i++ + i = skipJSONSpace(line, i) + return i == n + } + if line[i] == ',' { + i++ + continue + } + if line[i] != '"' { + return false + } + keyStart := i + 1 + keyEnd, ok := scanJSONString(line, i) + if !ok { + return false + } + if 
!jsonKeyInAllowed(line[keyStart:keyEnd-1], allowed) { + return false + } + i = skipJSONSpace(line, keyEnd) + if i >= n || line[i] != ':' { + return false + } + i++ + i = skipJSONSpace(line, i) + if i >= n { + return false + } + i, ok = skipJSONValue(line, i) + if !ok { + return false + } + } +} + +func jsonKeyInAllowed(key []byte, allowed []string) bool { + for _, field := range allowed { + if jsonKeyEquals(key, field) { + return true + } + } + return false +} + +func decodeHelloFast(line []byte) (protoHello, bool) { + if !topFieldsAllowed(line, "type", "proto", "sid", "node", "identity", "auth") { + return protoHello{}, false + } + var msg protoHello + msg.Type = protoTopString(line, "type") + msg.Proto = protoTopString(line, "proto") + msg.SID = protoTopString(line, "sid") + msg.Node = protoTopString(line, "node") + return msg, msg.Type == msgHello && msg.Proto != "" && msg.SID != "" +} + +func decodePingFast(line []byte, want string) (protoPing, bool) { + if !topFieldsAllowed(line, "type", "sid") { + return protoPing{}, false + } + var msg protoPing + msg.Type = protoTopString(line, "type") + msg.SID = protoTopString(line, "sid") + return msg, msg.Type == want +} + +func decodePongFast(line []byte) (protoPong, bool) { + if !topFieldsAllowed(line, "type", "sid") { + return protoPong{}, false + } + var msg protoPong + msg.Type = protoTopString(line, "type") + msg.SID = protoTopString(line, "sid") + return msg, msg.Type == msgPong +} + +func decodeCallFast(line []byte) (protoCall, bool) { + if !topFieldsAllowed(line, "type", "id", "topic", "payload") { + return protoCall{}, false + } + var msg protoCall + msg.Type = protoTopString(line, "type") + msg.ID = protoTopString(line, "id") + msg.Topic = protoTopStringArray(line, "topic") + if payload, ok := protoTopRaw(line, "payload"); ok { + msg.Payload = payload + } + return msg, msg.Type == msgCall && msg.ID != "" && len(msg.Topic) > 0 +} + +func decodeXferBeginFast(line []byte) (protoXferBegin, bool) { + if 
!topFieldsAllowed(line, "type", "xfer_id", "target", "size", "digest_alg", "digest", "meta") { + return protoXferBegin{}, false + } + var msg protoXferBegin + msg.Type = protoTopString(line, "type") + msg.XferID = protoTopString(line, "xfer_id") + msg.Target = protoTopString(line, "target") + msg.Size, _ = protoTopUint32(line, "size") + msg.DigestAlg = protoTopString(line, "digest_alg") + msg.Digest = protoTopString(line, "digest") + if meta, ok := protoTopRaw(line, "meta"); ok { + msg.Meta = meta + } + return msg, msg.Type == msgXferBegin && msg.XferID != "" +} + +func decodeXferChunkFast(line []byte) (protoXferChunk, bool) { + if !topFieldsAllowed(line, "type", "xfer_id", "offset", "data", "chunk_digest") { + return protoXferChunk{}, false + } + var msg protoXferChunk + msg.Type = protoTopString(line, "type") + msg.XferID = protoTopString(line, "xfer_id") + msg.Offset, _ = protoTopUint32(line, "offset") + msg.Data = protoTopString(line, "data") + msg.ChunkDigest = protoTopString(line, "chunk_digest") + msg.LineLen = len(line) + return msg, msg.Type == msgXferChunk && msg.XferID != "" +} + +func decodeXferCommitFast(line []byte) (protoXferCommit, bool) { + if !topFieldsAllowed(line, "type", "xfer_id", "size", "digest_alg", "digest") { + return protoXferCommit{}, false + } + var msg protoXferCommit + msg.Type = protoTopString(line, "type") + msg.XferID = protoTopString(line, "xfer_id") + msg.Size, _ = protoTopUint32(line, "size") + msg.DigestAlg = protoTopString(line, "digest_alg") + msg.Digest = protoTopString(line, "digest") + return msg, msg.Type == msgXferCommit && msg.XferID != "" +} + +func decodeXferAbortFast(line []byte) (protoXferAbort, bool) { + if !topFieldsAllowed(line, "type", "xfer_id", "err") { + return protoXferAbort{}, false + } + var msg protoXferAbort + msg.Type = protoTopString(line, "type") + msg.XferID = protoTopString(line, "xfer_id") + msg.Err = protoTopString(line, "err") + return msg, msg.Type == msgXferAbort && msg.XferID != "" +} + +func 
protoTopStringArray(line []byte, field string) []string { + i, ok := findTopJSONValue(line, field) + if !ok || i >= len(line) || line[i] != '[' { + return nil + } + i++ + out := make([]string, 0, 8) + for { + i = skipJSONSpace(line, i) + if i >= len(line) { + return nil + } + if line[i] == ']' { + return out + } + if line[i] == ',' { + i++ + continue + } + if line[i] != '"' { + return nil + } + start := i + 1 + end, ok := scanJSONString(line, i) + if !ok { + return nil + } + value, ok := decodeJSONStringValue(line[start : end-1]) + if !ok { + return nil + } + out = append(out, value) + i = end + } +} + // protoType extracts the wire-discriminator "type" field from a JSON // envelope via a depth-aware scan. We avoid json.Unmarshal here because // TinyGo's reflect path was observed silently leaving the field empty @@ -228,7 +591,11 @@ func protoTopString(line []byte, field string) string { if !ok { return "" } - return string(line[valStart : valEnd-1]) + value, ok := decodeJSONStringValue(line[valStart : valEnd-1]) + if !ok { + return "" + } + return value } i, ok = skipJSONValue(line, i) if !ok { @@ -237,6 +604,47 @@ func protoTopString(line []byte, field string) string { } } +func decodeJSONStringValue(raw []byte) (string, bool) { + for _, c := range raw { + if c == '\\' { + return decodeEscapedJSONStringValue(raw) + } + } + return string(raw), true +} + +func decodeEscapedJSONStringValue(raw []byte) (string, bool) { + out := make([]byte, 0, len(raw)) + for i := 0; i < len(raw); i++ { + c := raw[i] + if c != '\\' { + out = append(out, c) + continue + } + if i+1 >= len(raw) { + return "", false + } + i++ + switch raw[i] { + case '"', '\\', '/': + out = append(out, raw[i]) + case 'b': + out = append(out, '\b') + case 'f': + out = append(out, '\f') + case 'n': + out = append(out, '\n') + case 'r': + out = append(out, '\r') + case 't': + out = append(out, '\t') + default: + return "", false + } + } + return string(out), true +} + func jsonKeyEquals(key []byte, field 
string) bool { if len(key) != len(field) { return false diff --git a/services/fabric/selftest.go b/services/fabric/selftest.go new file mode 100644 index 0000000..807c3ca --- /dev/null +++ b/services/fabric/selftest.go @@ -0,0 +1,443 @@ +//go:build !tinygo || fabric_uart_selftest + +package fabric + +import ( + "context" + "encoding/base64" + "errors" + "strconv" + "time" + + "devicecode-go/bus" + "devicecode-go/services/updater" + "devicecode-go/x/shmring" + "devicecode-go/x/xxhash" +) + +// UARTSelfTestOptions describes an opt-in in-process Fabric transfer test. It +// uses the same newline JSONL and shmring transport shape as the UART session, +// but cross-connects the rings in memory so a board can exercise Fabric and the +// updater stage-controller boundary without an external serial peer. +type UARTSelfTestOptions struct { + Conn *bus.Connection + StageController StageController + PayloadSize int + ChunkSize int + Timeout time.Duration +} + +type UARTSelfTestResult struct { + PayloadSize uint32 + ChunkSize uint32 + Digest string + XferID string +} + +func (r UARTSelfTestResult) OK() bool { return r.XferID != "" && r.PayloadSize > 0 } + +const defaultSelfTestPayloadSize = 1024 +const defaultSelfTestChunkSize = 256 +const defaultSelfTestTimeout = 10 * time.Second + +// Keep the self-test's large scratch areas out of goroutine stacks. The normal +// hardware path already keeps the MCU Fabric buffer package-level in the +// Reactor. The self-test is single-shot and opt-in, so package-level scratch is +// acceptable and avoids invalidating the 3 KB stack gate. +var selfTestMCUBuffers FabricBuffers +var selfTestPeerLine [maxLineLen]byte +var selfTestB64 [maxChunkBase64Len]byte + +// RunUARTSelfTest starts an MCU Fabric session and a tiny in-process CM5 peer +// connected by cross-wired shmring transports. It performs prepare-update and a +// transfer to updater/main, then stops before commit-update/reboot. 
This is a +// hardware smoke gate for Fabric framing and the updater stage-controller seam; +// it is not a production A/B flash test. +func RunUARTSelfTest(ctx context.Context, opts UARTSelfTestOptions) (UARTSelfTestResult, error) { + if opts.Conn == nil { + return UARTSelfTestResult{}, errors.New("missing_bus_connection") + } + if opts.StageController == nil { + return UARTSelfTestResult{}, errors.New("missing_stage_controller") + } + payloadSize := opts.PayloadSize + if payloadSize <= 0 { + payloadSize = defaultSelfTestPayloadSize + } + chunkSize := opts.ChunkSize + if chunkSize <= 0 { + chunkSize = defaultSelfTestChunkSize + } + if chunkSize > payloadSize { + chunkSize = payloadSize + } + timeout := opts.Timeout + if timeout <= 0 { + timeout = defaultSelfTestTimeout + } + + // Cross-wired UART-shaped rings: + // peer TX -> MCU RX on a + // MCU TX -> peer RX on b + // The rings only carry this self-test's small line frames, so 2048 bytes per + // direction is enough and avoids permanently retaining another pair of full + // UART-sized rings on the MCU. 
+ a := shmring.New(2048) + b := shmring.New(2048) + mcuTr := NewShmringTransportWithBuffers(a, b, &selfTestMCUBuffers) + peer := newUARTSelfTestPeer(b, a, "cm5-selftest-sid") + + testCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + defer mcuTr.Close() + defer peer.Close() + go func() { + <-testCtx.Done() + _ = mcuTr.Close() + _ = peer.Close() + }() + + fabricDone := make(chan struct{}) + go func() { + defer close(fabricDone) + RunWithOptions(testCtx, mcuTr, opts.Conn, "mcu", "bigbox-cm5", DefaultLinkConfig(), RunOptions{Buffers: &selfTestMCUBuffers, StageController: opts.StageController}) + }() + + if err := peer.writeHello(); err != nil { + return UARTSelfTestResult{}, err + } + ackLine, err := peer.waitType(msgHelloAck, "") + if err != nil { + return UARTSelfTestResult{}, err + } + if protoTopString(ackLine, "node") != "mcu" || protoTopString(ackLine, "sid") == "" { + return UARTSelfTestResult{}, errors.New("bad_hello_ack") + } + + prepareID := "selftest-prepare-1" + if err := peer.writePrepare(prepareID); err != nil { + return UARTSelfTestResult{}, err + } + replyLine, err := peer.waitReply(prepareID) + if err != nil { + return UARTSelfTestResult{}, err + } + ok, okField := protoTopBool(replyLine, "ok") + if !okField || !ok { + errText := protoTopString(replyLine, "err") + if errText != "" { + return UARTSelfTestResult{}, errors.New("prepare_failed:" + errText) + } + return UARTSelfTestResult{}, errors.New("prepare_failed") + } + + payload := selfTestPayload(payloadSize) + digest := selfTestXXHash(payload) + xferID := "selftest-xfer-1" + if err := peer.writeXferBegin(xferID, uint32(len(payload)), digest); err != nil { + return UARTSelfTestResult{}, err + } + if _, err := peer.waitType(msgXferReady, xferID); err != nil { + return UARTSelfTestResult{}, err + } + if err := peer.waitNeed(xferID, 0); err != nil { + return UARTSelfTestResult{}, err + } + + for off := 0; off < len(payload); off += chunkSize { + end := off + chunkSize + if end > 
len(payload) { + end = len(payload) + } + chunk := payload[off:end] + if err := peer.writeXferChunk(xferID, uint32(off), chunk); err != nil { + return UARTSelfTestResult{}, err + } + if err := peer.waitNeed(xferID, uint32(end)); err != nil { + return UARTSelfTestResult{}, err + } + } + if err := peer.writeXferCommit(xferID, uint32(len(payload)), digest); err != nil { + return UARTSelfTestResult{}, err + } + if _, err := peer.waitType(msgXferDone, xferID); err != nil { + return UARTSelfTestResult{}, err + } + + cancel() + select { + case <-fabricDone: + case <-time.After(200 * time.Millisecond): + } + + return UARTSelfTestResult{PayloadSize: uint32(len(payload)), ChunkSize: uint32(chunkSize), Digest: digest, XferID: xferID}, nil +} + +type uartSelfTestPeer struct { + rx *shmring.Ring + tx *shmring.Ring + ctx context.Context + cancel context.CancelFunc + lineBuf *[maxLineLen]byte + n int + over bool + sid string +} + +func newUARTSelfTestPeer(rx, tx *shmring.Ring, sid string) *uartSelfTestPeer { + ctx, cancel := context.WithCancel(context.Background()) + return &uartSelfTestPeer{rx: rx, tx: tx, ctx: ctx, cancel: cancel, lineBuf: &selfTestPeerLine, sid: sid} +} + +func (p *uartSelfTestPeer) Close() error { + p.cancel() + return nil +} + +func (p *uartSelfTestPeer) writeHello() error { + return p.writeLineBytes([]byte(`{"type":"hello","proto":"fabric-jsonl/1","sid":"cm5-selftest-sid","node":"bigbox-cm5"}`)) +} + +func (p *uartSelfTestPeer) writePrepare(id string) error { + b := make([]byte, 0, 192) + b = append(b, `{"type":"call","id":"`...) + b = appendJSONString(b, id) + b = append(b, `","topic":["cap","self","updater","main","rpc","prepare-update"],"payload":{"job_id":"selftest-job","target":"mcu","expected_image_id":"hwtest-image"}}`...) + return p.writeLineBytes(b) +} + +func (p *uartSelfTestPeer) writeXferBegin(id string, size uint32, digest string) error { + b := make([]byte, 0, 192) + b = append(b, `{"type":"xfer_begin","xfer_id":"`...) 
+ b = appendJSONString(b, id) + b = append(b, `","target":"`...) + b = appendJSONString(b, updater.TargetUpdaterMain) + b = append(b, `","size":`...) + b = strconv.AppendUint(b, uint64(size), 10) + b = append(b, `,"digest_alg":"`...) + b = appendJSONString(b, updater.DigestAlgXXHash32) + b = append(b, `","digest":"`...) + b = appendJSONString(b, digest) + b = append(b, `","meta":{"source":"mcu-selftest"}}`...) + return p.writeLineBytes(b) +} + +func (p *uartSelfTestPeer) writeXferChunk(id string, off uint32, chunk []byte) error { + n := base64.RawURLEncoding.EncodedLen(len(chunk)) + if n > len(selfTestB64) { + return ErrLineTooLong + } + base64.RawURLEncoding.Encode(selfTestB64[:n], chunk) + chunkDigest := selfTestXXHash(chunk) + b := make([]byte, 0, 160+n) + b = append(b, `{"type":"xfer_chunk","xfer_id":"`...) + b = appendJSONString(b, id) + b = append(b, `","offset":`...) + b = strconv.AppendUint(b, uint64(off), 10) + b = append(b, `,"data":"`...) + b = append(b, selfTestB64[:n]...) + b = append(b, `","chunk_digest":"`...) + b = appendJSONString(b, chunkDigest) + b = append(b, `"}`...) + return p.writeLineBytes(b) +} + +func (p *uartSelfTestPeer) writeXferCommit(id string, size uint32, digest string) error { + b := make([]byte, 0, 144) + b = append(b, `{"type":"xfer_commit","xfer_id":"`...) + b = appendJSONString(b, id) + b = append(b, `","size":`...) + b = strconv.AppendUint(b, uint64(size), 10) + b = append(b, `,"digest_alg":"`...) + b = appendJSONString(b, updater.DigestAlgXXHash32) + b = append(b, `","digest":"`...) + b = appendJSONString(b, digest) + b = append(b, `"}`...) 
+ return p.writeLineBytes(b) +} + +func (p *uartSelfTestPeer) waitType(wantType, wantXfer string) ([]byte, error) { + for { + line, err := p.readLine() + if err != nil { + return nil, err + } + mt := protoType(line) + if mt == msgXferAbort { + id := protoTopString(line, "xfer_id") + if wantXfer == "" || id == wantXfer { + return nil, errors.New("xfer_abort:" + protoTopString(line, "err")) + } + } + if mt != wantType { + continue + } + if wantXfer == "" || protoTopString(line, "xfer_id") == wantXfer { + return line, nil + } + } +} + +func (p *uartSelfTestPeer) waitReply(id string) ([]byte, error) { + for { + line, err := p.readLine() + if err != nil { + return nil, err + } + if protoType(line) == msgReply && protoTopString(line, "id") == id { + return line, nil + } + } +} + +func (p *uartSelfTestPeer) waitNeed(id string, next uint32) error { + for { + line, err := p.waitType(msgXferNeed, id) + if err != nil { + return err + } + got, ok := protoTopUint32(line, "next") + if ok && got == next { + return nil + } + if ok { + return errors.New("unexpected_xfer_need") + } + } +} + +func (p *uartSelfTestPeer) readLine() ([]byte, error) { + p.n = 0 + p.over = false + for { + p1, p2 := p.rx.ReadAcquire() + if len(p1)+len(p2) == 0 { + select { + case <-p.ctx.Done(): + return nil, errors.New("transport_closed") + case <-p.rx.Readable(): + continue + } + } + if idx := findByte(p1, '\n'); idx >= 0 { + if !p.over && !p.appendLineChunk(p1[:idx]) { + p.over = true + } + p.rx.ReadRelease(idx + 1) + return p.finishLine() + } + if !p.over && !p.appendLineChunk(p1) { + p.over = true + } + if idx := findByte(p2, '\n'); idx >= 0 { + if !p.over && !p.appendLineChunk(p2[:idx]) { + p.over = true + } + p.rx.ReadRelease(len(p1) + idx + 1) + return p.finishLine() + } + if !p.over && !p.appendLineChunk(p2) { + p.over = true + } + p.rx.ReadRelease(len(p1) + len(p2)) + } +} + +func (p *uartSelfTestPeer) appendLineChunk(b []byte) bool { + if len(b) == 0 { + return true + } + if p.n+len(b) > 
len(p.lineBuf) { + p.n = 0 + return false + } + copy(p.lineBuf[p.n:], b) + p.n += len(b) + return true +} + +func (p *uartSelfTestPeer) finishLine() ([]byte, error) { + if p.over { + p.n = 0 + p.over = false + return nil, ErrLineTooLong + } + return p.lineBuf[:p.n], nil +} + +func (p *uartSelfTestPeer) writeLineBytes(data []byte) error { + if len(data) > maxLineLen { + return ErrLineTooLong + } + if err := p.writeBytes(data); err != nil { + return err + } + return p.writeByte('\n') +} + +func (p *uartSelfTestPeer) writeBytes(data []byte) error { + written := 0 + for written < len(data) { + p1, p2 := p.tx.WriteAcquire() + if len(p1)+len(p2) == 0 { + select { + case <-p.ctx.Done(): + return errors.New("transport_closed") + case <-p.tx.Writable(): + continue + } + } + remaining := data[written:] + n := copy(p1, remaining) + remaining = remaining[n:] + if len(remaining) > 0 && len(p2) > 0 { + n += copy(p2, remaining) + } + p.tx.WriteCommit(n) + written += n + } + return nil +} + +func (p *uartSelfTestPeer) writeByte(c byte) error { + for { + p1, _ := p.tx.WriteAcquire() + if len(p1) == 0 { + select { + case <-p.ctx.Done(): + return errors.New("transport_closed") + case <-p.tx.Writable(): + continue + } + } + p1[0] = c + p.tx.WriteCommit(1) + return nil + } +} + +func selfTestPayload(n int) []byte { + out := make([]byte, n) + var x uint32 = 0x12345678 + for i := range out { + x = x*1664525 + 1013904223 + out[i] = byte(x >> 24) + } + return out +} + +func selfTestXXHash(data []byte) string { return xxhashHex(xxhash.Sum32(data, 0)) } + +func protoTopBool(line []byte, field string) (bool, bool) { + i, ok := findTopJSONValue(line, field) + if !ok { + return false, false + } + if i+4 <= len(line) && string(line[i:i+4]) == "true" { + return true, true + } + if i+5 <= len(line) && string(line[i:i+5]) == "false" { + return false, true + } + return false, false +} diff --git a/services/fabric/selftest_test.go b/services/fabric/selftest_test.go new file mode 100644 index 
0000000..d767209 --- /dev/null +++ b/services/fabric/selftest_test.go @@ -0,0 +1,50 @@ +package fabric + +import ( + "context" + "io" + "strings" + "testing" + "time" + + "devicecode-go/bus" + "devicecode-go/services/updater" +) + +type selfTestAcceptVerifier struct{} + +func (selfTestAcceptVerifier) Verify(r io.Reader, sink updater.SlotSink) (updater.Manifest, error) { + n, err := io.Copy(sink, r) + if err != nil { + _ = sink.Abort() + return updater.Manifest{}, err + } + if err := sink.Commit(); err != nil { + return updater.Manifest{}, err + } + return updater.Manifest{Version: "selftest", BuildID: "host", ImageID: "hwtest-image", PayloadSHA256: strings.Repeat("a", 64), PayloadLength: uint32(n)}, nil +} + +func TestRunUARTSelfTest(t *testing.T) { + b := bus.NewBus(8, "+", "#") + conn := b.NewConnection("fabric-selftest") + updaterConn := b.NewConnection("updater") + mem := updater.NewMemoryMetadata() + svc := updater.New(updater.Options{ + Conn: updaterConn, + Verifier: selfTestAcceptVerifier{}, + Metadata: mem, + MetadataWrite: mem, + Identity: updater.Identity{Version: "test", Build: "build", ImageID: "old-image"}, + }) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go svc.Run(ctx) + res, err := RunUARTSelfTest(ctx, UARTSelfTestOptions{Conn: conn, StageController: svc, PayloadSize: 512, ChunkSize: 128, Timeout: 3 * time.Second}) + if err != nil { + t.Fatalf("RunUARTSelfTest: %v", err) + } + if !res.OK() || res.PayloadSize != 512 || res.ChunkSize != 128 { + t.Fatalf("bad result: %+v", res) + } +} diff --git a/services/fabric/session.go b/services/fabric/session.go index 133ba30..ff5c986 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -30,7 +30,12 @@ const ( statusReady = "ready" statusOpening = "opening" statusDown = "down" - lineQueueSize = 32 + // lineQueueSize is deliberately tiny on MCU builds. 
The UART RX ring is + // already the byte-level shock absorber; keeping only two fully decoded + // JSONL frames avoids reserving 32 * maxLineLen bytes of static RAM. This + // preserves allocation discipline without starving the reactor behind a + // large preallocated line queue. + lineQueueSize = 2 ) // ---- timeouts (local policy) ---- @@ -49,23 +54,8 @@ const ( // exportMaxPerTick caps the total export messages sent per drain // cycle across all subscriptions, keeping UART throughput within // the 115200-baud link capacity. - exportMaxPerTick = 1 - exportTickInterval = 50 * time.Millisecond - errPayloadMarshal = "payload_marshal_failed" - - // Temporary transport recovery policy: the USB/UART path used during OTA - // can echo MCU-originated JSONL back into the MCU receiver. If exported - // retained state is in flight while CM5 starts an OTA transfer, the echoed - // line can contain CM5's xfer_begin spliced into the middle of the state pub. - // Hold exports quiet from prepare until either xfer_begin arrives or this - // window expires. Revisit after CM5 update-admission hardening so this does - // not become OTA semantics. - transferPrepareQuiet = 10 * time.Second - // Temporary transport recovery policy: keep telemetry/state exports quiet - // long enough for the host to send the follow-up updater commit call after - // xfer_done. On echo-prone UART links, retained export backlog can otherwise - // splice into the commit JSONL frame. 
- transferCompleteQuiet = 10 * time.Second + exportMaxPerTick = 1 + errPayloadMarshal = "payload_marshal_failed" ) // ---- link reasons and error strings ---- @@ -86,13 +76,12 @@ const ( // ---- types ---- type inboundCall struct { - id string - topic []string - localTopic bus.Topic - payload json.RawMessage - sub *bus.Subscription - deadline time.Time - transferPrepare bool + id string + topic []string + localTopic bus.Topic + payload json.RawMessage + sub *bus.Subscription + deadline time.Time } type outboundCall struct { @@ -103,26 +92,37 @@ type outboundCall struct { type readResult struct { line []byte + slot int err error } type linkStatePayload struct { - LinkID string `json:"link_id"` - Status string `json:"status"` - Ready bool `json:"ready"` - Established bool `json:"established"` - PeerID string `json:"peer_id"` - LocalSID string `json:"local_sid"` - PeerSID string `json:"peer_sid,omitempty"` - PeerNode string `json:"peer_node,omitempty"` - PeerProto string `json:"peer_proto,omitempty"` - LastRxUnixMilli int64 `json:"last_rx_unix_ms,omitempty"` - LastTxUnixMilli int64 `json:"last_tx_unix_ms,omitempty"` - LastPongUnixMilli int64 `json:"last_pong_unix_ms,omitempty"` - InboundCalls int `json:"inbound_calls"` - OutboundCalls int `json:"outbound_calls"` - Reason string `json:"reason,omitempty"` - Err string `json:"err,omitempty"` + LinkID string `json:"link_id"` + Status string `json:"status"` + Ready bool `json:"ready"` + Established bool `json:"established"` + PeerID string `json:"peer_id"` + LocalSID string `json:"local_sid"` + PeerSID string `json:"peer_sid,omitempty"` + PeerNode string `json:"peer_node,omitempty"` + PeerProto string `json:"peer_proto,omitempty"` + LastRxUnixMilli int64 `json:"last_rx_unix_ms,omitempty"` + LastTxUnixMilli int64 `json:"last_tx_unix_ms,omitempty"` + LastPongUnixMilli int64 `json:"last_pong_unix_ms,omitempty"` + InboundCalls int `json:"inbound_calls"` + OutboundCalls int `json:"outbound_calls"` + Reason string 
`json:"reason,omitempty"` + Err string `json:"err,omitempty"` + Counters FabricCounters `json:"counters"` +} + +// FabricLinkObservation gives other in-process services a small, typed way to +// observe Fabric readiness without JSON-probing this package's private retained +// payload shape. The payload remains package-private so the wire/schema contract +// is still centralised here, but Telemetry and Updater can avoid reflection on +// TinyGo's hot path. +func (p linkStatePayload) FabricLinkObservation() (ready bool, peerSID string, localSID string) { + return p.Ready, p.PeerSID, p.LocalSID } // session manages the fabric link state machine over a Transport. @@ -147,10 +147,13 @@ type session struct { lastTxAt time.Time lastPongAt time.Time exportReadyAt time.Time + exportDrainAt time.Time exportsEnabled bool criticalExportSubs []*bus.Subscription criticalExportReplayPending []bool + criticalExportPendingMsgs []*bus.Message + exportPendingMsgs []*bus.Message exportSubs []*bus.Subscription exportCallSubs []*bus.Subscription inboundCalls []*inboundCall @@ -164,9 +167,13 @@ type session struct { rpcReady bool // bridge replay complete; gates linkStatePayload.Ready incomingTransfer *incomingTransfer completedTransfers []completedTransfer - transferQuietUntil time.Time - transferQuietReason string + pendingTargetCall *pendingTargetCall beginTransfer func(transferMeta) (transferSink, error) + stageController StageController + buffers *FabricBuffers + counters FabricCounters + busSubs *bus.SubscriptionSet + ctx context.Context } func (s *session) log(msg string) { @@ -186,65 +193,27 @@ func (s *session) logKV(msg, key, value string) { // run is the main loop. Blocks until ctx is cancelled. 
func (s *session) run(ctx context.Context) { s.cfg.applyDefaults() + s.buffers = ensureFabricBuffers(s.buffers) + s.ctx = ctx + s.busSubs = s.conn.NewSubscriptionSet() lines := make(chan readResult, lineQueueSize) + freeSlots := make(chan int, lineQueueSize) + for i := 0; i < lineQueueSize; i++ { + freeSlots <- i + } - go func() { - defer close(lines) - lastLineAt := time.Now() - for { - started := time.Now() - line, err := s.tr.ReadLine() - now := time.Now() - readDur := now.Sub(started) - sinceLine := now.Sub(lastLineAt) - if err != nil { - if errors.Is(err, ErrLineTooLong) { - otadiag.Event( - "[fabric-rx]", "read_error", otadiag.XferNone, - otadiag.KV("reason", "line_too_long"), - otadiag.KV("read_ms", int(readDur/time.Millisecond)), - otadiag.KV("since_line_ms", int(sinceLine/time.Millisecond)), - ) - s.log("oversized line dropped") - continue - } - otadiag.Event( - "[fabric-rx]", "read_error", otadiag.XferNone, - otadiag.KV("reason", err.Error()), - otadiag.KV("read_ms", int(readDur/time.Millisecond)), - otadiag.KV("since_line_ms", int(sinceLine/time.Millisecond)), - ) - select { - case lines <- readResult{err: err}: - case <-ctx.Done(): - } - return - } - t := protoType(line) - if shouldLogFabricRead(t, readDur, sinceLine) { - otadiag.Event( - "[fabric-rx]", "read_line", protoXferID(line), - otadiag.KV("type", t), - otadiag.KV("line_len", len(line)), - otadiag.KV("read_ms", int(readDur/time.Millisecond)), - otadiag.KV("since_line_ms", int(sinceLine/time.Millisecond)), - ) - } - lastLineAt = now - cp := make([]byte, len(line)) - copy(cp, line) - select { - case lines <- readResult{line: cp}: - case <-ctx.Done(): - return - } - } - }() + go s.readLoop(ctx, lines, freeSlots) defer s.tr.Close() + defer func() { + if s.busSubs != nil { + s.busSubs.Close() + } + }() defer s.teardownExports() defer s.teardownInbound() defer s.teardownOutbound(reasonLinkDown) + defer s.cancelTargetCall(reasonLinkDown) defer s.abortTransfer(reasonLinkDown) defer s.log("run stop") @@ 
-254,16 +223,21 @@ func (s *session) run(ctx context.Context) { waitTick := time.NewTicker(waitLogEvery) defer waitTick.Stop() - // Poll subscription channels periodically. Needed because select - // blocks until a line/timer fires; without this, exported bus - // messages and async call replies would sit in subscription channels. - exportTick := time.NewTicker(exportTickInterval) - defer exportTick.Stop() + // Bus subscriptions and pending transfer/updater operations wake the + // reactor directly. Timers below cover deadlines and periodic liveness only. + + pendingDeadline := time.NewTimer(time.Hour) + if !pendingDeadline.Stop() { + <-pendingDeadline.C + } + defer pendingDeadline.Stop() s.publishLinkState("", "") s.log("run start") for { + pendingAt, pendingOK := s.nextPendingDeadline(time.Now()) + pendingDeadlineCh := resetOptionalTimer(pendingDeadline, pendingAt, pendingOK) select { case <-ctx.Done(): return @@ -273,23 +247,49 @@ func (s *session) run(ctx context.Context) { return } if res.err != nil { + s.releaseReadSlot(freeSlots, res.slot) + if errors.Is(res.err, ErrLineTooLong) { + s.counters.RXLineTooLong++ + continue + } s.handleLinkDown(reasonTransportDown, res.err.Error()) return } + s.counters.RXLines++ + if fabricTraceEnabled { + println( + "[fabric]", "sid", s.localSID, + "rx_frame", + "type", protoType(res.line), + "xfer", protoXferID(res.line), + "len", len(res.line), + "line", tracePreview(res.line), + ) + } beforeRx := s.lastRxAt s.dispatch(res.line) + s.releaseReadSlot(freeSlots, res.slot) if s.lastRxAt.After(beforeRx) { resetTimer(stale, s.cfg.LivenessTimeout) } - case <-exportTick.C: - now := time.Now() - s.drainExports() - s.drainInbound(now) - s.drainOutbound(now) - s.checkTransferTimeout(now) - s.tickPing(now) - s.tickReady(now) + case res := <-s.pendingChunkReady(): + s.finishChunkWrite(time.Now(), res) + + case res := <-s.pendingCommitReady(): + s.finishTransferCommit(time.Now(), res) + + case rep, ok := <-s.pendingTargetReady(): + 
s.finishTargetReply(rep, ok) + + case <-pendingDeadlineCh: + s.handlePendingDeadline(time.Now()) + + case _, ok := <-s.busReady(): + if !ok { + return + } + s.drainBusEvents(time.Now()) case <-waitTick.C: s.logWaiting() @@ -304,6 +304,66 @@ func (s *session) run(ctx context.Context) { } } +func (s *session) readLoop(ctx context.Context, lines chan<- readResult, freeSlots <-chan int) { + defer close(lines) + lastLineAt := time.Now() + _ = lastLineAt + for { + var slot int + select { + case slot = <-freeSlots: + case <-ctx.Done(): + return + } + started := time.Now() + _ = started + buf := s.buffers.RXLines[slot][:] + n, err := s.readTransportLine(buf) + now := time.Now() + _ = now + if err != nil { + s.traceWireError("rx", "read_error", err.Error(), map[string]any{"slot": slot}) + select { + case lines <- readResult{slot: slot, err: err}: + case <-ctx.Done(): + return + } + if !errors.Is(err, ErrLineTooLong) { + return + } + continue + } + lastLineAt = now + select { + case lines <- readResult{line: buf[:n], slot: slot}: + case <-ctx.Done(): + return + } + } +} + +func (s *session) readTransportLine(dst []byte) (int, error) { + if tr, ok := s.tr.(boundedLineTransport); ok { + return tr.ReadLineInto(dst) + } + line, err := s.tr.ReadLine() + if err != nil { + return 0, err + } + if len(line) > len(dst) { + return 0, ErrLineTooLong + } + copy(dst, line) + return len(line), nil +} + +func (s *session) releaseReadSlot(freeSlots chan<- int, slot int) { + if slot < 0 { + return + } + freeSlots <- slot +} + func shouldLogFabricRead(msgType string, _, _ time.Duration) bool { switch msgType { case msgHello, msgHelloAck, msgCall, msgReply, msgXferBegin, msgXferCommit, msgXferAbort: @@ -312,6 +372,24 @@ func shouldLogFabricRead(msgType string, _, _ time.Duration) bool { return false } +func resetOptionalTimer(t *time.Timer, deadline time.Time, ok bool) <-chan time.Time { + if !t.Stop() { + select { + case <-t.C: + default: + } + } + if !ok || deadline.IsZero() { + return nil + 
} + d := time.Until(deadline) + if d < 0 { + d = 0 + } + t.Reset(d) + return t.C +} + func resetTimer(t *time.Timer, d time.Duration) { if !t.Stop() { select { @@ -341,6 +419,7 @@ func (s *session) publishLinkState(reason, err string) { return } status := s.currentStatus() + counters := s.counters if s.link != linkUp && (reason != "" || err != "") { status = statusDown } @@ -363,6 +442,7 @@ func (s *session) publishLinkState(reason, err string) { OutboundCalls: len(s.outboundCalls), Reason: reason, Err: err, + Counters: counters, }, true, )) @@ -372,8 +452,13 @@ func (s *session) markRx() { s.lastRxAt = time.Now() } +func (s *session) markFrameRX() { + s.counters.RXFrames++ +} + func (s *session) markTx() { s.lastTxAt = time.Now() + s.counters.TXFrames++ } func (s *session) handleLinkDown(reason, err string) { @@ -386,14 +471,14 @@ func (s *session) handleLinkDown(reason, err string) { s.peerSID = "" s.peerProto = "" s.exportReadyAt = time.Time{} + s.exportDrainAt = time.Time{} s.exportsEnabled = false s.rpcReady = false - s.transferQuietUntil = time.Time{} - s.transferQuietReason = "" s.teardownExports() s.teardownInbound() s.teardownOutbound(pendingReason) s.teardownImportedRetained() + s.cancelTargetCall(pendingReason) s.abortTransfer(pendingReason) s.clearCompletedTransfers() s.publishLinkState(reason, err) @@ -420,6 +505,7 @@ func (s *session) promoteLink(reason string) { if reason == "" { reason = reasonPeerReset } + s.cancelTargetCall(reason) s.abortTransfer(reason) s.teardownExports() s.teardownInbound() @@ -432,6 +518,7 @@ func (s *session) promoteLink(reason string) { s.setupExports() s.exportsEnabled = true s.exportReadyAt = time.Now().Add(exportStartHoldoff) + s.exportDrainAt = time.Time{} s.nextPingAt = time.Now().Add(s.cfg.PingInterval) s.log("exports enabled") s.publishLinkState(reason, "") @@ -475,24 +562,39 @@ func (s *session) tickReady(now time.Time) { return } if !s.criticalExportReplayDrained() { + s.scheduleExportDrain(now) return } s.rpcReady 
= true s.publishLinkState("", "") + s.drainQueuedExports() } // ---- dispatch ---- func (s *session) dispatch(line []byte) { t := protoType(line) + if t != "" { + s.traceWire("rx", "", line, nil) + } if t == "" { s.logMalformed(line, nil) return } + if fabricTraceEnabled { + println("[fabric]", "sid", s.localSID, "dispatch", "type", t, "xfer", protoXferID(line), "link", int(s.link), "ready", s.rpcReady) + } + switch t { case msgHello: - typedDispatch(s, t, line, s.onHello) + msg, ok := decodeHelloFast(line) + if !ok { + s.logMalformed(line, errors.New("bad_hello")) + return + } + s.markFrameRX() + s.onHello(&msg) return case msgHelloAck: typedDispatch(s, t, line, s.onHelloAck) @@ -505,29 +607,77 @@ func (s *session) dispatch(line []byte) { switch t { case msgPing: - typedDispatch(s, t, line, s.onPing) + msg, ok := decodePingFast(line, msgPing) + if !ok { + s.logMalformed(line, errors.New("bad_ping")) + return + } + s.markFrameRX() + s.onPing(&msg) case msgPong: - typedDispatch(s, t, line, s.onPong) + msg, ok := decodePongFast(line) + if !ok { + s.logMalformed(line, errors.New("bad_pong")) + return + } + s.markFrameRX() + s.onPong(&msg) case msgPub: typedDispatch(s, t, line, s.onPub) case msgUnretain: typedDispatch(s, t, line, s.onUnretain) case msgCall: - typedDispatch(s, t, line, s.onCall) + msg, ok := decodeCallFast(line) + if !ok { + s.logMalformed(line, errors.New("bad_call")) + return + } + s.markFrameRX() + s.onCall(&msg) case msgReply: typedDispatch(s, t, line, s.onReply) case msgXferBegin: - otadiag.Event( - "[fabric-xfer]", "begin_route_start", protoXferID(line), - otadiag.KV("line_len", len(line)), - ) - typedDispatch(s, t, line, s.onTransferBegin) + if fabricXferDiagEnabled("begin_route_start") { + otadiag.Event( + "[fabric-xfer]", "begin_route_start", protoXferID(line), + otadiag.KV("line_len", len(line)), + ) + } + msg, ok := decodeXferBeginFast(line) + if !ok { + s.logMalformed(line, errors.New("bad_xfer_begin")) + return + } + s.markFrameRX() + 
s.onTransferBegin(&msg) + if fabricXferDiagEnabled("begin_route_done") { + otadiag.Event("[fabric-xfer]", "begin_route_done", protoXferID(line)) + } case msgXferChunk: - typedDispatch(s, t, line, s.onTransferChunk) + msg, ok := decodeXferChunkFast(line) + if !ok { + s.logMalformed(line, errors.New("bad_xfer_chunk")) + s.retryMalformedTransferFrame(t, line) + return + } + s.markFrameRX() + s.onTransferChunk(&msg) case msgXferCommit: - typedDispatch(s, t, line, s.onTransferCommit) + msg, ok := decodeXferCommitFast(line) + if !ok { + s.logMalformed(line, errors.New("bad_xfer_commit")) + return + } + s.markFrameRX() + s.onTransferCommit(&msg) case msgXferAbort: - typedDispatch(s, t, line, s.onTransferAbort) + msg, ok := decodeXferAbortFast(line) + if !ok { + s.logMalformed(line, errors.New("bad_xfer_abort")) + return + } + s.markFrameRX() + s.onTransferAbort(&msg) case msgXferReady, msgXferNeed, msgXferDone: s.logKV("echoed transfer control ignored", "type", t) default: @@ -569,9 +719,12 @@ func typedDispatch[T any](s *session, msgType string, line []byte, handler func( s.retryMalformedTransferFrame(msgType, line) return } + s.markFrameRX() handler(&msg) if msgType == msgXferBegin { - otadiag.Event("[fabric-xfer]", "begin_route_done", protoXferID(line)) + if fabricXferDiagEnabled("begin_route_done") { + otadiag.Event("[fabric-xfer]", "begin_route_done", protoXferID(line)) + } } } @@ -592,6 +745,15 @@ func (s *session) retryMalformedTransferFrame(msgType string, line []byte) { s.logKV("malformed xfer_chunk dropped", "id", id) return } + offset, _ := protoTopUint32(line, "offset") + otadiag.Event( + "[fabric-xfer]", "chunk_reject", id, + otadiag.KV("reason", "bad_message"), + otadiag.KV("offset", offset), + otadiag.KV("expected", cur.bytesWritten), + otadiag.KV("encoded_len", len(protoTopString(line, "data"))), + otadiag.KV("line_len", len(line)), + ) s.retryCorruptTransferFrame("bad_message") } @@ -604,10 +766,12 @@ func (s *session) requireLinkUp(t string) bool { } 
func (s *session) logMalformed(line []byte, err error) { + s.counters.RXBadJSON++ errStr := "" if err != nil { errStr = err.Error() } + s.traceWireError("rx", "malformed", errStr, map[string]any{"line_len": len(line), "frame_type": protoType(line), "xfer_id": protoXferID(line)}) if fabricTraceEnabled { println( "[fabric]", "sid", s.localSID, @@ -805,38 +969,6 @@ func validWireTopic(topic []string) bool { return true } -func (s *session) extendTransferQuiet(reason string, d time.Duration) { - now := time.Now() - until := now.Add(d) - if until.After(s.transferQuietUntil) { - s.transferQuietUntil = until - s.transferQuietReason = reason - } -} - -func (s *session) transferQuiet(now time.Time) (bool, string) { - if cur := s.incomingTransfer; cur != nil { - return true, "incoming_transfer:" + cur.meta.ID - } - if !s.transferQuietUntil.IsZero() && now.Before(s.transferQuietUntil) { - reason := s.transferQuietReason - if reason == "" { - reason = "quiet_window" - } - return true, reason - } - return false, "" -} - -func quietAllowsCriticalExports(reason string) bool { - switch reason { - case "xfer_commit_target", "xfer_target_rejected", "xfer_done": - return true - default: - return false - } -} - func (s *session) onHello(msg *protoHello) { if msg.Proto != protocolName { s.log("hello dropped: unsupported proto") @@ -854,12 +986,7 @@ func (s *session) onHello(msg *protoHello) { reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) s.logKV("hello rx", "peer_sid", msg.SID) - if !s.sendControl(marshal(protoHelloAck{ - Type: msgHelloAck, - Proto: protocolName, - SID: s.localSID, - Node: s.nodeID, - })) { + if !s.sendControl(marshalHelloAck(s.localSID, s.nodeID)) { return } s.log("hello_ack tx") @@ -903,7 +1030,7 @@ func (s *session) onPing(msg *protoPing) { } s.markRx() s.logKV("ping rx", "peer_sid", msg.SID) - if !s.sendControl(marshal(protoPong{Type: msgPong, SID: s.localSID})) { + if !s.sendControl(marshalPong(s.localSID)) { return } s.log("pong tx") @@ -916,17 
+1043,10 @@ func (s *session) tickPing(now time.Time) { if s.link != linkUp { return } - if quiet, _ := s.transferQuiet(now); quiet { - // Keep the UART quiet while CM5 is preparing or streaming a firmware - // image; chunk recovery depends on xfer_need being the only periodic - // MCU-originated frame on the fabric link. - s.nextPingAt = now.Add(s.cfg.PingInterval) - return - } if s.nextPingAt.IsZero() || now.Before(s.nextPingAt) { return } - if !s.sendControl(marshal(protoPing{Type: msgPing, SID: s.localSID})) { + if !s.sendControl(marshalPing(s.localSID)) { return } s.nextPingAt = now.Add(s.cfg.PingInterval) @@ -995,24 +1115,22 @@ func (s *session) onCall(msg *protoCall) { if !validWireTopic(msg.Topic) { s.rpcDiag("call_reject", msg, nil, "bad_topic") s.log("incoming call dropped: bad_topic") - s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: "bad_topic"})) + s.sendRPC(marshalReplyErr(msg.ID, "bad_topic")) return } - s.rpcDiag("call_rx", msg, nil, "", - otadiag.KV("timeout_ms", strconvx.Itoa(msg.TimeoutMs)), - ) + s.rpcDiag("call_rx", msg, nil, "") for _, call := range s.inboundCalls { if call.id == msg.ID { s.rpcDiag("call_reject", msg, nil, "duplicate_call_id") s.logKV("incoming call dropped", "err", "duplicate_call_id") - s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: "duplicate_call_id"})) + s.sendRPC(marshalReplyErr(msg.ID, "duplicate_call_id")) return } } if len(s.inboundCalls) >= s.cfg.MaxInboundHelpers { s.rpcDiag("call_reject", msg, nil, reasonBusy) s.log("incoming call dropped: busy") - s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: reasonBusy})) + s.sendRPC(marshalReplyErr(msg.ID, reasonBusy)) return } @@ -1020,36 +1138,26 @@ func (s *session) onCall(msg *protoCall) { if localTopic == nil { s.rpcDiag("call_reject", msg, nil, reasonNoRoute) s.log("incoming call dropped: no_route") - s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: 
reasonNoRoute})) + s.sendRPC(marshalReplyErr(msg.ID, reasonNoRoute)) return } s.rpcDiag("call_route_ok", msg, localTopic, "") s.markRx() - isTransferPrepare := wireTopicEquals(msg.Topic, wireUpdaterPrepare) - if isTransferPrepare { - s.extendTransferQuiet("prepare_call_rx", transferPrepareQuiet) - } - timeout := callTimeoutDef - if msg.TimeoutMs > 0 { - timeout = time.Duration(msg.TimeoutMs) * time.Millisecond - } busMsg := s.conn.NewMessage(localTopic, msg.Payload, false) - s.rpcDiag("call_dispatch_start", msg, localTopic, "", - otadiag.KV("timeout_ms", strconvx.Itoa(int(timeout/time.Millisecond))), - ) - sub := s.conn.Request(busMsg) + s.rpcDiag("call_dispatch_start", msg, localTopic, "") + sub := s.requestBus(busMsg) topicCopy := append([]string(nil), msg.Topic...) - s.inboundCalls = append(s.inboundCalls, &inboundCall{ - id: msg.ID, - topic: topicCopy, - localTopic: localTopic, - payload: append(json.RawMessage(nil), msg.Payload...), - sub: sub, - deadline: time.Now().Add(timeout), - transferPrepare: isTransferPrepare, - }) + call := &inboundCall{ + id: msg.ID, + topic: topicCopy, + localTopic: localTopic, + payload: append(json.RawMessage(nil), msg.Payload...), + sub: sub, + deadline: time.Now().Add(timeout), + } + s.inboundCalls = append(s.inboundCalls, call) } func (s *session) onReply(msg *protoReply) { @@ -1123,19 +1231,56 @@ func marshalPayload(payload any) (json.RawMessage, error) { // Exports are drained inline in the main loop (no extra goroutines) // to avoid TinyGo cooperative scheduler mutex panics. 
+func (s *session) busReady() <-chan struct{} { + if s.busSubs == nil { + return nil + } + return s.busSubs.Ready() +} + +func (s *session) subscribeBus(tp bus.Topic) *bus.Subscription { + if s.busSubs == nil { + return s.conn.Subscribe(tp) + } + return s.busSubs.Subscribe(tp) +} + +func (s *session) requestBus(msg *bus.Message) *bus.Subscription { + if s.busSubs == nil { + return s.conn.Request(msg) + } + return s.busSubs.Request(msg) +} + +func (s *session) drainBusEvents(now time.Time) { + // A single bus readiness edge may cover several subscriptions. Drain each + // class without blocking. Export drains collect/coalesce retained facts and + // admit only a small number of frames per tick. This keeps the UART writer + // event-driven without adding a second writer actor or a transfer-specific + // quiet window. + s.drainExports() + s.drainOutboundMessages(now) + s.drainInbound(now) + s.tickReady(now) +} + func (s *session) setupExports() { if s.conn == nil { return } for _, p := range criticalExportTopics { - s.criticalExportSubs = append(s.criticalExportSubs, s.conn.Subscribe(p)) + sub := s.subscribeBus(p) + s.criticalExportSubs = append(s.criticalExportSubs, sub) s.criticalExportReplayPending = append(s.criticalExportReplayPending, true) + s.criticalExportPendingMsgs = append(s.criticalExportPendingMsgs, nil) } for _, p := range exportPatterns() { - s.exportSubs = append(s.exportSubs, s.conn.Subscribe(p)) + sub := s.subscribeBus(p) + s.exportSubs = append(s.exportSubs, sub) } for _, p := range exportCallPatterns() { - s.exportCallSubs = append(s.exportCallSubs, s.conn.Subscribe(p)) + sub := s.subscribeBus(p) + s.exportCallSubs = append(s.exportCallSubs, sub) } } @@ -1145,6 +1290,8 @@ func (s *session) teardownExports() { } s.criticalExportSubs = nil s.criticalExportReplayPending = nil + s.criticalExportPendingMsgs = nil + s.exportPendingMsgs = nil for _, sub := range s.exportSubs { s.conn.Unsubscribe(sub) } @@ -1244,65 +1391,160 @@ func (s *session) 
criticalExportReplayDrained() bool { func (s *session) drainCriticalExports(total *int) bool { for i, sub := range s.criticalExportSubs { - if *total >= exportMaxPerTick { - return true + if m := latestSubscriptionMessage(sub); m != nil { + s.queueCriticalExport(i, m) } - m := latestSubscriptionMessage(sub) + } + for i, m := range s.criticalExportPendingMsgs { if m == nil { if i < len(s.criticalExportReplayPending) && s.criticalExportReplayPending[i] { return true } continue } + if *total >= exportMaxPerTick { + return true + } sent, ok := s.sendExportMessage(m) if !ok { return false } - if sent && i < len(s.criticalExportReplayPending) && s.criticalExportReplayPending[i] { - s.criticalExportReplayPending[i] = false - } if sent { (*total)++ + if i < len(s.criticalExportReplayPending) && s.criticalExportReplayPending[i] { + s.criticalExportReplayPending[i] = false + } + s.criticalExportPendingMsgs[i] = nil } } return true } -// drainExports does a non-blocking read of each export subscription -// and writes any messages to the wire. Called from the main loop. -func (s *session) drainExports() { - if s.link != linkUp { +func (s *session) exportCanSend(now time.Time) bool { + return s.link == linkUp && s.exportsEnabled && (s.exportReadyAt.IsZero() || !now.Before(s.exportReadyAt)) +} + +func (s *session) queueCriticalExport(idx int, m *bus.Message) { + if idx < 0 || idx >= len(s.criticalExportPendingMsgs) { return } - now := time.Now() - quiet, quietReason := s.transferQuiet(now) - if !s.exportsEnabled { + s.criticalExportPendingMsgs[idx] = m +} + +func (s *session) queueExport(m *bus.Message) { + if m == nil { + return + } + // Retained exports are a cache: when several retained facts for the same + // topic arrive before the UART writer has capacity, only the newest value is + // useful on the wire. Coalescing here keeps the output path event-driven and + // avoids a semantic "pause exports during transfer" policy. 
+ if m.Retained { + for i, pending := range s.exportPendingMsgs { + if pending != nil && pending.Retained && topicEquals(pending.Topic, m.Topic) { + s.exportPendingMsgs[i] = m + return + } + } + } + // Non-retained events are sparse, but keep the FIFO bounded. If the link is + // congested, old non-critical observations are less valuable than keeping the + // reactor and control frames moving. + const maxPendingExports = 32 + if len(s.exportPendingMsgs) >= maxPendingExports { + copy(s.exportPendingMsgs, s.exportPendingMsgs[1:]) + s.exportPendingMsgs[len(s.exportPendingMsgs)-1] = m return } - if !s.exportReadyAt.IsZero() && now.Before(s.exportReadyAt) { + s.exportPendingMsgs = append(s.exportPendingMsgs, m) +} + +func (s *session) hasCriticalExportBacklog() bool { + for i, m := range s.criticalExportPendingMsgs { + if m != nil { + return true + } + if i < len(s.criticalExportReplayPending) && s.criticalExportReplayPending[i] { + return true + } + } + return false +} + +func (s *session) hasExportBacklog() bool { + return s.hasCriticalExportBacklog() || len(s.exportPendingMsgs) > 0 +} + +func (s *session) handleCriticalExportEvent(idx int, m *bus.Message) { + if idx < 0 || idx >= len(s.criticalExportReplayPending) { return } - if quiet && !quietAllowsCriticalExports(quietReason) { - // Avoid colliding telemetry/state exports with prepare/xfer traffic on - // echo-prone links. Post-transfer quiet allows critical facts below so - // state=rebooting can reach CM5 before the reboot arm. 
+ s.queueCriticalExport(idx, m) + s.scheduleExportDrain(time.Now()) +} + +func (s *session) handleExportEvent(m *bus.Message) { + if m == nil || (len(s.criticalExportSubs) > 0 && isCriticalExportTopic(m.Topic)) { + return + } + s.queueExport(m) + s.scheduleExportDrain(time.Now()) +} + +func (s *session) drainQueuedExports() { + now := time.Now() + if !s.exportCanSend(now) { return } total := 0 if !s.drainCriticalExports(&total) { return } - if quiet { + if !s.criticalExportReplayDrained() { + s.scheduleExportDrain(now) return } - if !s.criticalExportReplayDrained() { + for total < exportMaxPerTick && len(s.exportPendingMsgs) > 0 { + m := s.exportPendingMsgs[0] + s.exportPendingMsgs = s.exportPendingMsgs[1:] + if m == nil || (len(s.criticalExportSubs) > 0 && isCriticalExportTopic(m.Topic)) { + continue + } + sent, ok := s.sendExportMessage(m) + if !ok { + s.exportPendingMsgs = append([]*bus.Message{m}, s.exportPendingMsgs...) + return + } + if sent { + total++ + } + } + if s.hasExportBacklog() { + s.scheduleExportDrain(now) + } +} + +func (s *session) scheduleExportDrain(now time.Time) { + when := now.Add(time.Millisecond) + if s.exportDrainAt.IsZero() || when.Before(s.exportDrainAt) { + s.exportDrainAt = when + } +} + +// drainExports does a non-blocking read of each export subscription +// and writes any messages to the wire. Called from the main loop. +func (s *session) drainExports() { + now := time.Now() + s.exportDrainAt = time.Time{} + if !s.exportCanSend(now) { return } + // Collect all immediately-available export notifications into the session's + // coalesced retained queue. Sending is handled by drainQueuedExports below, + // with a small per-tick budget. This keeps retained replay fair without + // making transfer state a special case. 
for _, sub := range s.exportSubs { for { - if total >= exportMaxPerTick { - return - } select { case m, ok := <-sub.Channel(): if !ok || m == nil { @@ -1311,165 +1553,171 @@ func (s *session) drainExports() { if len(s.criticalExportSubs) > 0 && isCriticalExportTopic(m.Topic) { continue } - sent, ok := s.sendExportMessage(m) - if !ok { - return - } - if sent { - total++ - } + s.queueExport(m) default: goto nextSub } } nextSub: } + s.drainQueuedExports() } -func (s *session) drainInbound(now time.Time) { - if len(s.inboundCalls) == 0 { +func (s *session) findInboundCall(id string) (*inboundCall, int) { + for i, call := range s.inboundCalls { + if call.id == id { + return call, i + } + } + return nil, -1 +} + +func (s *session) removeInboundCall(idx int) { + if idx < 0 || idx >= len(s.inboundCalls) { + return + } + s.inboundCalls = append(s.inboundCalls[:idx], s.inboundCalls[idx+1:]...) +} + +func (s *session) handleInboundReplyEvent(id string, reply *bus.Message, closed bool) { + call, idx := s.findInboundCall(id) + if call == nil { return } + if call.sub != nil { + s.conn.Unsubscribe(call.sub) + call.sub = nil + } + s.removeInboundCall(idx) + if closed || reply == nil { + sent := s.sendRPC(marshalReplyErr(call.id, reasonTimeout)) + s.rpcDiagInbound("call_reply_tx", call, false, reasonTimeout, otadiag.KV("sent", sent)) + return + } + if errStr := checkBusError(reply.Payload); errStr != "" { + sent := s.sendRPC(marshalReplyErr(call.id, errStr)) + s.rpcDiagInbound("call_reply_tx", call, false, errStr, otadiag.KV("sent", sent)) + return + } + payload, err := marshalPayload(reply.Payload) + if err != nil { + sent := s.sendRPC(marshalReplyErr(call.id, errPayloadMarshal)) + s.rpcDiagInbound("call_reply_tx", call, false, errPayloadMarshal, otadiag.KV("sent", sent)) + return + } + sent := s.sendRPC(marshalReplyOKRaw(call.id, payload)) + s.rpcDiagInbound("call_reply_tx", call, true, "", otadiag.KV("sent", sent)) +} +func (s *session) expireInbound(now time.Time) { + if 
len(s.inboundCalls) == 0 { + return + } keep := s.inboundCalls[:0] for _, call := range s.inboundCalls { - select { - case reply, ok := <-call.sub.Channel(): - s.conn.Unsubscribe(call.sub) - call.sub = nil // prevent double-unsubscribe in teardownInbound - if !ok || reply == nil { - if call.transferPrepare { - s.extendTransferQuiet("prepare_reply_timeout", transferPrepareQuiet) - } - sent := s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) - s.rpcDiagInbound("call_reply_tx", call, false, reasonTimeout, otadiag.KV("sent", sent)) - if !sent { - return - } - continue - } - if errStr := checkBusError(reply.Payload); errStr != "" { - if call.transferPrepare { - s.extendTransferQuiet("prepare_reply_error", transferPrepareQuiet) - } - sent := s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errStr})) - s.rpcDiagInbound("call_reply_tx", call, false, errStr, otadiag.KV("sent", sent)) - if !sent { - return - } - continue - } - payload, err := marshalPayload(reply.Payload) - if err != nil { - if call.transferPrepare { - s.extendTransferQuiet("prepare_reply_marshal_failed", transferPrepareQuiet) - } - sent := s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errPayloadMarshal})) - s.rpcDiagInbound("call_reply_tx", call, false, errPayloadMarshal, otadiag.KV("sent", sent)) - if !sent { - return - } - continue - } - if call.transferPrepare { - s.extendTransferQuiet("prepare_reply_ok", transferPrepareQuiet) - } - sent := s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: true, Payload: payload})) - s.rpcDiagInbound("call_reply_tx", call, true, "", otadiag.KV("sent", sent)) - if !sent { - return - } - continue - default: - } - if !now.Before(call.deadline) { - s.conn.Unsubscribe(call.sub) - call.sub = nil - if call.transferPrepare { - s.extendTransferQuiet("prepare_call_timeout", transferPrepareQuiet) + if call.sub != nil { + s.conn.Unsubscribe(call.sub) + call.sub = nil } 
- sent := s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) + sent := s.sendRPC(marshalReplyErr(call.id, reasonTimeout)) s.rpcDiagInbound("call_reply_tx", call, false, reasonTimeout, otadiag.KV("sent", sent)) - if !sent { - return - } continue } - keep = append(keep, call) } - s.inboundCalls = keep } -func (s *session) drainOutbound(now time.Time) { - // Forward new outgoing calls from the local bus onto the wire. - if s.link == linkUp && len(s.exportCallSubs) > 0 { - for _, sub := range s.exportCallSubs { - for { - select { - case msg, ok := <-sub.Channel(): - if !ok || msg == nil { - goto nextSub - } - - wireTopic := exportCallTopic(msg.Topic) - if wireTopic == nil { - continue - } - - payload, err := marshalPayload(msg.Payload) - if err != nil { - s.logKV("outgoing call dropped", "err", err.Error()) - if msg.CanReply() { - s.conn.Reply(msg, types.ErrorReply{OK: false, Error: errPayloadMarshal}, false) - } - continue - } - id := s.nextOutboundID - s.nextOutboundID++ - corr := "wire-" + strconvx.Utoa64(id) - if msg.CanReply() { - s.outboundCalls = append(s.outboundCalls, &outboundCall{ - id: corr, - req: msg, - deadline: now.Add(callTimeoutDef), - }) - } - if !s.sendRPC(marshal(protoCall{ - Type: msgCall, - ID: corr, - Topic: wireTopic, - Payload: payload, - TimeoutMs: int(callTimeoutDef / time.Millisecond), - })) { - return - } - default: - goto nextSub - } +func (s *session) drainInbound(now time.Time) { + // Reactor path: bus readiness is coalesced by SubscriptionSet, then this + // reducer drains all ready inbound replies without blocking. Direct calls + // still let unit tests exercise the reducer without running the event loop. + calls := append([]*inboundCall(nil), s.inboundCalls...) 
+ for _, call := range calls { + if call == nil || call.sub == nil { + continue + } + select { + case reply, ok := <-call.sub.Channel(): + s.handleInboundReplyEvent(call.id, reply, !ok) + default: + } + } + s.expireInbound(now) +} + +func (s *session) handleOutboundCallEvent(now time.Time, msg *bus.Message) { + if s.link != linkUp || msg == nil { + return + } + wireTopic := exportCallTopic(msg.Topic) + if wireTopic == nil { + return + } + payload, err := marshalPayload(msg.Payload) + if err != nil { + s.logKV("outgoing call dropped", "err", err.Error()) + if msg.CanReply() { + s.conn.Reply(msg, types.ErrorReply{OK: false, Error: errPayloadMarshal}, false) + } + return + } + id := s.nextOutboundID + s.nextOutboundID++ + corr := "wire-" + strconvx.Utoa64(id) + if msg.CanReply() { + s.outboundCalls = append(s.outboundCalls, &outboundCall{ + id: corr, + req: msg, + deadline: now.Add(callTimeoutDef), + }) + } + _ = s.sendRPC(marshal(protoCall{ + Type: msgCall, + ID: corr, + Topic: wireTopic, + Payload: payload, + })) +} + +func (s *session) expireOutbound(now time.Time) { + if len(s.outboundCalls) == 0 { + return + } + keep := s.outboundCalls[:0] + for _, call := range s.outboundCalls { + if !now.Before(call.deadline) { + if call.req != nil && call.req.CanReply() { + s.conn.Reply(call.req, types.ErrorReply{OK: false, Error: reasonTimeout}, false) } - nextSub: + continue } + keep = append(keep, call) } + s.outboundCalls = keep +} - // Expire outbound calls that have timed out waiting for a remote reply. 
- if len(s.outboundCalls) > 0 { - keep := s.outboundCalls[:0] - for _, call := range s.outboundCalls { - if !now.Before(call.deadline) { - if call.req != nil && call.req.CanReply() { - s.conn.Reply(call.req, types.ErrorReply{OK: false, Error: reasonTimeout}, false) +func (s *session) drainOutboundMessages(now time.Time) { + for _, sub := range s.exportCallSubs { + for { + select { + case msg, ok := <-sub.Channel(): + if !ok || msg == nil { + goto nextSub } - continue + s.handleOutboundCallEvent(now, msg) + default: + goto nextSub } - keep = append(keep, call) } - s.outboundCalls = keep + nextSub: } + s.expireOutbound(now) } +func (s *session) drainOutbound(now time.Time) { s.drainOutboundMessages(now) } + // ---- transport write ---- // sendControl, sendRPC, sendBulk are the lane-tagged enqueue entry diff --git a/services/fabric/trace.go b/services/fabric/trace.go index 7453e17..8d47017 100644 --- a/services/fabric/trace.go +++ b/services/fabric/trace.go @@ -1,6 +1,16 @@ package fabric -import "devicecode-go/x/xxhash" +import ( + "encoding/json" + "os" + "strings" + "time" + + "devicecode-go/bus" + "devicecode-go/x/xxhash" +) + +var fabricTraceTypes = parseFabricTraceTypes(os.Getenv("FABRIC_TRACE_TYPES")) func traceLine(dir string, data []byte) { if !fabricTraceEnabled { @@ -9,6 +19,108 @@ func traceLine(dir string, data []byte) { println("[fabric-trace]", dir, "len", len(data), "line", tracePreview(data)) } +func fabricTraceTypeAllowed(t string) bool { + if !fabricTraceEnabled { + return false + } + if len(fabricTraceTypes) == 0 { + return true + } + return fabricTraceTypes[t] || fabricTraceTypes["*"] || fabricTraceTypes["all"] +} + +func parseFabricTraceTypes(v string) map[string]bool { + out := map[string]bool{} + for _, p := range strings.FieldsFunc(v, func(r rune) bool { + return r == ',' || r == ' ' || r == '\t' || r == '\n' + }) { + if p != "" { + out[p] = true + } + } + if len(out) == 0 { + return nil + } + return out +} + +func (s *session) 
traceWire(direction, lane string, line []byte, extra map[string]any) { + if !fabricTraceEnabled { + return + } + t := protoType(line) + if !fabricTraceTypeAllowed(t) { + return + } + payload := map[string]any{ + "kind": "fabric.wire", + "direction": direction, + "lane": lane, + "frame_type": t, + "link_id": s.linkID, + "local_sid": s.localSID, + "peer_sid": s.peerSID, + "peer_node": s.peerNode, + "line_len": len(line), + "ts_unix_ms": time.Now().UnixMilli(), + } + if x := protoXferID(line); x != "" { + payload["xfer_id"] = x + } + if off, ok := protoTopUint32(line, "offset"); ok { + payload["offset"] = off + } + if next, ok := protoTopUint32(line, "next"); ok { + payload["next"] = next + } + if size, ok := protoTopUint32(line, "size"); ok { + payload["size"] = size + } + if id := protoTopString(line, "id"); id != "" { + payload["call_id"] = id + } + if topic := protoTopStringArray(line, "topic"); len(topic) > 0 { + payload["topic"] = strings.Join(topic, "/") + } + if os.Getenv("FABRIC_TRACE_PREVIEW") != "" { + payload["preview"] = tracePreview(line) + } + for k, v := range extra { + payload[k] = v + } + + println("[fabric-wire]", direction, "lane", lane, "type", t, "xfer", protoXferID(line), "len", len(line)) + if s.conn != nil && os.Getenv("FABRIC_TRACE_OBS") != "" { + s.conn.Publish(s.conn.NewMessage(bus.T("obs", "v1", "fabric", "event", "wire"), payload, false)) + s.conn.Publish(s.conn.NewMessage(bus.T("obs", "event", "fabric", "wire"), payload, false)) + } +} + +func (s *session) traceWireError(direction, event string, err string, extra map[string]any) { + if !fabricTraceEnabled { + return + } + payload := map[string]any{ + "kind": "fabric.wire", + "direction": direction, + "event": event, + "err": err, + "link_id": s.linkID, + "local_sid": s.localSID, + "peer_sid": s.peerSID, + "peer_node": s.peerNode, + "ts_unix_ms": time.Now().UnixMilli(), + } + for k, v := range extra { + payload[k] = v + } + println("[fabric-wire]", direction, "event", event, "err", err) + 
if s.conn != nil && os.Getenv("FABRIC_TRACE_OBS") != "" { + s.conn.Publish(s.conn.NewMessage(bus.T("obs", "v1", "fabric", "event", "wire"), payload, false)) + s.conn.Publish(s.conn.NewMessage(bus.T("obs", "event", "fabric", "wire"), payload, false)) + } +} + func tracePreview(data []byte) string { const max = 200 if len(data) > max { @@ -57,3 +169,11 @@ func hexNibble(v byte) byte { } return 'a' + (v - 10) } + +func traceJSON(v any) string { + b, err := json.Marshal(v) + if err != nil { + return err.Error() + } + return string(b) +} diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index 95b10a7..6b6a593 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -3,10 +3,11 @@ package fabric import ( "encoding/base64" "encoding/json" - "runtime" + "errors" "strings" "time" + "devicecode-go/bus" "devicecode-go/services/otadiag" "devicecode-go/services/updater" "devicecode-go/x/strconvx" @@ -17,7 +18,11 @@ const transferTargetUpdaterMain = "updater/main" const transferIdleRetryLimit = 3 const transferCorruptRetryLimit = 3 const completedTransferCacheLimit = 4 -const transferMemSampleStride = 64 * 1024 +const transferProgressLogStep uint32 = 32 * 1024 + +func fabricXferDiagEnabled(event string) bool { + return otadiag.Enabled("[fabric-xfer]", event) +} // transferMeta captures xfer_begin contents. The transfer target is explicit // on the wire; firmware update uses target="updater/main". meta remains opaque @@ -38,6 +43,13 @@ type transferInfo struct { BytesWritten uint32 SlotXIPAddr uint32 Generation uint64 + cancel func(reason string) +} + +func (i transferInfo) cancelStage(reason string) { + if i.cancel != nil { + i.cancel(reason) + } } // transferSink is the firmware-side write target for an incoming transfer. @@ -45,38 +57,286 @@ type transferInfo struct { // canonical wire fields). No sequence number is passed — the caller has // already validated offset against expected progress. 
// -// Bytes() returns the committed payload bytes for target invocation. -// Only valid after Commit() has succeeded. May return nil if the sink -// streamed the bytes elsewhere (e.g. the RP2350 sink writes directly to -// flash and doesn't keep a RAM copy); updater/main consumes that staged -// stream from the updater package. +// The sink owns transfer bytes. Fabric never asks it for a whole-image +// []byte; after Commit succeeds the updater/main stage RPC consumes the +// committed streamed stage by xfer_id/generation. type transferSink interface { WriteChunk(offset uint32, data []byte) error Commit() (transferInfo, error) Apply() error Abort(reason string) error - Bytes() []byte } type incomingTransfer struct { meta transferMeta - sink transferSink + worker *transferSinkWorker bytesWritten uint32 chunksSeen uint32 hasher *xxhash.Hasher idleRetries uint8 corruptRetryOffset uint32 corruptRetriesAtOffset uint8 + pendingChunk *pendingChunkWrite + pendingCommit *pendingTransferCommit + nextProgressLogAt uint32 // deadline is the idle-chunk watchdog: bumped on every accepted chunk // and on initial xfer_begin. checkTransferTimeout fires if now > deadline. // Mirrors transfer_mgr.lua: `active.deadline = runtime.now() + phase_timeout`. + // While a chunk write is pending this also bounds the staging operation; + // the Fabric session loop stays live and the next xfer_need is not sent + // until the updater sink reports that the chunk has been accepted. 
deadline time.Time } +type pendingChunkWrite struct { + xferID string + offset uint32 + data []byte + started time.Time + warned bool + resultCh chan transferChunkResult +} + +type transferChunkResult struct { + err error +} + +type pendingTransferCommit struct { + xferID string + started time.Time + warned bool + resultCh chan transferCommitResult +} + +type transferCommitResult struct { + info transferInfo + err error +} + +type transferSinkCommandKind uint8 + +const ( + transferSinkCommandWrite transferSinkCommandKind = iota + 1 + transferSinkCommandCommit + transferSinkCommandAbort +) + +type transferSinkCommand struct { + kind transferSinkCommandKind + xferID string + offset uint32 + data []byte + reason string + timeout time.Duration + chunkResult chan<- transferChunkResult + commitResult chan<- transferCommitResult +} + +type transferSinkWorker struct { + xferID string + cmdCh chan transferSinkCommand +} + +func newTransferSinkWorker(xferID string, sink transferSink) *transferSinkWorker { + w := &transferSinkWorker{ + xferID: xferID, + cmdCh: make(chan transferSinkCommand, 1), + } + go w.run(sink) + return w +} + +func (w *transferSinkWorker) write(xferID string, offset uint32, data []byte, timeout time.Duration, result chan<- transferChunkResult) bool { + return w.send(transferSinkCommand{ + kind: transferSinkCommandWrite, + xferID: xferID, + offset: offset, + data: data, + timeout: timeout, + chunkResult: result, + }) +} + +func (w *transferSinkWorker) commit(xferID string, timeout time.Duration, result chan<- transferCommitResult) bool { + return w.send(transferSinkCommand{ + kind: transferSinkCommandCommit, + xferID: xferID, + timeout: timeout, + commitResult: result, + }) +} + +func (w *transferSinkWorker) abort(reason string) bool { + if reason == "" { + reason = "abort" + } + return w.send(transferSinkCommand{ + kind: transferSinkCommandAbort, + xferID: w.xferID, + reason: reason, + }) +} + +func (w *transferSinkWorker) send(cmd transferSinkCommand) 
bool { + select { + case w.cmdCh <- cmd: + return true + default: + return false + } +} + +func (w *transferSinkWorker) run(sink transferSink) { + for cmd := range w.cmdCh { + switch cmd.kind { + case transferSinkCommandWrite: + if !w.runWrite(sink, cmd) { + return + } + case transferSinkCommandCommit: + w.runCommit(sink, cmd) + return + case transferSinkCommandAbort: + _ = sink.Abort(cmd.reason) + return + } + } +} + +func (w *transferSinkWorker) runWrite(sink transferSink, cmd transferSinkCommand) bool { + start := time.Now() + err := sink.WriteChunk(cmd.offset, cmd.data) + if err != nil { + _ = sink.Abort(err.Error()) + cmd.chunkResult <- transferChunkResult{err: err} + return false + } + if cmd.timeout > 0 && time.Since(start) > cmd.timeout { + reason := "chunk_write_timeout" + _ = sink.Abort(reason) + cmd.chunkResult <- transferChunkResult{err: errors.New(reason)} + return false + } + cmd.chunkResult <- transferChunkResult{} + return true +} + +func (w *transferSinkWorker) runCommit(sink transferSink, cmd transferSinkCommand) { + start := time.Now() + info, err := sink.Commit() + res := transferCommitResult{info: info, err: err} + if err != nil { + _ = sink.Abort(err.Error()) + cmd.commitResult <- res + return + } + if cmd.timeout > 0 && time.Since(start) > cmd.timeout { + reason := "transfer_commit_timeout" + info.cancelStage(reason) + cmd.commitResult <- transferCommitResult{err: errors.New(reason)} + return + } + cmd.commitResult <- res +} + type completedTransfer struct { meta transferMeta } +type pendingTargetCall struct { + xferID string + meta transferMeta + info transferInfo + sub *bus.Subscription + deadline time.Time +} + +func (s *session) pendingChunkReady() <-chan transferChunkResult { + cur := s.incomingTransfer + if cur == nil || cur.pendingChunk == nil { + return nil + } + return cur.pendingChunk.resultCh +} + +func (s *session) pendingCommitReady() <-chan transferCommitResult { + cur := s.incomingTransfer + if cur == nil || cur.pendingCommit 
== nil { + return nil + } + return cur.pendingCommit.resultCh +} + +func (s *session) pendingTargetReady() <-chan *bus.Message { + call := s.pendingTargetCall + if call == nil || call.sub == nil { + return nil + } + return call.sub.Channel() +} + +func earlierDeadline(a time.Time, aOK bool, b time.Time, bOK bool) (time.Time, bool) { + if !aOK { + return b, bOK + } + if !bOK { + return a, true + } + if b.Before(a) { + return b, true + } + return a, true +} + +func (s *session) nextPendingDeadline(now time.Time) (time.Time, bool) { + var out time.Time + var ok bool + if cur := s.incomingTransfer; cur != nil { + if cur.pendingChunk != nil && !cur.pendingChunk.started.IsZero() { + out, ok = earlierDeadline(out, ok, cur.pendingChunk.started.Add(s.cfg.PhaseTimeout), true) + } else if cur.pendingCommit != nil && !cur.pendingCommit.started.IsZero() { + out, ok = earlierDeadline(out, ok, cur.pendingCommit.started.Add(s.cfg.TargetCallTimeout), true) + } else if !cur.deadline.IsZero() { + out, ok = earlierDeadline(out, ok, cur.deadline, true) + } + } + if call := s.pendingTargetCall; call != nil && !call.deadline.IsZero() { + out, ok = earlierDeadline(out, ok, call.deadline, true) + } + for _, call := range s.inboundCalls { + if !call.deadline.IsZero() { + out, ok = earlierDeadline(out, ok, call.deadline, true) + } + } + for _, call := range s.outboundCalls { + if !call.deadline.IsZero() { + out, ok = earlierDeadline(out, ok, call.deadline, true) + } + } + if s.link == linkUp && !s.nextPingAt.IsZero() { + out, ok = earlierDeadline(out, ok, s.nextPingAt, true) + } + if s.link == linkUp && !s.rpcReady && !s.exportReadyAt.IsZero() { + out, ok = earlierDeadline(out, ok, s.exportReadyAt, true) + } + if s.link == linkUp && !s.exportDrainAt.IsZero() { + out, ok = earlierDeadline(out, ok, s.exportDrainAt, true) + } + return out, ok +} + +func (s *session) handlePendingDeadline(now time.Time) { + s.checkTransferTimeout(now) + call := s.pendingTargetCall + if call != nil && 
!now.Before(call.deadline) { + s.finishTargetCall(call, false, "stage_timeout") + } + s.expireInbound(now) + s.expireOutbound(now) + s.tickPing(now) + s.drainBusEvents(now) +} + func sameTransferTuple(a, b transferMeta) bool { return a.ID == b.ID && a.Target == b.Target && @@ -111,65 +371,156 @@ func u32s(v uint32) string { return strconvx.Itoa(int(v)) } -func decodeChunkData(encoded string) ([]byte, string) { - raw, err := base64.RawURLEncoding.DecodeString(encoded) +func nextTransferProgressAfter(v uint32) uint32 { + next := ((v / transferProgressLogStep) + 1) * transferProgressLogStep + if next <= v { + return v + transferProgressLogStep + } + return next +} + +func (cur *incomingTransfer) shouldLogProgress() bool { + if cur == nil { + return false + } + if cur.bytesWritten == 0 { + return false + } + if cur.nextProgressLogAt == 0 { + cur.nextProgressLogAt = transferProgressLogStep + } + if cur.bytesWritten < cur.nextProgressLogAt && cur.bytesWritten < cur.meta.Size { + return false + } + for cur.nextProgressLogAt <= cur.bytesWritten && cur.nextProgressLogAt < cur.meta.Size { + cur.nextProgressLogAt = nextTransferProgressAfter(cur.nextProgressLogAt) + } + return true +} + +func (s *session) decodeChunkData(encoded string) ([]byte, string) { + s.buffers = ensureFabricBuffers(s.buffers) + maxAccepted := int(s.cfg.MaxAcceptedChunkSize) + if maxAccepted <= 0 || maxAccepted > len(s.buffers.ChunkRaw) { + maxAccepted = len(s.buffers.ChunkRaw) + } + if len(encoded) > len(s.buffers.ChunkB64) { + return nil, "chunk_too_large" + } + decodedLen := base64.RawURLEncoding.DecodedLen(len(encoded)) + if decodedLen > maxAccepted { + return nil, "chunk_too_large" + } + copy(s.buffers.ChunkB64[:], encoded) + raw := s.buffers.ChunkRaw[:maxAccepted] + n, err := base64.RawURLEncoding.Decode(raw, s.buffers.ChunkB64[:len(encoded)]) if err != nil { return nil, "invalid_chunk_encoding" } - if base64.RawURLEncoding.EncodeToString(raw) != encoded { + encLen := 
base64.RawURLEncoding.EncodedLen(n) + if encLen != len(encoded) || encLen > len(s.buffers.ChunkB64) { return nil, "invalid_chunk_encoding" } - return raw, "" + base64.RawURLEncoding.Encode(s.buffers.ChunkB64[:encLen], raw[:n]) + for i := 0; i < encLen; i++ { + if s.buffers.ChunkB64[i] != encoded[i] { + return nil, "invalid_chunk_encoding" + } + } + return raw[:n], "" } func (s *session) sendTransferReady(id string) bool { - return s.sendControl(marshal(protoXferReady{ - Type: msgXferReady, - XferID: id, - })) + if fabricTraceEnabled { + println("[fabric-xfer]", "sid", s.localSID, "tx_ready_start", "xfer", id) + } + ok := s.sendControl(marshalXferReady(id)) + if fabricTraceEnabled { + println("[fabric-xfer]", "sid", s.localSID, "tx_ready_done", "xfer", id, "ok", ok) + } + return ok } func (s *session) sendTransferNeed(id string, next uint32) bool { - return s.sendControl(marshal(protoXferNeed{ - Type: msgXferNeed, - XferID: id, - Next: next, - })) + if fabricTraceEnabled { + println("[fabric-xfer]", "sid", s.localSID, "tx_need_start", "xfer", id, "next", next) + } + ok := s.sendControl(marshalXferNeed(id, next)) + if fabricTraceEnabled { + println("[fabric-xfer]", "sid", s.localSID, "tx_need_done", "xfer", id, "next", next, "ok", ok) + } + return ok } func (s *session) sendTransferDone(id string) bool { - return s.sendControl(marshal(protoXferDone{ - Type: msgXferDone, - XferID: id, - })) + if fabricTraceEnabled { + println("[fabric-xfer]", "sid", s.localSID, "tx_done_start", "xfer", id) + } + ok := s.sendControl(marshalXferDone(id)) + if fabricTraceEnabled { + println("[fabric-xfer]", "sid", s.localSID, "tx_done_done", "xfer", id, "ok", ok) + } + return ok } func (s *session) sendTransferAbort(id, reason string) bool { - return s.sendControl(marshal(protoXferAbort{ - Type: msgXferAbort, - XferID: id, - Err: reason, - })) + if fabricTraceEnabled { + println("[fabric-xfer]", "sid", s.localSID, "tx_abort_start", "xfer", id, "reason", reason) + } + ok := 
s.sendControl(marshalXferAbort(id, reason)) + if fabricTraceEnabled { + println("[fabric-xfer]", "sid", s.localSID, "tx_abort_done", "xfer", id, "reason", reason, "ok", ok) + } + return ok } func (s *session) clearTransfer() *incomingTransfer { cur := s.incomingTransfer s.incomingTransfer = nil + if cur != nil && cur.pendingChunk != nil { + cur.pendingChunk.data = nil + cur.pendingChunk = nil + } + if cur != nil { + cur.pendingCommit = nil + } return cur } func (s *session) abortTransfer(reason string) { + s.counters.TransferAborts++ cur := s.clearTransfer() if cur == nil { return } - otadiag.Event("[fabric-xfer]", "abort_local", cur.meta.ID, otadiag.KV("reason", reason)) + otadiag.Event("[fabric-xfer]", "abort_local", cur.meta.ID, + otadiag.KV("reason", reason), + otadiag.KV("next", cur.bytesWritten), + otadiag.KV("size", cur.meta.Size), + otadiag.KV("chunks", cur.chunksSeen), + otadiag.KV("idle_retries", int(cur.idleRetries)), + ) otadiag.StopUpdateWindow(reason) - if err := cur.sink.Abort(reason); err != nil { - s.logKV("transfer abort failed", "err", err.Error()) + if cur.worker != nil && !cur.worker.abort(reason) { + s.logKV("transfer abort enqueue failed", "reason", reason) } } +func (s *session) clearTransferAfterWorkerFailure(reason string) { + cur := s.clearTransfer() + if cur == nil { + return + } + otadiag.Event("[fabric-xfer]", "abort_local", cur.meta.ID, + otadiag.KV("reason", reason), + otadiag.KV("next", cur.bytesWritten), + otadiag.KV("size", cur.meta.Size), + otadiag.KV("chunks", cur.chunksSeen), + otadiag.KV("worker_owned", true), + ) + otadiag.StopUpdateWindow(reason) +} + // checkTransferTimeout enforces the idle-chunk watchdog. Fires once per // drain tick from the session run loop; cheap when no transfer is active. 
// On expiry both the local sink is aborted and an xfer_abort frame is sent @@ -183,14 +534,53 @@ func (s *session) checkTransferTimeout(now time.Time) { if !now.After(cur.deadline) { return } + if cur.pendingChunk != nil { + // Pending sink operations own their own deadlines. The worker reports a + // timeout event to the reactor, then aborts the sink after the in-flight + // sink method reaches a safe point. The reactor must not call Abort while + // WriteChunk may still be executing. The standard log still records a + // single slow-write marker so a stuck flash/verifier write is distinguishable + // from a missing peer chunk. + if !cur.pendingChunk.warned && now.After(cur.pendingChunk.started.Add(s.cfg.PhaseTimeout)) { + cur.pendingChunk.warned = true + otadiag.Event("[fabric-xfer]", "sink_write_slow", cur.meta.ID, + otadiag.KV("offset", cur.pendingChunk.offset), + otadiag.KV("next", cur.bytesWritten), + otadiag.KV("elapsed_ms", int(now.Sub(cur.pendingChunk.started)/time.Millisecond)), + ) + } + return + } + if cur.pendingCommit != nil { + if !cur.pendingCommit.warned && now.After(cur.pendingCommit.started.Add(s.cfg.TargetCallTimeout)) { + cur.pendingCommit.warned = true + otadiag.Event("[fabric-xfer]", "commit_slow", cur.meta.ID, + otadiag.KV("next", cur.bytesWritten), + otadiag.KV("chunks", cur.chunksSeen), + otadiag.KV("elapsed_ms", int(now.Sub(cur.pendingCommit.started)/time.Millisecond)), + ) + } + return + } if cur.idleRetries < transferIdleRetryLimit { + s.counters.TransferOffsetRetries++ cur.idleRetries++ cur.deadline = now.Add(s.cfg.PhaseTimeout) s.logKV("transfer idle retry", "offset", u32s(cur.bytesWritten)) + if xferProbeEnabled { + xferProbe("idle_retry", "id", cur.meta.ID, "next", cur.bytesWritten, "retry", int(cur.idleRetries)) + } s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) return } id := cur.meta.ID + otadiag.Event("[fabric-xfer]", "timeout", id, + otadiag.KV("next", cur.bytesWritten), + otadiag.KV("size", cur.meta.Size), + 
otadiag.KV("chunks", cur.chunksSeen), + otadiag.KV("idle_retries", int(cur.idleRetries)), + otadiag.KV("late_ms", int(now.Sub(cur.deadline)/time.Millisecond)), + ) s.abortTransfer("timeout") abortOK := s.sendTransferAbort(id, "timeout") otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", "timeout"), otadiag.KV("ok", abortOK)) @@ -213,7 +603,11 @@ func (s *session) retryCorruptTransferFrame(reason string) bool { otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) return false } + s.counters.TransferOffsetRetries++ cur.corruptRetriesAtOffset++ + if xferProbeEnabled { + xferProbe("corrupt_retry", "id", cur.meta.ID, "next", cur.bytesWritten, "retry", int(cur.corruptRetriesAtOffset), "reason", reason) + } needOK := s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) otadiag.Event( "[fabric-xfer]", "need_tx", cur.meta.ID, @@ -284,16 +678,20 @@ func validateTransferBegin(msg *protoXferBegin) (transferMeta, string) { } func (s *session) onTransferBegin(msg *protoXferBegin) { - s.extendTransferQuiet("xfer_begin_rx", transferPrepareQuiet) otadiag.SetActiveXfer(msg.XferID) - otadiag.Event( - "[fabric-xfer]", "begin_rx", msg.XferID, - otadiag.KV("target", msg.Target), - otadiag.KV("size", msg.Size), - otadiag.KV("digest_alg", msg.DigestAlg), - otadiag.KV("digest", msg.Digest), - otadiag.KV("meta_len", len(msg.Meta)), - ) + if fabricTraceEnabled { + println("[fabric-xfer]", "sid", s.localSID, "begin_enter", "xfer", msg.XferID, "target", msg.Target, "size", msg.Size, "digest", msg.Digest) + } + if fabricXferDiagEnabled("begin_rx") { + otadiag.Event( + "[fabric-xfer]", "begin_rx", msg.XferID, + otadiag.KV("target", msg.Target), + otadiag.KV("size", msg.Size), + otadiag.KV("digest_alg", msg.DigestAlg), + otadiag.KV("digest", msg.Digest), + otadiag.KV("meta_len", len(msg.Meta)), + ) + } meta, errStr := validateTransferBegin(msg) if errStr != "" { if msg.XferID != "" { @@ -314,10 +712,15 @@ func (s *session) 
onTransferBegin(msg *protoXferBegin) { s.logKV("xfer_begin dropped", "err", errStr) return } - otadiag.Event( - "[fabric-xfer]", "begin_validate_ok", meta.ID, - otadiag.KV("target", meta.Target), - ) + if fabricXferDiagEnabled("begin_validate_ok") { + otadiag.Event( + "[fabric-xfer]", "begin_validate_ok", meta.ID, + otadiag.KV("target", meta.Target), + ) + } + if xferProbeEnabled { + xferProbe("begin", "id", meta.ID, "size", meta.Size, "target", meta.Target, "digest", meta.Digest) + } s.markRx() now := time.Now() if s.incomingTransfer != nil { @@ -325,10 +728,14 @@ func (s *session) onTransferBegin(msg *protoXferBegin) { if sameTransferTuple(cur.meta, meta) { s.logKV("xfer_begin duplicate", "id", meta.ID) readyOK := s.sendTransferReady(meta.ID) - otadiag.Event("[fabric-xfer]", "ready_tx", meta.ID, otadiag.KV("ok", readyOK), otadiag.KV("duplicate", true)) + if fabricXferDiagEnabled("ready_tx") { + otadiag.Event("[fabric-xfer]", "ready_tx", meta.ID, otadiag.KV("ok", readyOK), otadiag.KV("duplicate", true)) + } if readyOK { needOK := s.sendTransferNeed(meta.ID, cur.bytesWritten) - otadiag.Event("[fabric-xfer]", "need_tx", meta.ID, otadiag.KV("next", cur.bytesWritten), otadiag.KV("ok", needOK), otadiag.KV("duplicate", true)) + if fabricXferDiagEnabled("need_tx") { + otadiag.Event("[fabric-xfer]", "need_tx", meta.ID, otadiag.KV("next", cur.bytesWritten), otadiag.KV("ok", needOK), otadiag.KV("duplicate", true)) + } } cur.deadline = now.Add(s.cfg.PhaseTimeout) return @@ -347,10 +754,24 @@ func (s *session) onTransferBegin(msg *protoXferBegin) { otadiag.StopUpdateWindow("begin_reject") return } + if s.pendingTargetCall != nil { + reason := "busy" + abortOK := s.sendTransferAbort(meta.ID, reason) + otadiag.Event( + "[fabric-xfer]", "begin_reject", meta.ID, + otadiag.KV("reason", reason), + otadiag.KV("pending_xfer", s.pendingTargetCall.xferID), + otadiag.KV("abort_tx", abortOK), + ) + otadiag.StopUpdateWindow("begin_reject") + return + } if done, ok := 
s.completedTransferFor(meta.ID); ok { if sameTransferTuple(done, meta) { doneOK := s.sendTransferDone(meta.ID) - otadiag.Event("[fabric-xfer]", "begin_duplicate_done", meta.ID, otadiag.KV("done_tx", doneOK)) + if fabricXferDiagEnabled("begin_duplicate_done") { + otadiag.Event("[fabric-xfer]", "begin_duplicate_done", meta.ID, otadiag.KV("done_tx", doneOK)) + } return } abortOK := s.sendTransferAbort(meta.ID, "conflicting_transfer") @@ -360,16 +781,26 @@ func (s *session) onTransferBegin(msg *protoXferBegin) { } beginFn := s.beginTransfer if beginFn == nil { - beginFn = beginTransfer + beginFn = func(meta transferMeta) (transferSink, error) { + return beginUpdaterTransfer(s.stageController, meta) + } + } + if fabricTraceEnabled { + println("[fabric-xfer]", "sid", s.localSID, "begin_sink_start", "xfer", meta.ID, "target", meta.Target, "size", meta.Size) } beginStart := time.Now() - otadiag.Event( - "[fabric-xfer]", "begin_transfer_start", meta.ID, - otadiag.KV("target", meta.Target), - otadiag.KV("size", meta.Size), - ) + if fabricXferDiagEnabled("begin_transfer_start") { + otadiag.Event( + "[fabric-xfer]", "begin_transfer_start", meta.ID, + otadiag.KV("target", meta.Target), + otadiag.KV("size", meta.Size), + ) + } sink, err := beginFn(meta) if err != nil { + if fabricTraceEnabled { + println("[fabric-xfer]", "sid", s.localSID, "begin_sink_error", "xfer", meta.ID, "err", err.Error()) + } durMS := int(time.Since(beginStart) / time.Millisecond) abortOK := s.sendTransferAbort(meta.ID, err.Error()) otadiag.Event( @@ -381,26 +812,204 @@ func (s *session) onTransferBegin(msg *protoXferBegin) { otadiag.StopUpdateWindow("begin_transfer_error") return } - otadiag.Event( - "[fabric-xfer]", "begin_transfer_done", meta.ID, - otadiag.KV("dur_ms", int(time.Since(beginStart)/time.Millisecond)), - ) + if fabricXferDiagEnabled("begin_transfer_done") { + otadiag.Event( + "[fabric-xfer]", "begin_transfer_done", meta.ID, + otadiag.KV("dur_ms", 
int(time.Since(beginStart)/time.Millisecond)), + ) + } + if fabricTraceEnabled { + println("[fabric-xfer]", "sid", s.localSID, "begin_sink_ok", "xfer", meta.ID) + } + s.counters.TransferBegins++ s.incomingTransfer = &incomingTransfer{ - meta: meta, - sink: sink, - hasher: xxhash.New(0), - deadline: now.Add(s.cfg.PhaseTimeout), + meta: meta, + worker: newTransferSinkWorker(meta.ID, sink), + hasher: xxhash.New(0), + nextProgressLogAt: transferProgressLogStep, + deadline: now.Add(s.cfg.PhaseTimeout), + } + if xferProbeEnabled { + xferProbe("begin_ok", "id", meta.ID, "next", uint32(0)) } readyOK := s.sendTransferReady(meta.ID) - otadiag.Event("[fabric-xfer]", "ready_tx", meta.ID, otadiag.KV("ok", readyOK)) + if fabricXferDiagEnabled("ready_tx") { + otadiag.Event("[fabric-xfer]", "ready_tx", meta.ID, otadiag.KV("ok", readyOK)) + } if readyOK { needOK := s.sendTransferNeed(meta.ID, 0) - otadiag.Event("[fabric-xfer]", "need_tx", meta.ID, otadiag.KV("next", 0), otadiag.KV("ok", needOK)) + if fabricXferDiagEnabled("need_tx") { + otadiag.Event("[fabric-xfer]", "need_tx", meta.ID, otadiag.KV("next", 0), otadiag.KV("ok", needOK)) + } } else { - otadiag.Event("[fabric-xfer]", "need_tx", meta.ID, otadiag.KV("next", 0), otadiag.KV("ok", false), otadiag.KV("skipped", "ready_failed")) + if fabricXferDiagEnabled("need_tx") { + otadiag.Event("[fabric-xfer]", "need_tx", meta.ID, otadiag.KV("next", 0), otadiag.KV("ok", false), otadiag.KV("skipped", "ready_failed")) + } + } +} + +func (s *session) startPendingChunkWrite(cur *incomingTransfer, offset uint32, raw []byte) { + if xferProbeEnabled { + xferProbe("write_start", "id", cur.meta.ID, "offset", offset, "len", len(raw)) + } + if fabricTraceEnabled { + println("[fabric-xfer]", "sid", s.localSID, "chunk_worker_start", "xfer", cur.meta.ID, "offset", offset, "len", len(raw)) + } + ch := make(chan transferChunkResult, 1) + data := raw + started := time.Now() + cur.pendingChunk = &pendingChunkWrite{ + xferID: cur.meta.ID, + offset: 
offset, + data: data, + started: started, + resultCh: ch, + } + if cur.worker == nil || !cur.worker.write(cur.meta.ID, offset, data, s.cfg.PhaseTimeout, ch) { + ch <- transferChunkResult{err: errors.New("transfer_worker_busy")} + } +} + +func (s *session) finishChunkWrite(now time.Time, res transferChunkResult) { + if fabricTraceEnabled { + errText := "" + if res.err != nil { + errText = res.err.Error() + } + println("[fabric-xfer]", "sid", s.localSID, "chunk_worker_done", "err", errText) + } + cur := s.incomingTransfer + if cur == nil || cur.pendingChunk == nil { + return + } + pending := cur.pendingChunk + cur.pendingChunk = nil + writeMS := int(time.Since(pending.started) / time.Millisecond) + if xferProbeEnabled { + errText := "" + if res.err != nil { + errText = res.err.Error() + } + xferProbe("write_done", "id", pending.xferID, "offset", pending.offset, "dur_ms", writeMS, "err", errText) + } + if res.err != nil { + reason := res.err.Error() + otadiag.Event( + "[fabric-xfer]", "sink_write_error", pending.xferID, + otadiag.KV("reason", reason), + otadiag.KV("dur_ms", writeMS), + ) + s.logKV("transfer write failed", "err", reason) + s.clearTransferAfterWorkerFailure(reason) + abortOK := s.sendTransferAbort(pending.xferID, reason) + otadiag.Event("[fabric-xfer]", "abort_tx", pending.xferID, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) + return + } + _, _ = cur.hasher.Write(pending.data) + cur.bytesWritten += uint32(len(pending.data)) + cur.chunksSeen++ + s.counters.TransferChunks++ + s.counters.TransferBytes += uint64(len(pending.data)) + cur.idleRetries = 0 + cur.corruptRetryOffset = cur.bytesWritten + cur.corruptRetriesAtOffset = 0 + cur.deadline = now.Add(s.cfg.PhaseTimeout) + if cur.shouldLogProgress() { + otadiag.Event( + "[fabric-xfer]", "progress", pending.xferID, + otadiag.KV("next", cur.bytesWritten), + otadiag.KV("size", cur.meta.Size), + otadiag.KV("chunks", cur.chunksSeen), + otadiag.KV("last_offset", pending.offset), + 
otadiag.KV("last_len", len(pending.data)), + otadiag.KV("write_ms", writeMS), + ) + } + if fabricXferDiagEnabled("sink_write_done") { + otadiag.Event( + "[fabric-xfer]", "sink_write_done", pending.xferID, + otadiag.KV("dur_ms", writeMS), + otadiag.KV("next", u32s(cur.bytesWritten)), + ) + } + pending.data = nil + if xferProbeEnabled { + xferProbe("need_after_write", "id", cur.meta.ID, "next", cur.bytesWritten) + } + needOK := s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) + if fabricXferDiagEnabled("need_tx") { + otadiag.Event( + "[fabric-xfer]", "need_tx", cur.meta.ID, + otadiag.KV("next", cur.bytesWritten), + otadiag.KV("ok", needOK), + otadiag.KV("accepted", true), + ) } } +func (s *session) startPendingTransferCommit(cur *incomingTransfer) { + if xferProbeEnabled { + xferProbe("commit_start", "id", cur.meta.ID, "bytes", cur.bytesWritten, "chunks", cur.chunksSeen) + } + ch := make(chan transferCommitResult, 1) + started := time.Now() + cur.pendingCommit = &pendingTransferCommit{ + xferID: cur.meta.ID, + started: started, + resultCh: ch, + } + if cur.worker == nil || !cur.worker.commit(cur.meta.ID, s.cfg.TargetCallTimeout, ch) { + ch <- transferCommitResult{err: errors.New("transfer_worker_busy")} + } +} + +func (s *session) finishTransferCommit(now time.Time, res transferCommitResult) { + if fabricTraceEnabled { + errText := "" + if res.err != nil { + errText = res.err.Error() + } + println("[fabric-xfer]", "sid", s.localSID, "commit_worker_done", "bytes", res.info.BytesWritten, "generation", res.info.Generation, "err", errText) + } + cur := s.incomingTransfer + if cur == nil || cur.pendingCommit == nil { + return + } + pending := cur.pendingCommit + cur.pendingCommit = nil + if xferProbeEnabled { + errText := "" + if res.err != nil { + errText = res.err.Error() + } + xferProbe("commit_done", "id", pending.xferID, "dur_ms", int(time.Since(pending.started)/time.Millisecond), "bytes", res.info.BytesWritten, "generation", res.info.Generation, "err", errText) + } 
+ if res.err != nil { + reason := res.err.Error() + s.logKV("transfer commit failed", "err", reason) + s.clearTransferAfterWorkerFailure(reason) + abortOK := s.sendTransferAbort(pending.xferID, reason) + otadiag.Event("[fabric-xfer]", "abort_tx", pending.xferID, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) + return + } + meta := cur.meta + s.clearTransfer() + if fabricXferDiagEnabled("transfer_commit_done") { + otadiag.Event( + "[fabric-xfer]", "transfer_commit_done", pending.xferID, + otadiag.KV("dur_ms", int(time.Since(pending.started)/time.Millisecond)), + otadiag.KV("bytes", res.info.BytesWritten), + otadiag.KV("generation", res.info.Generation), + ) + } + if reason := s.startTransferTargetCall(meta, pending.xferID, res.info); reason != "" { + res.info.cancelStage(reason) + abortOK := s.sendTransferAbort(pending.xferID, reason) + otadiag.Event("[fabric-xfer]", "abort_tx", pending.xferID, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) + } + _ = now +} + func (s *session) onTransferChunk(msg *protoXferChunk) { cur := s.incomingTransfer if cur == nil || cur.meta.ID != msg.XferID { @@ -408,53 +1017,114 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { return } id := cur.meta.ID - otadiag.Event( - "[fabric-xfer]", "chunk_rx", id, - otadiag.KV("offset", u32s(msg.Offset)), - otadiag.KV("expected", u32s(cur.bytesWritten)), - otadiag.KV("encoded_len", strconvx.Itoa(len(msg.Data))), - ) - if msg.Offset < cur.bytesWritten { + if xferProbeEnabled { + xferProbe("chunk_rx", "id", id, "offset", msg.Offset, "expected", cur.bytesWritten, "encoded_len", len(msg.Data)) + } + if cur.pendingChunk != nil { + if xferProbeEnabled { + xferProbe("chunk_drop_write_pending", "id", id, "offset", msg.Offset, "expected", cur.bytesWritten, "pending", cur.pendingChunk.offset) + } s.markRx() - needOK := s.sendTransferNeed(id, cur.bytesWritten) + if fabricXferDiagEnabled("chunk_while_write_pending") { + otadiag.Event( + "[fabric-xfer]", 
"chunk_while_write_pending", id, + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("pending_offset", u32s(cur.pendingChunk.offset)), + otadiag.KV("expected", u32s(cur.bytesWritten)), + ) + } + return + } + if cur.pendingCommit != nil { + if xferProbeEnabled { + xferProbe("chunk_drop_commit_pending", "id", id, "offset", msg.Offset, "expected", cur.bytesWritten) + } + s.markRx() + if fabricXferDiagEnabled("chunk_while_commit_pending") { + otadiag.Event( + "[fabric-xfer]", "chunk_while_commit_pending", id, + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("expected", u32s(cur.bytesWritten)), + ) + } + return + } + if fabricXferDiagEnabled("chunk_rx") { otadiag.Event( - "[fabric-xfer]", "chunk_stale_offset", id, + "[fabric-xfer]", "chunk_rx", id, otadiag.KV("offset", u32s(msg.Offset)), otadiag.KV("expected", u32s(cur.bytesWritten)), - otadiag.KV("need_tx", needOK), + otadiag.KV("encoded_len", strconvx.Itoa(len(msg.Data))), ) + } + if msg.Offset < cur.bytesWritten { + if xferProbeEnabled { + xferProbe("chunk_stale", "id", id, "offset", msg.Offset, "expected", cur.bytesWritten) + } + s.markRx() + needOK := s.sendTransferNeed(id, cur.bytesWritten) + if fabricXferDiagEnabled("chunk_stale_offset") { + otadiag.Event( + "[fabric-xfer]", "chunk_stale_offset", id, + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("expected", u32s(cur.bytesWritten)), + otadiag.KV("need_tx", needOK), + ) + } return } if msg.Offset > cur.bytesWritten { + if xferProbeEnabled { + xferProbe("chunk_future", "id", id, "offset", msg.Offset, "expected", cur.bytesWritten) + } s.markRx() needOK := s.sendTransferNeed(id, cur.bytesWritten) - otadiag.Event( - "[fabric-xfer]", "chunk_future_offset", id, - otadiag.KV("offset", u32s(msg.Offset)), - otadiag.KV("expected", u32s(cur.bytesWritten)), - otadiag.KV("need_tx", needOK), - ) + if fabricXferDiagEnabled("chunk_future_offset") { + otadiag.Event( + "[fabric-xfer]", "chunk_future_offset", id, + otadiag.KV("offset", u32s(msg.Offset)), + 
otadiag.KV("expected", u32s(cur.bytesWritten)), + otadiag.KV("need_tx", needOK), + ) + } return } decodeStart := time.Now() - raw, errStr := decodeChunkData(msg.Data) + raw, errStr := s.decodeChunkData(msg.Data) if errStr != "" { + if xferProbeEnabled { + xferProbe("chunk_decode_error", "id", id, "offset", msg.Offset, "reason", errStr, "encoded_len", len(msg.Data), "line_len", msg.LineLen) + } + s.counters.TransferDecodeErrors++ otadiag.Event( - "[fabric-xfer]", "chunk_decode_done", id, - otadiag.KV("ok", false), + "[fabric-xfer]", "chunk_reject", id, otadiag.KV("reason", errStr), + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("expected", u32s(cur.bytesWritten)), + otadiag.KV("encoded_len", strconvx.Itoa(len(msg.Data))), + otadiag.KV("line_len", strconvx.Itoa(msg.LineLen)), otadiag.KV("dur_ms", int(time.Since(decodeStart)/time.Millisecond)), ) + if fabricXferDiagEnabled("chunk_decode_done") { + otadiag.Event( + "[fabric-xfer]", "chunk_decode_done", id, + otadiag.KV("ok", false), + otadiag.KV("reason", errStr), + otadiag.KV("dur_ms", int(time.Since(decodeStart)/time.Millisecond)), + ) + } s.logKV("xfer_chunk decode retry", "err", errStr) s.retryCorruptTransferFrame(errStr) return } - otadiag.Event( - "[fabric-xfer]", "chunk_decode_done", id, - otadiag.KV("ok", true), - otadiag.KV("raw_len", strconvx.Itoa(len(raw))), - otadiag.KV("dur_ms", int(time.Since(decodeStart)/time.Millisecond)), - ) + if fabricXferDiagEnabled("chunk_decode_done") { + otadiag.Event( + "[fabric-xfer]", "chunk_decode_done", id, + otadiag.KV("ok", true), + otadiag.KV("raw_len", strconvx.Itoa(len(raw))), + otadiag.KV("dur_ms", int(time.Since(decodeStart)/time.Millisecond)), + ) + } if len(raw) == 0 { s.abortTransfer("empty_chunk") abortOK := s.sendTransferAbort(id, "empty_chunk") @@ -463,12 +1133,14 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { } if cur.bytesWritten+uint32(len(raw)) > cur.meta.Size { reason := "size_too_large" - otadiag.Event( - "[fabric-xfer]", 
"chunk_size_overflow", id, - otadiag.KV("offset", u32s(msg.Offset)), - otadiag.KV("raw_len", strconvx.Itoa(len(raw))), - otadiag.KV("size", u32s(cur.meta.Size)), - ) + if fabricXferDiagEnabled("chunk_size_overflow") { + otadiag.Event( + "[fabric-xfer]", "chunk_size_overflow", id, + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("raw_len", strconvx.Itoa(len(raw))), + otadiag.KV("size", u32s(cur.meta.Size)), + ) + } s.abortTransfer(reason) abortOK := s.sendTransferAbort(id, reason) otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) @@ -482,94 +1154,90 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { digestStart := time.Now() want, ok := canonicalXXHash32Hex(msg.ChunkDigest) if !ok { + if xferProbeEnabled { + xferProbe("chunk_digest_error", "id", id, "offset", msg.Offset, "reason", "bad_message", "digest_len", len(msg.ChunkDigest), "data_len", len(msg.Data), "line_len", msg.LineLen) + } + s.counters.TransferDigestErrors++ otadiag.Event( - "[fabric-xfer]", "chunk_digest_done", id, - otadiag.KV("ok", false), + "[fabric-xfer]", "chunk_reject", id, otadiag.KV("reason", "bad_message"), otadiag.KV("offset", u32s(msg.Offset)), - otadiag.KV("digest_len", strconvx.Itoa(len(msg.ChunkDigest))), - otadiag.KV("data_len", strconvx.Itoa(len(msg.Data))), + otadiag.KV("expected", u32s(cur.bytesWritten)), + otadiag.KV("raw_len", strconvx.Itoa(len(raw))), + otadiag.KV("encoded_len", strconvx.Itoa(len(msg.Data))), + otadiag.KV("line_len", strconvx.Itoa(msg.LineLen)), + otadiag.KV("want", msg.ChunkDigest), otadiag.KV("dur_ms", int(time.Since(digestStart)/time.Millisecond)), ) + if fabricXferDiagEnabled("chunk_digest_done") { + otadiag.Event( + "[fabric-xfer]", "chunk_digest_done", id, + otadiag.KV("ok", false), + otadiag.KV("reason", "bad_message"), + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("digest_len", strconvx.Itoa(len(msg.ChunkDigest))), + otadiag.KV("data_len", strconvx.Itoa(len(msg.Data))), + 
otadiag.KV("dur_ms", int(time.Since(digestStart)/time.Millisecond)), + ) + } s.retryCorruptTransferFrame("bad_message") return } got := xxhashHex(xxhash.Sum32(raw, 0)) + if xferProbeEnabled && msg.Offset == 0 { + xferProbe("chunk_digest_first", "id", id, "offset", msg.Offset, "raw_len", len(raw), "encoded_len", len(msg.Data), "got", got, "want", want, "raw_prefix16", byteHexPrefix(raw, 16), "data_prefix32", stringPrefix(msg.Data, 32)) + } if got != want { + if xferProbeEnabled { + xferProbe("chunk_digest_error", "id", id, "offset", msg.Offset, "reason", "mismatch", "raw_len", len(raw), "encoded_len", len(msg.Data), "line_len", msg.LineLen, "got", got, "want", want, "raw_prefix16", byteHexPrefix(raw, 16), "data_prefix32", stringPrefix(msg.Data, 32)) + } + s.counters.TransferDigestErrors++ otadiag.Event( - "[fabric-xfer]", "chunk_digest_done", id, - otadiag.KV("ok", false), + "[fabric-xfer]", "chunk_reject", id, otadiag.KV("reason", "chunk_digest_mismatch"), otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("expected", u32s(cur.bytesWritten)), + otadiag.KV("raw_len", strconvx.Itoa(len(raw))), + otadiag.KV("encoded_len", strconvx.Itoa(len(msg.Data))), + otadiag.KV("line_len", strconvx.Itoa(msg.LineLen)), + otadiag.KV("want", want), + otadiag.KV("got", got), otadiag.KV("dur_ms", int(time.Since(digestStart)/time.Millisecond)), ) + if fabricXferDiagEnabled("chunk_digest_done") { + otadiag.Event( + "[fabric-xfer]", "chunk_digest_done", id, + otadiag.KV("ok", false), + otadiag.KV("reason", "chunk_digest_mismatch"), + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("dur_ms", int(time.Since(digestStart)/time.Millisecond)), + ) + } s.retryCorruptTransferFrame("chunk_digest_mismatch") return } - otadiag.Event( - "[fabric-xfer]", "chunk_digest_done", id, - otadiag.KV("ok", true), - otadiag.KV("dur_ms", int(time.Since(digestStart)/time.Millisecond)), - ) + if fabricXferDiagEnabled("chunk_digest_done") { + otadiag.Event( + "[fabric-xfer]", "chunk_digest_done", id, + 
otadiag.KV("ok", true), + otadiag.KV("dur_ms", int(time.Since(digestStart)/time.Millisecond)), + ) + } s.markRx() writeStart := time.Now() - otadiag.Event( - "[fabric-xfer]", "sink_write_start", id, - otadiag.KV("offset", u32s(msg.Offset)), - otadiag.KV("raw_len", strconvx.Itoa(len(raw))), - ) - if err := cur.sink.WriteChunk(msg.Offset, raw); err != nil { - reason := err.Error() + if fabricXferDiagEnabled("sink_write_start") { otadiag.Event( - "[fabric-xfer]", "sink_write_error", id, - otadiag.KV("reason", reason), - otadiag.KV("dur_ms", int(time.Since(writeStart)/time.Millisecond)), + "[fabric-xfer]", "sink_write_start", id, + otadiag.KV("offset", u32s(msg.Offset)), + otadiag.KV("raw_len", strconvx.Itoa(len(raw))), ) - s.logKV("transfer write failed", "err", reason) - s.abortTransfer(reason) - abortOK := s.sendTransferAbort(id, reason) - otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) - return } - _, _ = cur.hasher.Write(raw) - cur.bytesWritten += uint32(len(raw)) - cur.chunksSeen++ - cur.idleRetries = 0 - cur.corruptRetryOffset = cur.bytesWritten - cur.corruptRetriesAtOffset = 0 - cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) - otadiag.Event( - "[fabric-xfer]", "sink_write_done", id, - otadiag.KV("dur_ms", int(time.Since(writeStart)/time.Millisecond)), - otadiag.KV("next", u32s(cur.bytesWritten)), - ) - raw = nil - // Keep transfer memory bounded on TinyGo. The receiver allocates while - // unmarshalling JSON and decoding base64 chunks; without regular collection - // long updates can run out of heap before commit. 
- gcStart := time.Now() - otadiag.Event("[fabric-xfer]", "gc_start", id, otadiag.KV("next", u32s(cur.bytesWritten))) - runtime.GC() - otadiag.Event( - "[fabric-xfer]", "gc_done", id, - otadiag.KV("dur_ms", int(time.Since(gcStart)/time.Millisecond)), - otadiag.KV("next", u32s(cur.bytesWritten)), - ) - needOK := s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) - otadiag.Event( - "[fabric-xfer]", "need_tx", cur.meta.ID, - otadiag.KV("next", cur.bytesWritten), - otadiag.KV("ok", needOK), - otadiag.KV("accepted", true), - ) - if cur.bytesWritten != 0 && cur.bytesWritten%transferMemSampleStride == 0 { - var ms runtime.MemStats - runtime.ReadMemStats(&ms) + s.startPendingChunkWrite(cur, msg.Offset, raw) + if fabricXferDiagEnabled("sink_write_pending") { otadiag.Event( - "[fabric-xfer]", "transfer_mem_sample", cur.meta.ID, - otadiag.KV("next", cur.bytesWritten), - otadiag.KV("alloc", ms.Alloc), - otadiag.KV("heap", ms.HeapSys), + "[fabric-xfer]", "sink_write_pending", id, + otadiag.KV("dur_ms", int(time.Since(writeStart)/time.Millisecond)), + otadiag.KV("offset", u32s(msg.Offset)), ) } } @@ -581,6 +1249,20 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { return } id := cur.meta.ID + if xferProbeEnabled { + xferProbe("commit_rx", "id", id, "bytes", cur.bytesWritten, "size", msg.Size, "digest", msg.Digest) + } + if cur.pendingChunk != nil { + s.markRx() + if fabricXferDiagEnabled("commit_while_write_pending") { + otadiag.Event( + "[fabric-xfer]", "commit_while_write_pending", id, + otadiag.KV("expected", u32s(cur.bytesWritten)), + otadiag.KV("pending_offset", u32s(cur.pendingChunk.offset)), + ) + } + return + } if msg.Size != cur.meta.Size || cur.bytesWritten != cur.meta.Size { reason := "short_transfer" s.abortTransfer(reason) @@ -609,44 +1291,22 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { return } s.markRx() - info, err := cur.sink.Commit() - if err != nil { - s.logKV("transfer commit failed", "err", err.Error()) - reason := err.Error() - 
s.abortTransfer(reason) - abortOK := s.sendTransferAbort(id, reason) - otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) - return + if fabricXferDiagEnabled("transfer_commit_start") { + otadiag.Event("[fabric-xfer]", "transfer_commit_start", id) } - sink := cur.sink - meta := cur.meta - s.extendTransferQuiet("xfer_commit_target", transferCompleteQuiet) - s.clearTransfer() - - bytesPayload := sink.Bytes() - ok, reason := s.invokeTransferTarget(meta, id, info, bytesPayload) - if !ok { - s.extendTransferQuiet("xfer_target_rejected", transferCompleteQuiet) - abortOK := s.sendTransferAbort(id, reason) - otadiag.Event("[fabric-xfer]", "abort_tx", id, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) - return - } - s.extendTransferQuiet("xfer_done", transferCompleteQuiet) - s.recordCompletedTransfer(meta) - doneOK := s.sendTransferDone(id) - otadiag.Event("[fabric-xfer]", "done_tx", id, otadiag.KV("ok", doneOK)) - otadiag.StopUpdateWindow("transfer_done") + s.startPendingTransferCommit(cur) } -var targetCallTimeout = 5 * time.Second - -// invokeTransferTarget calls the local updater staging RPC named by -// xfer_begin.target. The wire no longer carries raw/member receiver topics; -// target="updater/main" maps to an internal bus RPC owned by the updater -// service. The reply gates whether fabric sends xfer_done or xfer_abort. -func (s *session) invokeTransferTarget(meta transferMeta, xferID string, info transferInfo, artefact []byte) (bool, string) { +// startTransferTargetCall invokes the local updater/main staging RPC without +// blocking the Fabric session reactor. The reply channel is selected directly +// by session.run, so completion wakes the reactor without waiting for a +// periodic tick. 
+func (s *session) startTransferTargetCall(meta transferMeta, xferID string, info transferInfo) string { if meta.Target != transferTargetUpdaterMain { - return false, "unsupported_target" + return "unsupported_target" + } + if s.pendingTargetCall != nil { + return "busy" } payload := updater.StagePayload{ LinkID: s.linkID, @@ -657,27 +1317,85 @@ func (s *session) invokeTransferTarget(meta transferMeta, xferID string, info tr DigestAlg: meta.DigestAlg, Digest: meta.Digest, Meta: meta.Meta, - Artefact: artefact, } msg := s.conn.NewMessage(updater.TopicStageRPC, payload, false) - replySub := s.conn.Request(msg) - defer s.conn.Unsubscribe(replySub) + s.pendingTargetCall = &pendingTargetCall{ + xferID: xferID, + meta: meta, + info: info, + sub: s.conn.Request(msg), + deadline: time.Now().Add(s.cfg.TargetCallTimeout), + } + if fabricXferDiagEnabled("target_call_start") { + otadiag.Event("[fabric-xfer]", "target_call_start", xferID, + otadiag.KV("timeout_ms", int(s.cfg.TargetCallTimeout/time.Millisecond)), + ) + } + return "" +} - select { - case rep, ok := <-replySub.Channel(): - if !ok || rep == nil { - updater.CancelStreamedStage(xferID, info.Generation, "stage_no_reply") - return false, "stage_no_reply" - } - ok, reason := decodeStageReply(rep.Payload) - if !ok { - updater.CancelStreamedStage(xferID, info.Generation, reason) - return false, reason +func (s *session) finishTargetCall(call *pendingTargetCall, ok bool, reason string) { + if call == nil { + return + } + if call.sub != nil { + s.conn.Unsubscribe(call.sub) + call.sub = nil + } + s.pendingTargetCall = nil + if ok { + s.counters.TransferCompletions++ + s.recordCompletedTransfer(call.meta) + doneOK := s.sendTransferDone(call.xferID) + if fabricXferDiagEnabled("done_tx") { + otadiag.Event("[fabric-xfer]", "done_tx", call.xferID, otadiag.KV("ok", doneOK)) } - return true, "" - case <-time.After(targetCallTimeout): - updater.CancelStreamedStage(xferID, info.Generation, "stage_timeout") - return false, 
"stage_timeout" + otadiag.StopUpdateWindow("transfer_done") + return + } + if reason == "" { + reason = "stage_rejected" + } + call.info.cancelStage(reason) + abortOK := s.sendTransferAbort(call.xferID, reason) + otadiag.Event("[fabric-xfer]", "abort_tx", call.xferID, otadiag.KV("reason", reason), otadiag.KV("ok", abortOK)) +} + +func (s *session) finishTargetReply(rep *bus.Message, ok bool) { + call := s.pendingTargetCall + if call == nil { + return + } + if !ok || rep == nil { + s.finishTargetCall(call, false, "stage_no_reply") + return + } + okReply, reason := decodeStageReply(rep.Payload) + otadiag.Event("[fabric-xfer]", "target_call_reply", call.xferID, + otadiag.KV("ok", okReply), + otadiag.KV("reason", reason), + otadiag.KV("bytes", call.info.BytesWritten), + otadiag.KV("generation", call.info.Generation), + ) + s.finishTargetCall(call, okReply, reason) +} + +func (s *session) cancelTargetCall(reason string) { + call := s.pendingTargetCall + if call == nil { + return + } + if reason == "" { + reason = reasonLinkDown + } + if call.sub != nil { + s.conn.Unsubscribe(call.sub) + call.sub = nil + } + s.pendingTargetCall = nil + call.info.cancelStage(reason) + if fabricXferDiagEnabled("target_call_cancel") { + otadiag.Event("[fabric-xfer]", "target_call_cancel", call.xferID, otadiag.KV("reason", reason)) } } @@ -761,3 +1479,30 @@ func xxhashHex(v uint32) string { } return string(buf[:]) } + +func byteHexPrefix(b []byte, n int) string { + if n < 0 { + n = 0 + } + if n > len(b) { + n = len(b) + } + const digits = "0123456789abcdef" + out := make([]byte, n*2) + for i := 0; i < n; i++ { + v := b[i] + out[i*2] = digits[v>>4] + out[i*2+1] = digits[v&0xf] + } + return string(out) +} + +func stringPrefix(s string, n int) string { + if n < 0 { + n = 0 + } + if n > len(s) { + n = len(s) + } + return s[:n] +} diff --git a/services/fabric/transfer_sink.go b/services/fabric/transfer_sink.go new file mode 100644 index 0000000..943f417 --- /dev/null +++ 
b/services/fabric/transfer_sink.go @@ -0,0 +1,94 @@ +package fabric + +import "errors" + +// streamedStageSink is the updater/main transfer sink. It keeps Fabric on the +// transfer-protocol side of the boundary: all update ownership goes through the +// explicit StageController supplied by the caller. +type streamedStageSink struct { + controller StageController + xferID string + generation uint64 + accepted uint32 + closed bool +} + +func beginUpdaterTransfer(controller StageController, meta transferMeta) (transferSink, error) { + if fabricTraceEnabled { + println("[fabric-sink]", "begin", "xfer", meta.ID, "target", meta.Target, "size", meta.Size) + } + if controller == nil { + return nil, errors.New("updater_stage_controller_missing") + } + generation, err := controller.BeginStreamedStage(meta.ID, meta.Size) + if err != nil { + if fabricTraceEnabled { + println("[fabric-sink]", "begin_error", "xfer", meta.ID, "err", err.Error()) + } + return nil, err + } + if fabricTraceEnabled { + println("[fabric-sink]", "begin_ok", "xfer", meta.ID, "generation", generation) + } + return &streamedStageSink{controller: controller, xferID: meta.ID, generation: generation}, nil +} + +func (s *streamedStageSink) WriteChunk(off uint32, data []byte) error { + if fabricTraceEnabled { + println("[fabric-sink]", "write", "xfer", s.xferID, "generation", s.generation, "offset", off, "len", len(data)) + } + if s.closed { + return errors.New("sink_closed") + } + if s.accepted != off { + return errors.New("unexpected_offset") + } + if err := s.controller.WriteStreamedStage(s.xferID, s.generation, data); err != nil { + if fabricTraceEnabled { + println("[fabric-sink]", "write_error", "xfer", s.xferID, "err", err.Error()) + } + return err + } + s.accepted += uint32(len(data)) + if fabricTraceEnabled { + println("[fabric-sink]", "write_ok", "xfer", s.xferID, "accepted", s.accepted) + } + return nil +} + +func (s *streamedStageSink) Commit() (transferInfo, error) { + if fabricTraceEnabled { + 
println("[fabric-sink]", "commit", "xfer", s.xferID, "generation", s.generation) + } + if s.closed { + return transferInfo{}, errors.New("sink_closed") + } + written, err := s.controller.CommitStreamedStage(s.xferID, s.generation) + if err != nil { + if fabricTraceEnabled { + println("[fabric-sink]", "commit_error", "xfer", s.xferID, "err", err.Error()) + } + return transferInfo{}, err + } + if fabricTraceEnabled { + println("[fabric-sink]", "commit_ok", "xfer", s.xferID, "written", written) + } + s.closed = true + return transferInfo{ + BytesWritten: written, + Generation: s.generation, + cancel: s.cancelAfterCommit, + }, nil +} + +func (s *streamedStageSink) Apply() error { return nil } + +func (s *streamedStageSink) Abort(reason string) error { + s.controller.AbortStreamedStage(s.xferID, s.generation, reason) + s.closed = true + return nil +} + +func (s *streamedStageSink) cancelAfterCommit(reason string) { + s.controller.CancelStreamedStage(s.xferID, s.generation, reason) +} diff --git a/services/fabric/transfer_sink_buffer.go b/services/fabric/transfer_sink_buffer.go deleted file mode 100644 index c012c66..0000000 --- a/services/fabric/transfer_sink_buffer.go +++ /dev/null @@ -1,95 +0,0 @@ -package fabric - -import ( - "errors" - - "devicecode-go/services/updater" -) - -// bufferSink is the default in-memory transferSink: it buffers the -// verified-by-wire (xxHash32) artefact in RAM and exposes the bytes via -// Bytes() so onTransferCommit can hand them to the updater/main staging -// RPC. The updater is responsible for signed-image verification and staging. -// -// Size cap is deliberately conservative: the smoke tests target small -// artefacts and large firmware images need a streaming-into-flash sink. -// Hitting the cap aborts the transfer cleanly via WriteChunk -> -// ErrArtefactTooLarge. 
-const maxArtefactBytes = 64 * 1024 - -var ErrArtefactTooLarge = errors.New("artefact_too_large") - -type bufferSink struct { - meta transferMeta - generation uint64 - buf []byte - closed bool - committed bool -} - -func newBufferSink(meta transferMeta) (*bufferSink, error) { - generation, err := updater.BeginStreamedStage(meta.ID, meta.Size) - if err != nil { - return nil, err - } - return &bufferSink{ - meta: meta, - generation: generation, - buf: make([]byte, 0, sizeHint(meta.Size)), - }, nil -} - -func sizeHint(announced uint32) int { - if announced == 0 || announced > maxArtefactBytes { - return maxArtefactBytes - } - return int(announced) -} - -func (s *bufferSink) WriteChunk(off uint32, data []byte) error { - if s.closed { - return errors.New("sink_closed") - } - if int(off) != len(s.buf) { - return errors.New("unexpected_offset") - } - if len(s.buf)+len(data) > maxArtefactBytes { - return ErrArtefactTooLarge - } - s.buf = append(s.buf, data...) - return nil -} - -func (s *bufferSink) Commit() (transferInfo, error) { - if s.closed { - return transferInfo{}, errors.New("sink_closed") - } - if s.generation != 0 { - if err := updater.CommitBufferedStage(s.meta.ID, s.generation); err != nil { - return transferInfo{}, err - } - } - s.committed = true - return transferInfo{BytesWritten: uint32(len(s.buf)), Generation: s.generation}, nil -} - -// Apply is a no-op for the buffer sink — the staged-image apply -// (slot switch + reboot) belongs to the updater's commit RPC, not to -// fabric's transfer state machine. 
-func (s *bufferSink) Apply() error { return nil } - -func (s *bufferSink) Abort(reason string) error { - if s.generation != 0 { - updater.AbortStreamedStage(s.meta.ID, s.generation, reason) - } - s.buf = nil - s.closed = true - return nil -} - -func (s *bufferSink) Bytes() []byte { - if !s.committed { - return nil - } - return s.buf -} diff --git a/services/fabric/transfer_sink_rp2350.go b/services/fabric/transfer_sink_rp2350.go deleted file mode 100644 index 56ba8de..0000000 --- a/services/fabric/transfer_sink_rp2350.go +++ /dev/null @@ -1,64 +0,0 @@ -//go:build tinygo && rp2350 - -package fabric - -import ( - "errors" - - "devicecode-go/services/updater" -) - -type streamedStageSink struct { - xferID string - generation uint64 - accepted uint32 - closed bool -} - -func beginTransfer(meta transferMeta) (transferSink, error) { - generation, err := updater.BeginStreamedStage(meta.ID, meta.Size) - if err != nil { - return nil, err - } - return &streamedStageSink{xferID: meta.ID, generation: generation}, nil -} - -func (s *streamedStageSink) WriteChunk(off uint32, data []byte) error { - if s.closed { - return errors.New("sink_closed") - } - if s.accepted != off { - return errors.New("unexpected_offset") - } - if err := updater.WriteStreamedStage(s.xferID, s.generation, data); err != nil { - return err - } - s.accepted += uint32(len(data)) - return nil -} - -func (s *streamedStageSink) Commit() (transferInfo, error) { - if s.closed { - return transferInfo{}, errors.New("sink_closed") - } - written, err := updater.CommitStreamedStage(s.xferID, s.generation) - if err != nil { - return transferInfo{}, err - } - s.closed = true - return transferInfo{BytesWritten: written, Generation: s.generation}, nil -} - -func (s *streamedStageSink) Apply() error { return nil } - -func (s *streamedStageSink) Abort(reason string) error { - updater.AbortStreamedStage(s.xferID, s.generation, reason) - s.closed = true - return nil -} - -// Bytes returns nil because the TinyGo RP2350 
default path verifies the signed -// container while streaming and writes only the authenticated payload into the -// inactive slot. fabric still calls updater/main staging; the updater consumes -// the verified staged descriptor instead of an in-RAM artefact. -func (s *streamedStageSink) Bytes() []byte { return nil } diff --git a/services/fabric/transfer_sink_stub.go b/services/fabric/transfer_sink_stub.go deleted file mode 100644 index 9554cff..0000000 --- a/services/fabric/transfer_sink_stub.go +++ /dev/null @@ -1,11 +0,0 @@ -//go:build !(tinygo && rp2350) - -// Host build (tests, dev tooling): same buffer-sink behaviour as the -// default RP2350 build. Lets unit tests exercise updater/main staging -// without firmware stubs in the way. - -package fabric - -func beginTransfer(meta transferMeta) (transferSink, error) { - return newBufferSink(meta) -} diff --git a/services/fabric/transfer_test.go b/services/fabric/transfer_test.go index 53daa2d..fe355f4 100644 --- a/services/fabric/transfer_test.go +++ b/services/fabric/transfer_test.go @@ -18,18 +18,27 @@ import ( ) type fakeTransferSink struct { - offs []uint32 - writes [][]byte - writeErr error - commitErr error - applyErr error - commitInfo transferInfo - committed bool - applied bool - abortReasons []string + offs []uint32 + writes [][]byte + writeErr error + writeEntered chan struct{} + writeEnterOnce sync.Once + writeRelease chan struct{} + commitErr error + applyErr error + commitInfo transferInfo + committed bool + applied bool + abortReasons []string } func (s *fakeTransferSink) WriteChunk(off uint32, data []byte) error { + if s.writeEntered != nil { + s.writeEnterOnce.Do(func() { close(s.writeEntered) }) + } + if s.writeRelease != nil { + <-s.writeRelease + } if s.writeErr != nil { return s.writeErr } @@ -56,9 +65,22 @@ func (s *fakeTransferSink) Abort(reason string) error { return nil } -// Bytes returns nil because the test fake doesn't retain a RAM copy -// of the transferred bytes — it tracks 
per-chunk writes instead. -func (s *fakeTransferSink) Bytes() []byte { return nil } +func waitAbortReason(t *testing.T, sink *fakeTransferSink, want string) { + t.Helper() + deadline := time.Now().Add(time.Second) + for { + if len(sink.abortReasons) > 0 { + if want != "" && sink.abortReasons[0] != want { + t.Fatalf("sink.Abort reasons = %v, want %q", sink.abortReasons, want) + } + return + } + if time.Now().After(deadline) { + t.Fatalf("timed out waiting for sink.Abort(%q); reasons=%v", want, sink.abortReasons) + } + time.Sleep(time.Millisecond) + } +} type diagCapture struct { mu sync.Mutex @@ -398,6 +420,49 @@ func readTransferNeed(t *testing.T, tr Transport, id string, next uint32) { } } +func readUntilTransferAbort(t *testing.T, tr Transport, id, reason string) { + t.Helper() + deadline := time.Now().Add(2 * time.Second) + for { + if time.Now().After(deadline) { + t.Fatalf("timed out waiting for xfer_abort id=%s err=%s", id, reason) + } + lineCh := make(chan []byte, 1) + errCh := make(chan error, 1) + go func() { + line, err := tr.ReadLine() + if err != nil { + errCh <- err + return + } + lineCh <- append([]byte(nil), line...) 
+ }() + var line []byte + select { + case line = <-lineCh: + case err := <-errCh: + t.Fatalf("ReadLine: %v", err) + case <-time.After(time.Until(deadline)): + t.Fatalf("timed out waiting for xfer_abort id=%s err=%s", id, reason) + } + var probe struct { + Type string `json:"type"` + XferID string `json:"xfer_id"` + Err string `json:"err"` + } + if err := json.Unmarshal(line, &probe); err != nil { + t.Fatalf("Unmarshal %q: %v", line, err) + } + if probe.Type != msgXferAbort { + continue + } + if probe.XferID != id || probe.Err != reason { + t.Fatalf("bad xfer_abort: %+v, want id=%s err=%s", probe, id, reason) + } + return + } +} + func readTransferAbort(t *testing.T, tr Transport, id, reason string) { t.Helper() abort := readMsg[protoXferAbort](t, tr) @@ -413,16 +478,30 @@ func writeRawLine(t *testing.T, tr Transport, line string) { } } +func TestTransferBeginWithoutStageControllerAbortsNoReady(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-no-controller", payload, nil)) + readTransferAbort(t, cm5, "xfer-no-controller", "updater_stage_controller_missing") +} + func TestTransferBeginWithoutPrepareAbortsNoReady(t *testing.T) { diag := captureOTADiag(t) b := newBus() - cancelUpdater, _ := runUpdaterForFabricTest(t, b, updater.Options{}) + cancelUpdater, updaterSvc := runUpdaterForFabricTest(t, b, updater.Options{}) defer cancelUpdater() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) + go RunWithOptions(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig(), RunOptions{StageController: updaterSvc}) bringUp(t, cm5) payload := []byte("abcd") @@ -438,7 +517,7 @@ 
func TestTransferBeginWithoutPrepareAbortsNoReady(t *testing.T) { func TestPreparedTransferBeginSendsReadyThenNeedZero(t *testing.T) { diag := captureOTADiag(t) b := newBus() - cancelUpdater, _ := runUpdaterForFabricTest(t, b, updater.Options{}) + cancelUpdater, updaterSvc := runUpdaterForFabricTest(t, b, updater.Options{}) defer cancelUpdater() caller := b.NewConnection("caller") observer := b.NewConnection("observer") @@ -449,7 +528,7 @@ func TestPreparedTransferBeginSendsReadyThenNeedZero(t *testing.T) { cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) + go RunWithOptions(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig(), RunOptions{StageController: updaterSvc}) bringUp(t, cm5) payload := []byte("abcd") @@ -512,7 +591,7 @@ func TestInvalidTransferBeginEmitsRejectDiagnosticNoActiveTransfer(t *testing.T) func TestTransferAbortCancelsUpdaterLease(t *testing.T) { b := newBus() - cancelUpdater, _ := runUpdaterForFabricTest(t, b, updater.Options{}) + cancelUpdater, updaterSvc := runUpdaterForFabricTest(t, b, updater.Options{}) defer cancelUpdater() caller := b.NewConnection("caller") observer := b.NewConnection("observer") @@ -523,7 +602,7 @@ func TestTransferAbortCancelsUpdaterLease(t *testing.T) { cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) + go RunWithOptions(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig(), RunOptions{StageController: updaterSvc}) bringUp(t, cm5) payload := []byte("abcd") @@ -542,7 +621,7 @@ func TestTransferAbortCancelsUpdaterLease(t *testing.T) { func TestTransferTargetRejectCancelsLeaseAndPreventsCommit(t *testing.T) { b := newBus() memMD := updater.NewMemoryMetadata() - cancelUpdater, _ := runUpdaterForFabricTest(t, b, 
updater.Options{ + cancelUpdater, updaterSvc := runUpdaterForFabricTest(t, b, updater.Options{ Verifier: updater.StubVerifier(), Metadata: memMD, MetadataWrite: memMD, @@ -554,7 +633,7 @@ func TestTransferTargetRejectCancelsLeaseAndPreventsCommit(t *testing.T) { cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) + go RunWithOptions(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig(), RunOptions{StageController: updaterSvc}) bringUp(t, cm5) payload := []byte("abcd") @@ -574,8 +653,8 @@ func TestTransferTargetRejectCancelsLeaseAndPreventsCommit(t *testing.T) { replyPayload := requestUpdaterForFabricTest(t, caller, updater.TopicCommitRPC, updater.CommitRequest{}) reply, ok := replyPayload.(updater.Reply) - if !ok || reply.OK || reply.Error != updater.ErrNothingStaged { - t.Fatalf("commit after rejected transfer = %#v, want nothing_staged", replyPayload) + if !ok || reply.OK || reply.Error != updater.ErrNoStagedImage { + t.Fatalf("commit after rejected transfer = %#v, want no_staged_image", replyPayload) } } @@ -727,44 +806,43 @@ func TestTransferAcceptedChunkEmitsProcessingDiagnostics(t *testing.T) { []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev chunk_digest_done", "ok true"}, []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev sink_write_start", "offset 0", "raw_len 4"}, []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev sink_write_done", "next 4"}, - []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev gc_start", "next 4"}, - []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev gc_done", "next 4"}, []string{"[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev need_tx", "next 4", "ok true", "accepted true"}, ) assertDiagNotContains(t, diag.snapshot(), "[fabric-xfer]", "xfer_id xfer-chunk-diag", "ev transfer_mem_sample") } -func TestTransferAcceptedChunkEmitsSparseMemorySample(t 
*testing.T) { +func TestTransferNeedIsSentOnlyAfterPendingChunkWriteCompletes(t *testing.T) { diag := captureOTADiag(t) b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - sink := &fakeTransferSink{} + sink := &fakeTransferSink{ + writeEntered: make(chan struct{}), + writeRelease: make(chan struct{}), + } go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) bringUp(t, cm5) - payload := make([]byte, transferMemSampleStride) - for i := range payload { - payload[i] = byte(i) + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-pending-chunk", payload, nil)) + readTransferReady(t, cm5, "xfer-pending-chunk", 0) + + sendMsg(t, cm5, xferChunk("xfer-pending-chunk", 0, payload)) + select { + case <-sink.writeEntered: + case <-time.After(time.Second): + t.Fatal("sink write did not start") } - sendMsg(t, cm5, xferBegin("xfer-mem-diag", payload, nil)) - readTransferReady(t, cm5, "xfer-mem-diag", 0) + time.Sleep(100 * time.Millisecond) + assertDiagNotContains(t, diag.snapshot(), "[fabric-xfer]", "xfer_id xfer-pending-chunk", "ev need_tx", "accepted true") - const chunkSize = 2048 - for off := 0; off < len(payload); off += chunkSize { - end := off + chunkSize - if end > len(payload) { - end = len(payload) - } - sendMsg(t, cm5, xferChunk("xfer-mem-diag", uint32(off), payload[off:end])) - need := readMsg[protoXferNeed](t, cm5) - if need.Next != uint32(end) { - t.Fatalf("xfer_need.next = %d, want %d", need.Next, end) - } + close(sink.writeRelease) + need := readMsg[protoXferNeed](t, cm5) + if need.Next != uint32(len(payload)) { + t.Fatalf("xfer_need.next = %d, want %d", need.Next, len(payload)) } - waitDiagContains(t, diag, "[fabric-xfer]", "xfer_id xfer-mem-diag", "ev transfer_mem_sample", "next 65536", "alloc", "heap") } func TestTransferChunkFutureOffsetRequestsCurrentAndCompletes(t *testing.T) { @@ -879,7 +957,7 @@ func TestTransferStaleLowerOffsetDoesNotRefreshPhaseDeadline(t *testing.T) { cfg: 
LinkConfig{PhaseTimeout: time.Second}, incomingTransfer: &incomingTransfer{ meta: transferMeta{ID: "xfer-stale-deadline", Size: 6}, - sink: sink, + worker: newTransferSinkWorker("xfer-stale-deadline", sink), bytesWritten: 3, deadline: oldDeadline, }, @@ -914,7 +992,7 @@ func TestTransferCurrentCorruptChunkRefreshesLinkLiveness(t *testing.T) { cfg: LinkConfig{PhaseTimeout: time.Second}, incomingTransfer: &incomingTransfer{ meta: transferMeta{ID: "xfer-corrupt-liveness", Size: 4}, - sink: &fakeTransferSink{}, + worker: newTransferSinkWorker("xfer-corrupt-liveness", &fakeTransferSink{}), deadline: time.Now().Add(time.Second), }, } @@ -1009,9 +1087,7 @@ func TestTransferChunkMissingDigestRetriesThenAborts(t *testing.T) { Data: rawURL(payload), }) readTransferAbort(t, cm5, "xfer-missing-digest", "bad_message") - if len(sink.abortReasons) == 0 { - t.Fatal("expected sink.Abort on missing chunk digest") - } + waitAbortReason(t, sink, "bad_message") } func TestTransferChunkInvalidBase64RetriesThenAborts(t *testing.T) { @@ -1046,9 +1122,7 @@ func TestTransferChunkInvalidBase64RetriesThenAborts(t *testing.T) { ChunkDigest: xxhashStr(payload), }) readTransferAbort(t, cm5, "xfer-bad-b64", "invalid_chunk_encoding") - if len(sink.abortReasons) == 0 || sink.abortReasons[0] != "invalid_chunk_encoding" { - t.Fatalf("sink.Abort reasons = %v, want invalid_chunk_encoding", sink.abortReasons) - } + waitAbortReason(t, sink, "invalid_chunk_encoding") } func TestTransferChunkDigestMismatchRequestsSameOffset(t *testing.T) { @@ -1083,7 +1157,6 @@ func TestTransferChunkDigestMismatchRequestsSameOffset(t *testing.T) { lines := diag.snapshot() assertDiagContains(t, lines, "[fabric-xfer]", "xfer_id xfer-bad-chunk-digest", "ev chunk_digest_done", "ok false", "reason chunk_digest_mismatch") assertDiagNotContains(t, lines, "[fabric-xfer]", "xfer_id xfer-bad-chunk-digest", "ev sink_write_start") - assertDiagNotContains(t, lines, "[fabric-xfer]", "xfer_id xfer-bad-chunk-digest", "ev gc_start") 
sendMsg(t, cm5, xferChunk("xfer-bad-chunk-digest", 0, payload)) need = readMsg[protoXferNeed](t, cm5) @@ -1202,7 +1275,7 @@ func TestTransferMalformedWrongXferIDDoesNotChargeActiveTransfer(t *testing.T) { tr: tr, incomingTransfer: &incomingTransfer{ meta: transferMeta{ID: activeID}, - sink: sink, + worker: newTransferSinkWorker(activeID, sink), deadline: time.Now().Add(time.Second), }, } @@ -1422,21 +1495,7 @@ func TestTransferCommitDigestMismatchAborts(t *testing.T) { if abort.Type != msgXferAbort || abort.Err != "digest_mismatch" { t.Fatalf("bad xfer_abort: %+v", abort) } - if len(sink.abortReasons) == 0 { - t.Fatal("expected sink abort on digest mismatch") - } -} - -// bufferingSinkAdapter wraps the production bufferSink so transfer tests -// can assert the bytes passed to updater/main staging. -type bufferingSinkAdapter struct { - *bufferSink - abortReasons []string -} - -func (b *bufferingSinkAdapter) Abort(reason string) error { - b.abortReasons = append(b.abortReasons, reason) - return b.bufferSink.Abort(reason) + waitAbortReason(t, sink, "digest_mismatch") } func TestTransferTargetInvokedAfterCommit(t *testing.T) { @@ -1450,7 +1509,7 @@ func TestTransferTargetInvokedAfterCommit(t *testing.T) { gotPayload := installStageResponder(t, b, updater.StageReply{OK: true, Stage: "staged"}) - sink := &bufferingSinkAdapter{bufferSink: &bufferSink{meta: transferMeta{Size: 4}, buf: make([]byte, 0, 4)}} + sink := &fakeTransferSink{commitInfo: transferInfo{BytesWritten: 4, Generation: 7}} s := session{ linkID: defaultLinkID, nodeID: "mcu", @@ -1459,7 +1518,6 @@ func TestTransferTargetInvokedAfterCommit(t *testing.T) { tr: mcu, conn: b.NewConnection("fabric"), beginTransfer: func(meta transferMeta) (transferSink, error) { - sink.bufferSink.meta = meta return sink, nil }, } @@ -1488,8 +1546,8 @@ func TestTransferTargetInvokedAfterCommit(t *testing.T) { if p.Target != updater.TargetUpdaterMain || p.DigestAlg != updater.DigestAlgXXHash32 || p.Digest != xxhashStr(payload) { 
t.Fatalf("stage contract fields wrong: %+v", p) } - if string(p.Artefact) != string(payload) { - t.Fatalf("stage artefact = %v, want %q", p.Artefact, payload) + if p.Size != uint32(len(payload)) || p.Generation != 7 { + t.Fatalf("stage size/generation wrong: %+v", p) } case <-time.After(2 * time.Second): t.Fatal("timeout waiting for stage call") @@ -1620,7 +1678,7 @@ func TestTransferTargetRejectAbortsTransfer(t *testing.T) { _ = installStageResponder(t, b, updater.StageReply{OK: false, Err: "manifest_check_failed"}) - sink := &bufferingSinkAdapter{bufferSink: &bufferSink{meta: transferMeta{Size: 4}, buf: make([]byte, 0, 4)}} + sink := &fakeTransferSink{commitInfo: transferInfo{BytesWritten: 4, Generation: 7}} s := session{ linkID: defaultLinkID, nodeID: "mcu", @@ -1629,7 +1687,6 @@ func TestTransferTargetRejectAbortsTransfer(t *testing.T) { tr: mcu, conn: b.NewConnection("fabric"), beginTransfer: func(meta transferMeta) (transferSink, error) { - sink.bufferSink.meta = meta return sink, nil }, } @@ -1652,7 +1709,7 @@ func TestTransferTargetRejectAbortsTransfer(t *testing.T) { } } -func TestTransferTargetStageTimeoutCancelsLeaseAndPreventsLateStagePersist(t *testing.T) { +func TestTransferCommitTimeoutCancelsLeaseAndPreventsLateStagePersist(t *testing.T) { b := newBus() memMD := updater.NewMemoryMetadata() verif := &blockingVerifier{ @@ -1666,7 +1723,7 @@ func TestTransferTargetStageTimeoutCancelsLeaseAndPreventsLateStagePersist(t *te PayloadLength: 4, }, } - cancelUpdater, _ := runUpdaterForFabricTest(t, b, updater.Options{ + cancelUpdater, updaterSvc := runUpdaterForFabricTest(t, b, updater.Options{ Verifier: verif, Metadata: memMD, MetadataWrite: memMD, @@ -1675,14 +1732,13 @@ func TestTransferTargetStageTimeoutCancelsLeaseAndPreventsLateStagePersist(t *te caller := b.NewConnection("caller") prepareUpdaterForFabricTest(t, caller) - oldTimeout := targetCallTimeout - targetCallTimeout = 20 * time.Millisecond - defer func() { targetCallTimeout = oldTimeout }() + cfg 
:= DefaultLinkConfig() + cfg.TargetCallTimeout = 20 * time.Millisecond cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", DefaultLinkConfig()) + go RunWithOptions(ctx, mcu, b.NewConnection("fabric"), "mcu", "bigbox-cm5", cfg, RunOptions{StageController: updaterSvc}) bringUp(t, cm5) id := "xfer-stage-timeout" @@ -1695,17 +1751,22 @@ func TestTransferTargetStageTimeoutCancelsLeaseAndPreventsLateStagePersist(t *te select { case <-verif.entered: case <-time.After(2 * time.Second): - t.Fatal("verifier did not start before stage timeout") + t.Fatal("verifier did not start before commit timeout") } - readTransferAbort(t, cm5, id, "stage_timeout") + // Let the configured commit deadline pass while the verifier/flash operation + // remains blocked. The worker observes that deadline at the next safe point. + time.Sleep(50 * time.Millisecond) if _, ok := memMD.StagedDescriptor(); ok { - t.Fatal("stage timeout persisted descriptor before verifier returned") + t.Fatal("descriptor persisted while verifier was still blocked") } + // The stage worker now owns the verifier/flash call directly rather than + // spawning a nested goroutine. The timeout is therefore observed at the + // next safe point: after the bounded verifier operation returns. 
close(verif.release) - time.Sleep(50 * time.Millisecond) + readUntilTransferAbort(t, cm5, id, "transfer_commit_timeout") if _, ok := memMD.StagedDescriptor(); ok { - t.Fatal("late verifier completion after stage timeout persisted descriptor") + t.Fatal("late verifier completion after commit timeout persisted descriptor") } } @@ -1752,9 +1813,7 @@ func TestTransferIdleChunkWatchdog(t *testing.T) { if abort.Type != msgXferAbort || abort.XferID != "xfer-wd" || abort.Err != "timeout" { t.Fatalf("bad xfer_abort: %+v", abort) } - if len(sink.abortReasons) == 0 || sink.abortReasons[0] != "timeout" { - t.Fatalf("sink.Abort reasons = %v, want [\"timeout\"]", sink.abortReasons) - } + waitAbortReason(t, sink, "timeout") } func TestTransferCommitDigestMismatchOnCommitFrameAborts(t *testing.T) { diff --git a/services/fabric/transport_limits.go b/services/fabric/transport_limits.go index 8d689ad..01af198 100644 --- a/services/fabric/transport_limits.go +++ b/services/fabric/transport_limits.go @@ -3,8 +3,8 @@ package fabric import "fmt" // maxLineLen caps a single fabric frame (line-delimited JSON) end-to-end. -// It must clear the release transfer chunk: 2048 raw bytes becomes about -// 2731 base64url chars, plus JSON envelope and newline. 4096 leaves margin +// It must clear the release transfer chunk: MaxAcceptedChunkSize raw bytes becomes about +// maxChunkBase64Len base64url chars, plus JSON envelope and newline. 4096 leaves margin // while keeping malformed lines bounded. const maxLineLen = 4096 diff --git a/services/fabric/transport_shmring.go b/services/fabric/transport_shmring.go index dece826..fe8d573 100644 --- a/services/fabric/transport_shmring.go +++ b/services/fabric/transport_shmring.go @@ -10,27 +10,41 @@ import ( // ShmringTransport implements Transport over two shmring rings (RX + TX). // Used for UART0 in production (main.go). 
type ShmringTransport struct { - rx *shmring.Ring - tx *shmring.Ring - cancel context.CancelFunc - ctx context.Context - buf []byte - over bool // draining an oversize line + rx *shmring.Ring + tx *shmring.Ring + cancel context.CancelFunc + ctx context.Context + lineBuf *[maxLineLen]byte + n int + over bool // draining an oversize line } func NewShmringTransport(rx, tx *shmring.Ring) *ShmringTransport { + return NewShmringTransportWithBuffers(rx, tx, nil) +} + +func NewShmringTransportWithBuffers(rx, tx *shmring.Ring, buffers *FabricBuffers) *ShmringTransport { ctx, cancel := context.WithCancel(context.Background()) - return &ShmringTransport{ - rx: rx, - tx: tx, - cancel: cancel, - ctx: ctx, - buf: make([]byte, 0, 256), - } + buf := &ensureFabricBuffers(buffers).TransportLine + return &ShmringTransport{rx: rx, tx: tx, cancel: cancel, ctx: ctx, lineBuf: buf} } func (t *ShmringTransport) ReadLine() ([]byte, error) { - t.buf = t.buf[:0] + var tmp [maxLineLen]byte + n, err := t.ReadLineInto(tmp[:]) + if err != nil { + return nil, err + } + out := make([]byte, n) + copy(out, tmp[:n]) + return out, nil +} + +func (t *ShmringTransport) ReadLineInto(dst []byte) (int, error) { + if len(dst) < maxLineLen { + return 0, fmt.Errorf("fabric read buffer too small: %d", len(dst)) + } + t.n = 0 t.over = false for { @@ -38,78 +52,80 @@ func (t *ShmringTransport) ReadLine() ([]byte, error) { if len(p1)+len(p2) == 0 { select { case <-t.ctx.Done(): - return nil, fmt.Errorf("transport closed") + return 0, fmt.Errorf("transport closed") case <-t.rx.Readable(): continue } } - // Scan p1 for newline. if idx := findByte(p1, '\n'); idx >= 0 { - if !t.over { - t.buf = append(t.buf, p1[:idx]...) 
+ if !t.over && !t.appendLineChunk(p1[:idx]) { + t.over = true } t.rx.ReadRelease(idx + 1) - if t.over { - t.buf = t.buf[:0] - t.over = false - return nil, ErrLineTooLong - } - if len(t.buf) > maxLineLen { - return nil, ErrLineTooLong - } - out := make([]byte, len(t.buf)) - copy(out, t.buf) - traceLine("rx", out) - return out, nil + return t.finishLineInto(dst) } - // No newline in p1 — consume it, check p2. - if !t.over { - t.buf = append(t.buf, p1...) + if !t.over && !t.appendLineChunk(p1) { + t.over = true } if idx := findByte(p2, '\n'); idx >= 0 { - if !t.over { - t.buf = append(t.buf, p2[:idx]...) + if !t.over && !t.appendLineChunk(p2[:idx]) { + t.over = true } t.rx.ReadRelease(len(p1) + idx + 1) - if t.over { - t.buf = t.buf[:0] - t.over = false - return nil, ErrLineTooLong - } - if len(t.buf) > maxLineLen { - return nil, ErrLineTooLong - } - out := make([]byte, len(t.buf)) - copy(out, t.buf) - traceLine("rx", out) - return out, nil + return t.finishLineInto(dst) } - // No newline — consume everything, wait for more. - if !t.over { - t.buf = append(t.buf, p2...) + if !t.over && !t.appendLineChunk(p2) { + t.over = true } t.rx.ReadRelease(len(p1) + len(p2)) + } +} - // Check for oversize. 
- if len(t.buf) > maxLineLen { - t.buf = t.buf[:0] - t.over = true - } +func (t *ShmringTransport) appendLineChunk(p []byte) bool { + if len(p) == 0 { + return true + } + if t.n+len(p) > maxLineLen { + t.n = 0 + return false + } + copy(t.lineBuf[t.n:], p) + t.n += len(p) + return true +} + +func (t *ShmringTransport) finishLineInto(dst []byte) (int, error) { + if t.over { + t.n = 0 + t.over = false + return 0, ErrLineTooLong } + copy(dst, t.lineBuf[:t.n]) + traceLine("rx", dst[:t.n]) + return t.n, nil } func (t *ShmringTransport) WriteLine(data []byte) error { if len(data) > maxLineLen { return ErrLineTooLong } - line := append(data, '\n') - written := 0 + if err := t.writeBytes(data); err != nil { + return err + } + if err := t.writeBytes([]byte{'\n'}); err != nil { + return err + } + traceLine("tx", data) + return nil +} - for written < len(line) { +func (t *ShmringTransport) writeBytes(data []byte) error { + written := 0 + for written < len(data) { p1, p2 := t.tx.WriteAcquire() if len(p1)+len(p2) == 0 { select { @@ -119,8 +135,7 @@ func (t *ShmringTransport) WriteLine(data []byte) error { continue } } - - remaining := line[written:] + remaining := data[written:] n := copy(p1, remaining) remaining = remaining[n:] if len(remaining) > 0 && len(p2) > 0 { @@ -129,7 +144,6 @@ func (t *ShmringTransport) WriteLine(data []byte) error { t.tx.WriteCommit(n) written += n } - traceLine("tx", data) return nil } diff --git a/services/fabric/writer.go b/services/fabric/writer.go index 37286d5..4a1fe97 100644 --- a/services/fabric/writer.go +++ b/services/fabric/writer.go @@ -125,21 +125,14 @@ func (s *session) writeFrame(l lane, data []byte) bool { if len(data) > 0 && data[len(data)-1] == '\n' { data = data[:len(data)-1] } - if fabricTraceEnabled { - println( - "[fabric]", "sid", s.localSID, - "tx_frame", - "lane", laneName(l), - "type", protoType(data), - "len", len(data), - "line", tracePreview(data), - ) - } + s.traceWire("tx", laneName(l), data, nil) if err := 
s.tr.WriteLine(data); err != nil { if errors.Is(err, ErrLineTooLong) { + s.traceWireError("tx", "oversized_write_dropped", err.Error(), map[string]any{"lane": laneName(l), "frame_type": protoType(data)}) s.log("oversized write dropped") return true } + s.traceWireError("tx", "write_failed", err.Error(), map[string]any{"lane": laneName(l), "frame_type": protoType(data)}) s.handleLinkDown(reasonTransportWrite, err.Error()) return false } diff --git a/services/fabric/xfer_probe_disabled.go b/services/fabric/xfer_probe_disabled.go new file mode 100644 index 0000000..0527fa9 --- /dev/null +++ b/services/fabric/xfer_probe_disabled.go @@ -0,0 +1,7 @@ +//go:build !fabric_xfer_probe + +package fabric + +const xferProbeEnabled = false + +func xferProbe(args ...any) {} diff --git a/services/fabric/xfer_probe_enabled.go b/services/fabric/xfer_probe_enabled.go new file mode 100644 index 0000000..7c3d4a1 --- /dev/null +++ b/services/fabric/xfer_probe_enabled.go @@ -0,0 +1,87 @@ +//go:build fabric_xfer_probe + +package fabric + +import "strconv" + +const xferProbeEnabled = true + +var xferProbeLastProgress uint32 + +func xferProbe(args ...any) { + if !xferProbeShouldPrint(args...) { + return + } + print("[fabric-xfer-probe]") + for _, a := range args { + print(" ") + switch v := a.(type) { + case string: + print(v) + case int: + print(strconv.Itoa(v)) + case uint32: + print(strconv.FormatUint(uint64(v), 10)) + case uint64: + print(strconv.FormatUint(v, 10)) + case bool: + if v { + print("true") + } else { + print("false") + } + case error: + if v != nil { + print(v.Error()) + } + default: + print("?") + } + } + println() +} + +func xferProbeShouldPrint(args ...any) bool { + if len(args) == 0 { + return false + } + event, _ := args[0].(string) + switch event { + case "chunk_rx", "write_start", "write_done": + return false + case "need_after_write": + // Progress only. Per-chunk printing materially perturbs UART RX service + // while the peer is already sending the next chunk. 
+ next, ok := xferProbeArgUint32(args, "next") + if !ok { + return false + } + if next == 0 || next-xferProbeLastProgress >= 32768 { + xferProbeLastProgress = next + return true + } + return false + default: + return true + } +} + +func xferProbeArgUint32(args []any, key string) (uint32, bool) { + for i := 0; i+1 < len(args); i++ { + k, ok := args[i].(string) + if !ok || k != key { + continue + } + switch v := args[i+1].(type) { + case uint32: + return v, true + case int: + if v >= 0 { + return uint32(v), true + } + case uint64: + return uint32(v), true + } + } + return 0, false +} diff --git a/services/hal/devices/serial_raw/builder.go b/services/hal/devices/serial_raw/builder.go index 15ecaca..2616155 100644 --- a/services/hal/devices/serial_raw/builder.go +++ b/services/hal/devices/serial_raw/builder.go @@ -2,16 +2,13 @@ package serial_raw import ( "context" - "runtime" "sync/atomic" "time" "devicecode-go/errcode" "devicecode-go/services/hal/internal/core" - "devicecode-go/services/otadiag" "devicecode-go/types" "devicecode-go/x/shmring" - "devicecode-go/x/strconvx" ) // ---- Parameters ---- @@ -25,12 +22,6 @@ type Params struct { TXSize int // power of two; default 512 if zero in SessionOpen } -const ( - serialRawPumpRXBudget = 256 - serialRawPumpTXBudget = 256 - serialRawPumpGapWarn = 20 * time.Millisecond -) - // ---- Device ---- type Device struct { @@ -58,19 +49,7 @@ type session struct { rxRing *shmring.Ring txHandle shmring.Handle txRing *shmring.Ring - - // Reactor-owned observability. Single writer only. - rxRingFull uint32 - rxLogAt time.Time - rxLogHits uint32 - rxPressureAt time.Time - rxPressureHits uint32 - rxPumpGapAt time.Time - rxPumpGapHits uint32 - lastRXPumpAt time.Time - lastRXPumpMoved int - lastRXPumpDurMS int - lastRXPumpGapMS int + probe uartxProbe // Single worker (reactor) for the port. 
ctx context.Context @@ -78,19 +57,6 @@ type session struct { done chan struct{} } -type serialRXDiagnostics interface { - RXBuffered() int - RXBufferCap() int -} - -type serialRXErrorDiagnostics interface { - RXDropCount() uint32 - RXOverrunCount() uint32 - RXBreakCount() uint32 - RXParityCount() uint32 - RXFramingCount() uint32 -} - // ---- Builder registration ---- func Builder() core.Builder { return builder{} } @@ -204,12 +170,6 @@ func (d *Device) Control(_ core.CapAddr, verb string, payload any) (core.Enqueue } d.startSession(rxSize, txSize) - println( - "[serial-raw]", "session_open", - "uart", d.a.Name, - "rx_size", strconvx.Itoa(rxSize), - "tx_size", strconvx.Itoa(txSize), - ) // --- Device-level hygiene: drain spurious RX before signalling link up --- // Discard any pre-existing or immediately-arriving bytes on the UART RX path. @@ -352,247 +312,131 @@ func (d *Device) stopSession() { // ---- Reactor (single goroutine) ---- -func (d *Device) logRingFullChange(s *session, force bool) { - const rxLogMinInterval = 1 * time.Second - - hits := s.rxRingFull - - if !force { - now := time.Now() - if now.Sub(s.rxLogAt) < rxLogMinInterval { - return - } - if hits == s.rxLogHits { - return - } - s.rxLogAt = now - } else { - s.rxLogAt = time.Now() - } - - println( - "[serial-raw]", "rx_ring_full", - "uart", d.a.Name, - "hits", strconvx.Utoa64(uint64(hits)), - "ring_avail", strconvx.Itoa(s.rxRing.Available()), - "ring_space", strconvx.Itoa(s.rxRing.Space()), - "ring_cap", strconvx.Itoa(s.rxRing.Cap()), - ) - s.rxLogHits = hits -} - -func (d *Device) appendRXPumpFields(s *session, fields []otadiag.Field, now time.Time) []otadiag.Field { - if !s.lastRXPumpAt.IsZero() { - fields = append(fields, otadiag.KV("since_rx_pump_ms", int(now.Sub(s.lastRXPumpAt)/time.Millisecond))) - } - fields = append(fields, - otadiag.KV("last_pump_moved", s.lastRXPumpMoved), - otadiag.KV("last_pump_dur_ms", s.lastRXPumpDurMS), - ) - if s.lastRXPumpGapMS >= 0 { - fields = append(fields, 
otadiag.KV("last_pump_gap_ms", s.lastRXPumpGapMS)) - } - return fields -} - -func appendRXErrorFields(port core.SerialPort, fields []otadiag.Field) []otadiag.Field { - diag, ok := port.(serialRXErrorDiagnostics) - if !ok { - return fields - } - return append(fields, - otadiag.KV("rx_drops", diag.RXDropCount()), - otadiag.KV("rx_overrun", diag.RXOverrunCount()), - otadiag.KV("rx_break", diag.RXBreakCount()), - otadiag.KV("rx_parity", diag.RXParityCount()), - otadiag.KV("rx_framing", diag.RXFramingCount()), - ) -} - -func (d *Device) logDriverPressure(s *session, force bool) { - const minInterval = 1 * time.Second - - diag, ok := d.port.(serialRXDiagnostics) - if !ok { - return - } - used := diag.RXBuffered() - capacity := diag.RXBufferCap() - if capacity <= 0 || used < 0 { - return - } - threshold := (capacity * 3) / 4 - if threshold < 1 { - threshold = 1 - } - if !force && used < threshold { - return - } - - hits := s.rxPressureHits + 1 - now := time.Now() - if !force { - if now.Sub(s.rxPressureAt) < minInterval { - return - } - } else { - now = time.Now() - } - s.rxPressureAt = now - s.rxPressureHits = hits - - fields := []otadiag.Field{ - otadiag.KV("uart", d.a.Name), - otadiag.KV("hits", strconvx.Utoa64(uint64(hits))), - otadiag.KV("driver_used", used), - otadiag.KV("driver_cap", capacity), - otadiag.KV("ring_avail", s.rxRing.Available()), - otadiag.KV("ring_space", s.rxRing.Space()), - otadiag.KV("ring_cap", s.rxRing.Cap()), - } - fields = d.appendRXPumpFields(s, fields, now) - fields = appendRXErrorFields(d.port, fields) - otadiag.Event("[serial-raw]", "rx_driver_pressure", otadiag.XferNone, fields...) 
- - if !s.lastRXPumpAt.IsZero() && now.Sub(s.lastRXPumpAt) >= serialRawPumpGapWarn { - d.logRXPumpGap(s, used, capacity, now) - } -} - -func (d *Device) logRXPumpGap(s *session, used, capacity int, now time.Time) { - const minInterval = 1 * time.Second - - if now.Sub(s.rxPumpGapAt) < minInterval { - return - } - s.rxPumpGapAt = now - s.rxPumpGapHits++ - fields := []otadiag.Field{ - otadiag.KV("uart", d.a.Name), - otadiag.KV("hits", strconvx.Utoa64(uint64(s.rxPumpGapHits))), - otadiag.KV("driver_used", used), - otadiag.KV("driver_cap", capacity), - otadiag.KV("ring_avail", s.rxRing.Available()), - otadiag.KV("ring_space", s.rxRing.Space()), - otadiag.KV("ring_cap", s.rxRing.Cap()), - otadiag.KV("since_rx_pump_ms", int(now.Sub(s.lastRXPumpAt)/time.Millisecond)), - otadiag.KV("last_pump_moved", s.lastRXPumpMoved), - otadiag.KV("last_pump_dur_ms", s.lastRXPumpDurMS), - otadiag.KV("last_pump_gap_ms", s.lastRXPumpGapMS), - } - fields = appendRXErrorFields(d.port, fields) - otadiag.Event("[serial-raw]", "rx_pump_gap", otadiag.XferNone, fields...) 
-} - -func (s *session) noteRXPump(moved int, started time.Time) { - if moved <= 0 { - return - } - now := time.Now() - gapMS := -1 - if !s.lastRXPumpAt.IsZero() { - gapMS = int(started.Sub(s.lastRXPumpAt) / time.Millisecond) - } - s.lastRXPumpAt = now - s.lastRXPumpMoved = moved - s.lastRXPumpDurMS = int(now.Sub(started) / time.Millisecond) - s.lastRXPumpGapMS = gapMS - if s.lastRXPumpGapMS < 0 { - s.lastRXPumpGapMS = 0 - } - if s.lastRXPumpDurMS >= 5 { - otadiag.Event( - "[serial-raw]", "rx_pump_slow", otadiag.XferNone, - otadiag.KV("moved", moved), - otadiag.KV("dur_ms", s.lastRXPumpDurMS), - otadiag.KV("gap_ms", s.lastRXPumpGapMS), - ) - } -} - -func (d *Device) pumpRX(s *session, u core.SerialPort, rxR *shmring.Ring, budget int) bool { - started := time.Now() - moved := 0 - - defer func() { - if moved > 0 { - s.noteRXPump(moved, started) - } - }() - - for moved < budget { - d.logDriverPressure(s, false) - p1, p2 := rxR.WriteAcquire() - if len(p1) == 0 { - s.rxRingFull++ - break - } - - remaining := budget - moved - p1 = limitSpan(p1, remaining) - n1 := u.TryRead(p1) - if n1 == 0 { - break - } - n := n1 - moved += n1 - if n1 < len(p1) { - rxR.WriteCommit(n) - break - } - - remaining = budget - moved - if remaining > 0 && len(p2) > 0 { - p2 = limitSpan(p2, remaining) - n2 := u.TryRead(p2) - n += n2 - moved += n2 - if n2 < len(p2) { - rxR.WriteCommit(n) - break - } - } - - rxR.WriteCommit(n) - } - - return moved > 0 -} - func (d *Device) reactor(s *session) { defer close(s.done) u := d.port rxR := s.rxRing // UART -> app txR := s.txRing // app -> UART + s.probe.start(d.id, u, rxR, txR) for { made := false - - if d.pumpRX(s, u, rxR, serialRawPumpRXBudget) { + rxMade := false + rxBackpressure := false + + // UART RX -> rxRing. + // + // RX is the only lossy edge in this chain. Drain the UARTX software RX + // ring until it is empty, or until the session ring applies real + // back-pressure. 
Do not switch to TX merely because some RX bytes were + // published: during a peer chunk, the remote UART keeps sending and the + // interrupt-side ring only has short-latency elasticity. + for { + p1, p2 := rxR.WriteAcquire() + if len(p1) == 0 { + s.probe.rxRingFull(d.id, u, rxR, txR) + rxBackpressure = true + break + } + n1 := u.TryRead(p1) + if n1 == 0 { + break + } + if n1 < len(p1) { + rxR.WriteCommit(n1) + s.probe.afterRX(d.id, u, rxR, txR, n1) + made = true + rxMade = true + continue + } + n2 := 0 + if len(p2) > 0 { + n2 = u.TryRead(p2) + } + rxR.WriteCommit(n1 + n2) + s.probe.afterRX(d.id, u, rxR, txR, n1+n2) made = true + rxMade = true + } + + if rxBackpressure { + // Downstream RX is full. Do not spin on UART readability, and do not rely + // on an explicit scheduler yield. If there is no outbound work, block on + // the only two edges that can make progress: the protocol consumer freeing + // RX space, or the application producing TX work. If outbound work exists, + // allow one small TX escape hatch; this lets a writer blocked in writeLine + // finish, after which the same application goroutine can read and free + // rxRing. + made = false + if txR.Available() == 0 { + select { + case <-s.ctx.Done(): + return + case <-rxR.Writable(): + case <-txR.Readable(): + } + continue + } + } else if rxMade { + // RX made progress and the downstream ring still had room. Re-check RX + // immediately before considering TX. This preserves the serial worker + // as the short-latency drain for the UARTX ISR ring without involving a + // scheduler hint. + continue } - if d.pumpTX(u, txR, serialRawPumpTXBudget) { + // txRing -> UART TX. Transmit under a small per-activation budget so + // retained publications or diagnostic chatter cannot monopolise this + // worker while the peer is sending a long chunk. Under RX back-pressure, + // the same budget also acts as a deadlock escape hatch for a writer whose + // reader cannot run until writeLine completes. 
+ const txBudgetPerPass = 64 + txBudget := txBudgetPerPass + for txBudget > 0 { + p1, p2 := txR.ReadAcquire() + if len(p1) == 0 { + break + } + if len(p1) > txBudget { + p1 = p1[:txBudget] + } + n1 := u.TryWrite(p1) + if n1 == 0 { + break + } + txBudget -= n1 + if n1 < len(p1) || txBudget == 0 { + txR.ReadRelease(n1) + s.probe.afterTX(d.id, u, rxR, txR, n1) + made = true + break + } + n2 := 0 + if len(p2) > 0 && txBudget > 0 { + if len(p2) > txBudget { + p2 = p2[:txBudget] + } + n2 = u.TryWrite(p2) + txBudget -= n2 + } + txR.ReadRelease(n1 + n2) + s.probe.afterTX(d.id, u, rxR, txR, n1+n2) made = true + if n2 == 0 || txBudget == 0 { + break + } } if made { - select { - case <-s.ctx.Done(): - d.logRingFullChange(s, true) - return - default: - } - runtime.Gosched() continue } + s.probe.periodic(d.id, u, rxR, txR) + // Idle: wait for any edge, then re-check. - d.logRingFullChange(s, false) select { case <-s.ctx.Done(): - d.logRingFullChange(s, true) return case <-u.Readable(): case <-u.Writable(): @@ -602,56 +446,6 @@ func (d *Device) reactor(s *session) { } } -func (d *Device) pumpTX(u core.SerialPort, txR *shmring.Ring, budget int) bool { - moved := 0 - - for moved < budget { - p1, p2 := txR.ReadAcquire() - if len(p1) == 0 { - break - } - - remaining := budget - moved - p1 = limitSpan(p1, remaining) - n1 := u.TryWrite(p1) - if n1 == 0 { - break - } - n := n1 - moved += n1 - if n1 < len(p1) { - txR.ReadRelease(n) - break - } - - remaining = budget - moved - if remaining > 0 && len(p2) > 0 { - p2 = limitSpan(p2, remaining) - n2 := u.TryWrite(p2) - n += n2 - moved += n2 - if n2 < len(p2) { - txR.ReadRelease(n) - break - } - } - - txR.ReadRelease(n) - } - - return moved > 0 -} - -func limitSpan(p []byte, max int) []byte { - if max <= 0 { - return p[:0] - } - if len(p) > max { - return p[:max] - } - return p -} - // ---- Helpers ---- func isPow2(n int) bool { return n > 0 && (n&(n-1)) == 0 } diff --git a/services/hal/devices/serial_raw/builder_test.go 
b/services/hal/devices/serial_raw/builder_test.go deleted file mode 100644 index 3b71105..0000000 --- a/services/hal/devices/serial_raw/builder_test.go +++ /dev/null @@ -1,269 +0,0 @@ -package serial_raw - -import ( - "context" - "strings" - "sync" - "sync/atomic" - "testing" - "time" - - "devicecode-go/services/hal/internal/core" - "devicecode-go/services/otadiag" - "devicecode-go/types" -) - -type fakeSerialPort struct { - readable chan struct{} - writable chan struct{} - - continuousRX atomic.Bool - writeCalls atomic.Int32 - readCalls atomic.Int32 - maxReadLen atomic.Int32 - maxWriteLen atomic.Int32 - rxBuffered atomic.Int32 - rxBufferCap atomic.Int32 - rxDrops atomic.Uint32 - rxOverrun atomic.Uint32 - rxBreak atomic.Uint32 - rxParity atomic.Uint32 - rxFraming atomic.Uint32 - - mu sync.Mutex - written []byte -} - -func newFakeSerialPort() *fakeSerialPort { - p := &fakeSerialPort{ - readable: make(chan struct{}, 1), - writable: make(chan struct{}, 1), - } - p.signalReadable() - p.signalWritable() - return p -} - -func (p *fakeSerialPort) RXBuffered() int { return int(p.rxBuffered.Load()) } -func (p *fakeSerialPort) RXBufferCap() int { return int(p.rxBufferCap.Load()) } -func (p *fakeSerialPort) RXDropCount() uint32 { return p.rxDrops.Load() } -func (p *fakeSerialPort) RXOverrunCount() uint32 { return p.rxOverrun.Load() } -func (p *fakeSerialPort) RXBreakCount() uint32 { return p.rxBreak.Load() } -func (p *fakeSerialPort) RXParityCount() uint32 { return p.rxParity.Load() } -func (p *fakeSerialPort) RXFramingCount() uint32 { return p.rxFraming.Load() } - -func (p *fakeSerialPort) TryRead(dst []byte) int { - p.readCalls.Add(1) - recordMax(&p.maxReadLen, len(dst)) - if !p.continuousRX.Load() || len(dst) == 0 { - return 0 - } - for i := range dst { - dst[i] = 'r' - } - p.signalReadable() - return len(dst) -} - -func (p *fakeSerialPort) TryWrite(src []byte) int { - p.writeCalls.Add(1) - recordMax(&p.maxWriteLen, len(src)) - if len(src) == 0 { - return 0 - } - 
p.mu.Lock() - p.written = append(p.written, src...) - p.mu.Unlock() - p.signalWritable() - return len(src) -} - -func (p *fakeSerialPort) Readable() <-chan struct{} { return p.readable } -func (p *fakeSerialPort) Writable() <-chan struct{} { return p.writable } -func (p *fakeSerialPort) Flush() error { return nil } - -func (p *fakeSerialPort) signalReadable() { - select { - case p.readable <- struct{}{}: - default: - } -} - -func (p *fakeSerialPort) signalWritable() { - select { - case p.writable <- struct{}{}: - default: - } -} - -func (p *fakeSerialPort) writtenBytes() []byte { - p.mu.Lock() - defer p.mu.Unlock() - out := make([]byte, len(p.written)) - copy(out, p.written) - return out -} - -func recordMax(max *atomic.Int32, n int) { - for { - cur := max.Load() - if int32(n) <= cur { - return - } - if max.CompareAndSwap(cur, int32(n)) { - return - } - } -} - -func newTestDevice(port *fakeSerialPort) *Device { - return &Device{ - id: "uart1_raw", - a: core.CapAddr{Domain: "io", Kind: types.KindSerial, Name: "uart1"}, - port: port, - } -} - -func drainRXUntil(ctx context.Context, s *session) { - var buf [128]byte - for { - if s.rxRing.TryReadInto(buf[:]) > 0 { - continue - } - select { - case <-ctx.Done(): - return - case <-s.rxRing.Readable(): - } - } -} - -func waitUntil(t *testing.T, timeout time.Duration, pred func() bool) { - t.Helper() - deadline := time.Now().Add(timeout) - for time.Now().Before(deadline) { - if pred() { - return - } - time.Sleep(time.Millisecond) - } - t.Fatal("condition was not met before timeout") -} - -func TestDriverPressureLogIncludesPumpEvidence(t *testing.T) { - port := newFakeSerialPort() - port.rxBuffered.Store(128) - port.rxBufferCap.Store(128) - port.rxDrops.Store(3) - port.rxOverrun.Store(2) - port.rxFraming.Store(1) - dev := newTestDevice(port) - dev.startSession(512, 512) - defer dev.stopSession() - - s := dev.sess - s.lastRXPumpAt = time.Now().Add(-50 * time.Millisecond) - s.lastRXPumpMoved = 0 - s.lastRXPumpDurMS = 0 - 
s.lastRXPumpGapMS = 50 - - var lines []string - restore := otadiag.SetSinkForTest(func(line string) { - lines = append(lines, line) - }) - defer restore() - - dev.logDriverPressure(s, true) - - joined := strings.Join(lines, "\n") - for _, want := range []string{ - "[serial-raw]", - "ev rx_driver_pressure", - "uart uart1", - "driver_used 128", - "driver_cap 128", - "ring_space 512", - "since_rx_pump_ms", - "last_pump_gap_ms 50", - "rx_drops 3", - "rx_overrun 2", - "rx_framing 1", - "ev rx_pump_gap", - } { - if !strings.Contains(joined, want) { - t.Fatalf("pressure log missing %q:\n%s", want, joined) - } - } -} - -func TestReactorServicesTXWhileRXIsContinuous(t *testing.T) { - port := newFakeSerialPort() - port.continuousRX.Store(true) - dev := newTestDevice(port) - dev.startSession(512, 512) - - drainCtx, stopDrain := context.WithCancel(context.Background()) - defer stopDrain() - go drainRXUntil(drainCtx, dev.sess) - - payload := []byte("tx while rx is busy") - if n := dev.sess.txRing.TryWriteFrom(payload); n != len(payload) { - t.Fatalf("failed to seed tx ring: wrote %d/%d", n, len(payload)) - } - - waitUntil(t, 100*time.Millisecond, func() bool { - return port.writeCalls.Load() > 0 - }) - - if got := string(port.writtenBytes()); got != string(payload) { - t.Fatalf("written payload mismatch: got %q want %q", got, payload) - } - if max := port.maxReadLen.Load(); max > serialRawPumpRXBudget { - t.Fatalf("TryRead span exceeded budget: got %d want <= %d", max, serialRawPumpRXBudget) - } - if max := port.maxWriteLen.Load(); max > serialRawPumpTXBudget { - t.Fatalf("TryWrite span exceeded budget: got %d want <= %d", max, serialRawPumpTXBudget) - } - - dev.stopSession() -} - -func TestStopSessionReturnsUnderContinuousRX(t *testing.T) { - port := newFakeSerialPort() - port.continuousRX.Store(true) - dev := newTestDevice(port) - dev.startSession(512, 512) - - drainCtx, stopDrain := context.WithCancel(context.Background()) - defer stopDrain() - go drainRXUntil(drainCtx, 
dev.sess) - - done := make(chan struct{}) - go func() { - dev.stopSession() - close(done) - }() - - select { - case <-done: - case <-time.After(100 * time.Millisecond): - t.Fatal("stopSession did not return under continuous RX") - } -} - -func TestStopSessionReturnsWhenIdle(t *testing.T) { - dev := newTestDevice(newFakeSerialPort()) - dev.startSession(512, 512) - - done := make(chan struct{}) - go func() { - dev.stopSession() - close(done) - }() - - select { - case <-done: - case <-time.After(100 * time.Millisecond): - t.Fatal("stopSession did not return while idle") - } -} diff --git a/services/hal/devices/serial_raw/uartx_probe_default.go b/services/hal/devices/serial_raw/uartx_probe_default.go new file mode 100644 index 0000000..e322140 --- /dev/null +++ b/services/hal/devices/serial_raw/uartx_probe_default.go @@ -0,0 +1,16 @@ +//go:build !uartx_probe + +package serial_raw + +import ( + "devicecode-go/services/hal/internal/core" + "devicecode-go/x/shmring" +) + +type uartxProbe struct{} + +func (p *uartxProbe) start(id string, port core.SerialPort, rxR, txR *shmring.Ring) {} +func (p *uartxProbe) afterRX(id string, port core.SerialPort, rxR, txR *shmring.Ring, n int) {} +func (p *uartxProbe) afterTX(id string, port core.SerialPort, rxR, txR *shmring.Ring, n int) {} +func (p *uartxProbe) rxRingFull(id string, port core.SerialPort, rxR, txR *shmring.Ring) {} +func (p *uartxProbe) periodic(id string, port core.SerialPort, rxR, txR *shmring.Ring) {} diff --git a/services/hal/devices/serial_raw/uartx_probe_enabled.go b/services/hal/devices/serial_raw/uartx_probe_enabled.go new file mode 100644 index 0000000..c9b0732 --- /dev/null +++ b/services/hal/devices/serial_raw/uartx_probe_enabled.go @@ -0,0 +1,117 @@ +//go:build uartx_probe + +package serial_raw + +import ( + "time" + + "devicecode-go/services/hal/internal/core" + "devicecode-go/x/shmring" +) + +type uartxProbe struct { + armed bool + nextPeriodic time.Time + nextLoss time.Time + last core.SerialDebugStats + 
lastLoss uint32 +} + +func (p *uartxProbe) start(id string, port core.SerialPort, rxR, txR *shmring.Ring) { + p.print("start", id, port, rxR, txR) + p.nextPeriodic = time.Now().Add(2 * time.Second) + p.armed = true +} + +func (p *uartxProbe) afterRX(id string, port core.SerialPort, rxR, txR *shmring.Ring, n int) { + if n <= 0 { + return + } + p.printIfLossChanged("rx", id, port, rxR, txR) +} + +func (p *uartxProbe) afterTX(id string, port core.SerialPort, rxR, txR *shmring.Ring, n int) { + if n <= 0 { + return + } + p.printIfLossChanged("tx", id, port, rxR, txR) +} + +func (p *uartxProbe) rxRingFull(id string, port core.SerialPort, rxR, txR *shmring.Ring) { + // This is the HAL session ring, not the UARTX ISR ring. It matters because + // if it stays full, the session worker cannot drain the UARTX RX ring. + if rxR.Space() == 0 { + p.print("session_rx_ring_full", id, port, rxR, txR) + } +} + +func (p *uartxProbe) periodic(id string, port core.SerialPort, rxR, txR *shmring.Ring) { + now := time.Now() + if !p.armed || now.After(p.nextPeriodic) { + p.print("periodic", id, port, rxR, txR) + p.nextPeriodic = now.Add(2 * time.Second) + p.armed = true + } +} + +func (p *uartxProbe) printIfLossChanged(reason, id string, port core.SerialPort, rxR, txR *shmring.Ring) { + s, ok := debugStats(port) + if !ok { + return + } + loss := totalLoss(s) + // Keep the probe diagnostic but not self-defeating. Printing every dropped + // byte can itself stall the serial pump and create more drops. Emit promptly + // for the first loss edge, then coalesce further loss changes by count or + // time; max-occupancy changes are still visible in periodic snapshots. 
+ now := time.Now() + if loss != p.lastLoss && (p.lastLoss == 0 || loss-p.lastLoss >= 128 || now.After(p.nextLoss)) { + p.print(reason, id, port, rxR, txR) + p.nextLoss = now.Add(500 * time.Millisecond) + } +} + +func totalLoss(s core.SerialDebugStats) uint32 { + return s.RXRingDrops + s.RXOverrun + s.RXBreak + s.RXParity + s.RXFraming +} + +func debugStats(port core.SerialPort) (core.SerialDebugStats, bool) { + d, ok := port.(core.SerialDiagnostics) + if !ok { + return core.SerialDebugStats{}, false + } + return d.DebugStats(), true +} + +func (p *uartxProbe) print(reason, id string, port core.SerialPort, rxR, txR *shmring.Ring) { + s, ok := debugStats(port) + if !ok { + return + } + loss := totalLoss(s) + println("[uartx-probe]", id, + "reason", reason, + "rx_hw", s.RXHWBytes, + "rx_enq", s.RXEnqueued, + "rx_read", s.RXReadBytes, + "rx_drop", s.RXRingDrops, + "rx_oe", s.RXOverrun, + "rx_fe", s.RXFraming, + "rx_pe", s.RXParity, + "rx_be", s.RXBreak, + "rx_max", s.RXRingMax, + "rx_notify_drop", s.RXNotifyDrop, + "tx_acc", s.TXAccepted, + "tx_hw", s.TXHWBytes, + "tx_full", s.TXRingFull, + "tx_max", s.TXRingMax, + "tx_notify_drop", s.TXNotifyDrop, + "sess_rx_avail", rxR.Available(), + "sess_rx_space", rxR.Space(), + "sess_tx_avail", txR.Available(), + "sess_tx_space", txR.Space(), + "loss", loss, + ) + p.last = s + p.lastLoss = loss +} diff --git a/services/hal/internal/core/resources.go b/services/hal/internal/core/resources.go index f589b24..84e951b 100644 --- a/services/hal/internal/core/resources.go +++ b/services/hal/internal/core/resources.go @@ -127,6 +127,36 @@ type SerialFormatConfigurator interface { SetFormat(databits, stopbits uint8, parity string) error } +// SerialDebugStats is an optional provider-specific diagnostic snapshot for +// UART-like serial resources. Values are coarse counters used to attribute data +// loss during hardware tests; they are not part of the stable HAL contract. 
+type SerialDebugStats struct { + RXIRQ uint32 + RXHWBytes uint32 + RXEnqueued uint32 + RXRingDrops uint32 + RXOverrun uint32 + RXBreak uint32 + RXParity uint32 + RXFraming uint32 + RXRingMax uint32 + RXReadBytes uint32 + RXReadEmpty uint32 + RXNotifyDrop uint32 + + TXIRQ uint32 + TXAccepted uint32 + TXHWBytes uint32 + TXRingFull uint32 + TXRingMax uint32 + TXTryCalls uint32 + TXNotifyDrop uint32 +} + +type SerialDiagnostics interface { + DebugStats() SerialDebugStats +} + // ---- Unified registry interface ---- type ResourceRegistry interface { diff --git a/services/hal/internal/provider/rp2_resources.go b/services/hal/internal/provider/rp2_resources.go index 8967322..8f08b72 100644 --- a/services/hal/internal/provider/rp2_resources.go +++ b/services/hal/internal/provider/rp2_resources.go @@ -707,57 +707,22 @@ type rp2SerialPort struct{ u *uartx.UART } func (p *rp2SerialPort) Readable() <-chan struct{} { return p.u.Readable() } func (p *rp2SerialPort) Writable() <-chan struct{} { return p.u.Writable() } -func (p *rp2SerialPort) TryRead(b []byte) int { return p.u.TryRead(b) } -func (p *rp2SerialPort) TryWrite(b []byte) int { return p.u.TryWrite(b) } -func (p *rp2SerialPort) Flush() error { return p.u.Flush() } - -func (p *rp2SerialPort) RXBuffered() int { - if p.u == nil || p.u.Buffer == nil { - return -1 - } - return int(p.u.Buffer.Used()) -} - -func (p *rp2SerialPort) RXBufferCap() int { - if p.u == nil || p.u.Buffer == nil { - return -1 - } - return int(p.u.Buffer.Size()) -} - -func (p *rp2SerialPort) RXDropCount() uint32 { - if p.u == nil { - return 0 - } - return p.u.RXDropCount() -} - -func (p *rp2SerialPort) RXOverrunCount() uint32 { - if p.u == nil { - return 0 - } - return p.u.RXOverrunCount() -} - -func (p *rp2SerialPort) RXBreakCount() uint32 { - if p.u == nil { - return 0 - } - return p.u.RXBreakCount() -} - -func (p *rp2SerialPort) RXParityCount() uint32 { - if p.u == nil { - return 0 - } - return p.u.RXParityCount() -} - -func (p *rp2SerialPort) 
RXFramingCount() uint32 { - if p.u == nil { - return 0 +func (p *rp2SerialPort) TryRead(b []byte) int { + n := p.u.TryRead(b) + p.u.NoteRXRead(n) + return n +} +func (p *rp2SerialPort) TryWrite(b []byte) int { return p.u.TryWrite(b) } +func (p *rp2SerialPort) Flush() error { return p.u.Flush() } +func (p *rp2SerialPort) DebugStats() core.SerialDebugStats { + s := p.u.Stats() + return core.SerialDebugStats{ + RXIRQ: s.RXIRQ, RXHWBytes: s.RXHWBytes, RXEnqueued: s.RXEnqueued, RXRingDrops: s.RXRingDrops, + RXOverrun: s.RXOverrun, RXBreak: s.RXBreak, RXParity: s.RXParity, RXFraming: s.RXFraming, + RXRingMax: s.RXRingMax, RXReadBytes: s.RXReadBytes, RXReadEmpty: s.RXReadEmpty, RXNotifyDrop: s.RXNotifyDrop, + TXIRQ: s.TXIRQ, TXAccepted: s.TXAccepted, TXHWBytes: s.TXHWBytes, TXRingFull: s.TXRingFull, + TXRingMax: s.TXRingMax, TXTryCalls: s.TXTryCalls, TXNotifyDrop: s.TXNotifyDrop, } - return p.u.RXFramingCount() } func (p *rp2SerialPort) SetBaudRate(br uint32) error { p.u.SetBaudRate(br); return nil } diff --git a/services/hal/internal/provider/setup_none.go b/services/hal/internal/provider/setup_none.go index 9bfa5b8..77e6ecf 100644 --- a/services/hal/internal/provider/setup_none.go +++ b/services/hal/internal/provider/setup_none.go @@ -1,4 +1,4 @@ -//go:build (rp2040 || rp2350) && !(pico_rich_dev || pico_bb_proto_1) +//go:build !((rp2040 || rp2350) && (pico_rich_dev || pico_bb_proto_1 || pico_cm5_emulator)) package provider diff --git a/services/hal/internal/provider/setup_selected.go b/services/hal/internal/provider/setup_selected.go index f865eef..20a4e6e 100644 --- a/services/hal/internal/provider/setup_selected.go +++ b/services/hal/internal/provider/setup_selected.go @@ -1,4 +1,4 @@ -//go:build (rp2040 || rp2350) && (pico_rich_dev || pico_bb_proto_1) +//go:build (rp2040 || rp2350) && (pico_rich_dev || pico_bb_proto_1 || pico_cm5_emulator) package provider diff --git a/services/hal/internal/provider/setups/pico_bb_proto_1.go 
b/services/hal/internal/provider/setups/pico_bb_proto_1.go index ae3d94d..df47893 100644 --- a/services/hal/internal/provider/setups/pico_bb_proto_1.go +++ b/services/hal/internal/provider/setups/pico_bb_proto_1.go @@ -24,6 +24,11 @@ var SelectedPlan = ResourcePlan{ }, } +// Keep raw serial session rings deliberately modest. Fabric is a framed +// stream protocol and should remain correct under bounded buffering; these +// are not intended to hold full transfer frames. +const rawSerialSessionSize = 512 + var SelectedSetup = types.HALConfig{ Devices: []types.HALDevice{ @@ -48,8 +53,8 @@ var SelectedSetup = types.HALConfig{ Domain: "io", Name: "uart0", Baud: 115_200, - RXSize: 32, - TXSize: 2048, + RXSize: rawSerialSessionSize, + TXSize: rawSerialSessionSize, }}, // Raw serial device bound to uart1 (public address hal/cap/io/serial/uart1/…) @@ -58,8 +63,8 @@ var SelectedSetup = types.HALConfig{ Domain: "io", Name: "uart1", Baud: 115_200, - RXSize: 256, - TXSize: 2048, + RXSize: rawSerialSessionSize, + TXSize: rawSerialSessionSize, }}, {ID: "charger0", Type: "ltc4015", Params: ltc4015dev.Params{ diff --git a/services/hal/internal/provider/setups/pico_cm5_emulator.go b/services/hal/internal/provider/setups/pico_cm5_emulator.go new file mode 100644 index 0000000..a91447b --- /dev/null +++ b/services/hal/internal/provider/setups/pico_cm5_emulator.go @@ -0,0 +1,34 @@ +//go:build (rp2040 || rp2350) && pico_cm5_emulator + +package setups + +import ( + serialraw "devicecode-go/services/hal/devices/serial_raw" + "devicecode-go/types" +) + +var SelectedPlan = ResourcePlan{ + UART: []UARTPlan{ + // Pico 1 CM5 emulator link UART. + // Wire Pico 1 GP0/TX -> Pico 2 GP5/RX and Pico 1 GP1/RX <- Pico 2 GP4/TX. + {ID: "uart0", TX: 0, RX: 1, Baud: 115_200}, + }, +} + +// Keep the emulator link under the same bounded serial-session constraint as +// the MCU Fabric link; the emulator must stream and apply flow control rather +// than buffering whole Fabric frames in the HAL session. 
+const rawSerialSessionSize = 512 + +var SelectedSetup = types.HALConfig{ + Devices: []types.HALDevice{ + {ID: "uart0_raw", Type: "serial_raw", Params: serialraw.Params{ + Bus: "uart0", + Domain: "io", + Name: "uart0", + Baud: 115_200, + RXSize: rawSerialSessionSize, + TXSize: rawSerialSessionSize, + }}, + }, +} diff --git a/services/hal/internal/provider/setups/pico_rich_dev.go b/services/hal/internal/provider/setups/pico_rich_dev.go index d2bd8e8..ddbefb2 100644 --- a/services/hal/internal/provider/setups/pico_rich_dev.go +++ b/services/hal/internal/provider/setups/pico_rich_dev.go @@ -23,6 +23,11 @@ var SelectedPlan = ResourcePlan{ }, } +// Keep raw serial session rings deliberately modest. Fabric is a framed +// stream protocol and should remain correct under bounded buffering; these +// are not intended to hold full transfer frames. +const rawSerialSessionSize = 512 + var SelectedSetup = types.HALConfig{ Devices: []types.HALDevice{ @@ -42,8 +47,8 @@ var SelectedSetup = types.HALConfig{ Domain: "io", Name: "uart0", Baud: 115_200, - RXSize: 32, - TXSize: 2048, + RXSize: rawSerialSessionSize, + TXSize: rawSerialSessionSize, }}, // Raw serial device bound to uart1 (public address hal/cap/io/serial/uart1/…) @@ -52,8 +57,8 @@ var SelectedSetup = types.HALConfig{ Domain: "io", Name: "uart1", Baud: 115_200, - RXSize: 32, - TXSize: 512, + RXSize: rawSerialSessionSize, + TXSize: rawSerialSessionSize, }}, {ID: "charger0", Type: "ltc4015", Params: ltc4015dev.Params{ diff --git a/services/otadiag/otadiag.go b/services/otadiag/otadiag.go index b31999b..46e545d 100644 --- a/services/otadiag/otadiag.go +++ b/services/otadiag/otadiag.go @@ -16,9 +16,10 @@ type Field struct { } var ( - startedAt = time.Now() - nextSeq atomic.Uint64 - verbose atomic.Bool + startedAt = time.Now() + nextSeq atomic.Uint64 + verbose atomic.Bool + heartbeatDeadlineMS atomic.Int64 sinkMu sync.Mutex sink func(string) @@ -40,6 +41,13 @@ func KV(key string, value any) Field { return Field{Key: key, 
Value: valueString(value)} } +// Enabled reports whether Event would emit a line for prefix/event under the +// current verbosity policy. It is used by TinyGo hot paths to avoid building +// diagnostic Field values that would be filtered out. +func Enabled(prefix, event string) bool { + return allowEvent(prefix, event, nil) +} + func Event(prefix, event, xferID string, fields ...Field) { if !allowEvent(prefix, event, fields) { return @@ -104,6 +112,21 @@ func SetUpdaterSnapshot(s StageSnapshot) { windowMu.Unlock() } +func SetHeartbeatDeadline(d time.Duration) { + if d <= 0 { + d = 45 * time.Second + } + heartbeatDeadlineMS.Store(d.Milliseconds()) +} + +func currentHeartbeatDeadline() time.Duration { + ms := heartbeatDeadlineMS.Load() + if ms <= 0 { + return 45 * time.Second + } + return time.Duration(ms) * time.Millisecond +} + func StartUpdateWindow(reason, xferID string) { if xferID == "" { xferID = XferNone @@ -166,7 +189,7 @@ func WindowActive() bool { func heartbeatLoop(stop <-chan struct{}) { ticker := time.NewTicker(time.Second) defer ticker.Stop() - deadline := time.NewTimer(45 * time.Second) + deadline := time.NewTimer(currentHeartbeatDeadline()) defer deadline.Stop() for { @@ -217,17 +240,24 @@ func allowEvent(prefix, event string, fields []Field) bool { if verbose.Load() { return true } + // Normal firmware builds keep Fabric/OTA observability as retained counters + // and state, not as per-frame/per-chunk log lines. The sink used by tests can + // still opt into the detailed stream through SetSinkForTest/verbose. 
switch prefix { - case "[serial-raw]", "[fabric-rx]", "[fabric-rpc]", "[fabric-handshake]": - return true case "[mcu-ota]": return event == "heartbeat_start" || event == "heartbeat_stop" + case "[updater-commit]": + return true case "[fabric-xfer]": return allowFabricXferEvent(event, fields) case "[updater-stream]": return allowUpdaterStreamEvent(event) + case "[serial-raw]", "[fabric-rx]", "[fabric-rpc]", "[fabric-handshake]": + return strings.HasSuffix(event, "_error") || + strings.Contains(event, "reject") || + strings.Contains(event, "abort") default: - return true + return false } } @@ -236,7 +266,9 @@ func allowFabricXferEvent(event string, fields []Field) bool { return true } switch event { - case "abort_local", "abort_tx", "done_tx", "ready_tx", "malformed_retry", "transfer_mem_sample": + case "abort_local", "abort_tx", "done_tx", "ready_tx", "malformed_retry", "transfer_mem_sample", + "progress", "timeout", "sink_write_slow", "commit_slow", + "transfer_commit_start", "transfer_commit_done", "target_call_start", "target_call_reply": return true case "need_tx": next := fieldValue(fields, "next") @@ -247,7 +279,7 @@ func allowFabricXferEvent(event string, fields []Field) bool { fieldValue(fields, "skipped") != "" case "chunk_decode_done", "chunk_digest_done": return fieldValue(fields, "ok") == "false" - case "chunk_stale_offset", "chunk_future_offset", "chunk_size_overflow", + case "chunk_reject", "chunk_stale_offset", "chunk_future_offset", "chunk_size_overflow", "sink_write_error", "chunk_write_error": return true default: @@ -262,7 +294,7 @@ func allowUpdaterStreamEvent(event string) bool { return true } switch event { - case "begin_entry", "lease_ok", "begin_exit", + case "begin_entry", "lease_ok", "begin_exit", "commit_result", "stage_reply", "flash_erase_start", "flash_erase_done", "flash_erase_error", "flash_program_error", "program_page_error": return true diff --git a/services/otadiag/otadiag_test.go b/services/otadiag/otadiag_test.go index 
590d93f..02139d8 100644 --- a/services/otadiag/otadiag_test.go +++ b/services/otadiag/otadiag_test.go @@ -29,27 +29,24 @@ func TestDefaultFilterKeepsActionableEvents(t *testing.T) { lines, restore := captureDefaultFilteredEvents() defer restore() - Event("[serial-raw]", "rx_driver_pressure", XferNone, KV("uart", "uart0")) - Event("[fabric-rx]", "read_line", XferNone, KV("type", "ping")) - Event("[fabric-rpc]", "sent", XferNone, KV("call_id", "call-1")) + Event("[serial-raw]", "rx_ring_error", XferNone, KV("uart", "uart0")) + Event("[fabric-rx]", "read_line_error", XferNone, KV("reason", "line_too_long")) + Event("[fabric-rpc]", "call_reject", XferNone, KV("call_id", "call-1")) Event("[mcu-ota]", "heartbeat_start", "xfer-1", KV("reason", "prepare")) Event("[mcu-ota]", "heartbeat_stop", "xfer-1", KV("reason", "done")) - Event("[fabric-xfer]", "begin_rx", "xfer-1", KV("target", "updater/main")) - Event("[fabric-xfer]", "ready_tx", "xfer-1", KV("ok", true)) - Event("[fabric-xfer]", "need_tx", "xfer-1", KV("next", 0), KV("ok", true)) - Event("[fabric-xfer]", "chunk_digest_done", "xfer-1", KV("ok", false), KV("reason", "chunk_digest_mismatch")) + Event("[fabric-xfer]", "xfer_abort", "xfer-1", KV("reason", "cancelled")) Event("[fabric-xfer]", "sink_write_error", "xfer-1", KV("reason", "write_boom")) - Event("[updater-stream]", "prepare_rx", XferNone, KV("job_id", "job-1")) - Event("[updater-stream]", "flash_erase_start", "xfer-1", KV("offset", 0)) + Event("[updater-stream]", "prepare_reject", XferNone, KV("reason", "busy")) Event("[updater-stream]", "image_signature_verify_error", "xfer-1", KV("reason", "bad_signature")) + Event("[updater-commit]", "rx", XferNone, KV("job_id", "job-1")) got := strings.Join(*lines, "\n") for _, want := range []string{ "[serial-raw]", "[fabric-rx]", "[fabric-rpc]", "ev heartbeat_start", "ev heartbeat_stop", - "ev begin_rx", "ev ready_tx", "ev need_tx", - "ev chunk_digest_done", "ev sink_write_error", - "ev prepare_rx", "ev 
flash_erase_start", "ev image_signature_verify_error", + "ev xfer_abort", "ev sink_write_error", + "ev prepare_reject", "ev image_signature_verify_error", + "[updater-commit]", "ev rx", } { if !strings.Contains(got, want) { t.Fatalf("default filter output missing %q:\n%s", want, got) diff --git a/services/otadiag/verbose_trace.go b/services/otadiag/verbose_trace.go new file mode 100644 index 0000000..10b0232 --- /dev/null +++ b/services/otadiag/verbose_trace.go @@ -0,0 +1,7 @@ +//go:build ota_trace + +package otadiag + +func init() { + verbose.Store(true) +} diff --git a/services/reactor/build_policy_apply_test.go b/services/reactor/build_policy_apply_test.go new file mode 100644 index 0000000..8b86e1f --- /dev/null +++ b/services/reactor/build_policy_apply_test.go @@ -0,0 +1,17 @@ +//go:build !qa_reactor && !fabric_uart_hwtest && fabric_stage_enabled && fabric_apply_enabled && !fabric_uart_selftest + +package reactor + +import "testing" + +func TestBuildPolicyApply(t *testing.T) { + if got := fabricTransferMode(); got != "stage-controller:flash-stage" { + t.Fatalf("fabricTransferMode() = %q", got) + } + if got := updaterRuntimeMode(); got != "production-applier:commit-reboots" { + t.Fatalf("updaterRuntimeMode() = %q", got) + } + if !useHardwareFabricUART() { + t.Fatalf("fabric_apply_enabled production build should use hardware Fabric UART") + } +} diff --git a/services/reactor/build_policy_apply_without_stage_test.go b/services/reactor/build_policy_apply_without_stage_test.go new file mode 100644 index 0000000..bd6fbaf --- /dev/null +++ b/services/reactor/build_policy_apply_without_stage_test.go @@ -0,0 +1,14 @@ +//go:build !qa_reactor && !fabric_stage_enabled && fabric_apply_enabled && !fabric_uart_hwtest && !fabric_uart_selftest + +package reactor + +import "testing" + +func TestBuildPolicyApplyWithoutStageIsSafe(t *testing.T) { + if got := fabricTransferMode(); got != "stage-disabled" { + t.Fatalf("fabricTransferMode() = %q", got) + } + if got := 
updaterRuntimeMode(); got != "safe-defaults:apply-disabled" { + t.Fatalf("updaterRuntimeMode() = %q", got) + } +} diff --git a/services/reactor/build_policy_default_test.go b/services/reactor/build_policy_default_test.go new file mode 100644 index 0000000..353dc44 --- /dev/null +++ b/services/reactor/build_policy_default_test.go @@ -0,0 +1,17 @@ +//go:build !qa_reactor && !fabric_uart_hwtest && !fabric_stage_enabled && !fabric_apply_enabled && !fabric_uart_selftest + +package reactor + +import "testing" + +func TestBuildPolicyDefault(t *testing.T) { + if got := fabricTransferMode(); got != "stage-disabled" { + t.Fatalf("fabricTransferMode() = %q", got) + } + if got := updaterRuntimeMode(); got != "safe-defaults:apply-disabled" { + t.Fatalf("updaterRuntimeMode() = %q", got) + } + if !useHardwareFabricUART() { + t.Fatalf("default build should use hardware Fabric UART") + } +} diff --git a/services/reactor/build_policy_flash_stage_test.go b/services/reactor/build_policy_flash_stage_test.go new file mode 100644 index 0000000..9bd0008 --- /dev/null +++ b/services/reactor/build_policy_flash_stage_test.go @@ -0,0 +1,17 @@ +//go:build !qa_reactor && !fabric_uart_hwtest && fabric_stage_enabled && !fabric_apply_enabled && !fabric_uart_selftest + +package reactor + +import "testing" + +func TestBuildPolicyFlashStage(t *testing.T) { + if got := fabricTransferMode(); got != "stage-controller:flash-stage" { + t.Fatalf("fabricTransferMode() = %q", got) + } + if got := updaterRuntimeMode(); got != "safe-defaults:apply-disabled" { + t.Fatalf("updaterRuntimeMode() = %q", got) + } + if !useHardwareFabricUART() { + t.Fatalf("fabric_stage_enabled should use hardware Fabric UART") + } +} diff --git a/services/reactor/build_policy_selftest_test.go b/services/reactor/build_policy_selftest_test.go new file mode 100644 index 0000000..049add1 --- /dev/null +++ b/services/reactor/build_policy_selftest_test.go @@ -0,0 +1,17 @@ +//go:build !qa_reactor && fabric_uart_hwtest && 
fabric_uart_selftest + +package reactor + +import "testing" + +func TestBuildPolicyUARTSelfTest(t *testing.T) { + if got := fabricTransferMode(); got != "stage-controller:hwtest" { + t.Fatalf("fabricTransferMode() = %q", got) + } + if got := updaterRuntimeMode(); got != "safe-defaults:apply-disabled" { + t.Fatalf("updaterRuntimeMode() = %q", got) + } + if useHardwareFabricUART() { + t.Fatalf("fabric_uart_selftest should disable the hardware Fabric UART") + } +} diff --git a/services/reactor/build_policy_uart_hwtest_test.go b/services/reactor/build_policy_uart_hwtest_test.go new file mode 100644 index 0000000..b2ded8b --- /dev/null +++ b/services/reactor/build_policy_uart_hwtest_test.go @@ -0,0 +1,17 @@ +//go:build !qa_reactor && fabric_uart_hwtest && !fabric_uart_selftest + +package reactor + +import "testing" + +func TestBuildPolicyUARTHWTest(t *testing.T) { + if got := fabricTransferMode(); got != "stage-controller:hwtest" { + t.Fatalf("fabricTransferMode() = %q", got) + } + if got := updaterRuntimeMode(); got != "safe-defaults:apply-disabled" { + t.Fatalf("updaterRuntimeMode() = %q", got) + } + if !useHardwareFabricUART() { + t.Fatalf("fabric_uart_hwtest should use hardware Fabric UART unless fabric_uart_selftest is also set") + } +} diff --git a/services/reactor/children.go b/services/reactor/children.go new file mode 100644 index 0000000..018e0ce --- /dev/null +++ b/services/reactor/children.go @@ -0,0 +1,163 @@ +//go:build !qa_reactor + +package reactor + +import ( + "context" + + "devicecode-go/services/telemetry" + "devicecode-go/services/updater" +) + +// FirmwareVersion/FirmwareBuild/FirmwareImageID are the stamps the updater +// publishes via state/self/software. They are development sentinels for this +// original-reactor branch; build tooling can override them later by adding a +// same-package init file when the release wiring is introduced. 
+var ( + FirmwareVersion = "0.0.0-dev" + FirmwareBuild = "local" + FirmwareImageID = "img-dev" +) + +func firmwareIdentity() updater.Identity { + return updater.Identity{ + Version: FirmwareVersion, + Build: FirmwareBuild, + ImageID: FirmwareImageID, + } +} + +// childState describes lifecycle owned by the top-level Reactor. It is +// intentionally small: children own their internal state machines; the Reactor +// only starts them, observes unexpected exits, and stops them with its own +// context. +type childState uint8 + +const ( + childStopped childState = iota + childRunning + childFailed +) + +type childExit struct { + name string + expected bool +} + +type childRuntime struct { + name string + run func(context.Context) + cancel context.CancelFunc + state childState +} + +type childSupervisor struct { + children []childRuntime + done chan childExit +} + +func (s *childSupervisor) Add(name string, run func(context.Context)) { + if name == "" || run == nil { + return + } + s.children = append(s.children, childRuntime{name: name, run: run, state: childStopped}) +} + +func (s *childSupervisor) StartAll(ctx context.Context) { + if s.done == nil { + s.done = make(chan childExit, 4) + } + for i := range s.children { + if s.children[i].state == childRunning { + continue + } + childCtx, cancel := context.WithCancel(ctx) + s.children[i].cancel = cancel + s.children[i].state = childRunning + name := s.children[i].name + run := s.children[i].run + done := s.done + go func() { + run(childCtx) + expected := childCtx.Err() != nil + select { + case done <- childExit{name: name, expected: expected}: + default: + } + }() + log.Println("[svc] ", name, " started") + } +} + +func (s *childSupervisor) Done() <-chan childExit { + if s == nil || s.done == nil { + return nil + } + return s.done +} + +func (s *childSupervisor) HandleExit(ev childExit) { + if s == nil || ev.name == "" { + return + } + for i := range s.children { + if s.children[i].name != ev.name { + continue + } + if 
ev.expected { + s.children[i].state = childStopped + log.Println("[svc] ", ev.name, " stopped") + } else { + s.children[i].state = childFailed + log.Println("[svc] ", ev.name, " exited unexpectedly") + } + return + } +} + +func (s *childSupervisor) StopAll() { + if s == nil { + return + } + for i := range s.children { + if s.children[i].cancel != nil { + s.children[i].cancel() + } + } +} + +func (r *Reactor) startCoreChildren(ctx context.Context) { + if r == nil || r.uiConn == nil { + return + } + + // Updater publishes retained state/self/{software,updater,health} facts and + // binds the local updater RPC topics. The default firmware build still keeps + // Fabric staging safe-disabled at the Reactor boundary; fabric_uart_hwtest or + // fabric_stage_enabled explicitly opt into using this service as Fabric + // StageController. + updater.GenerateBootID() + updaterConn := r.uiConn.NewChildConnection("updater") + if updaterConn != nil { + updaterSvc := updater.New(updaterServiceOptions(updaterConn)) + log.Println("[updater] policy ", updaterRuntimeMode()) + r.updaterSvc = updaterSvc + r.children.Add("updater", updaterSvc.Run) + } + + telemetryConn := r.uiConn.NewChildConnection("telemetry") + if telemetryConn != nil { + telemetrySvc := telemetry.New(telemetryConn) + r.children.Add("telemetry", telemetrySvc.Run) + } + r.addFabricSelfTestChild() + r.children.StartAll(ctx) +} + +func (r *Reactor) stopCoreChildren() { + if r == nil { + return + } + r.children.StopAll() + r.updaterSvc = nil +} diff --git a/services/reactor/fabric_link.go b/services/reactor/fabric_link.go new file mode 100644 index 0000000..856072f --- /dev/null +++ b/services/reactor/fabric_link.go @@ -0,0 +1,97 @@ +//go:build !qa_reactor + +package reactor + +import ( + "context" + "time" + + "devicecode-go/services/fabric" + "devicecode-go/types" + "devicecode-go/x/shmring" +) + +const ( + fabricUART = "uart0" + fabricLogPrefix = "[" + fabricUART + "] " + fabricStopWaitTimeout = 500 * time.Millisecond 
+) + +// fabricBuffers is allocated once at package scope so the UART/Fabric hot path +// does not construct line or transfer-sized buffers on demand. It is shared by +// at most one active Fabric session; the Reactor tears any old session down +// before starting a replacement. +var fabricBuffers fabric.FabricBuffers + +func waitFabricDone(done <-chan struct{}, timeout time.Duration) bool { + if done == nil { + return true + } + timer := time.NewTimer(timeout) + defer timer.Stop() + select { + case <-done: + return true + case <-timer.C: + return false + } +} + +func (r *Reactor) startPassiveFabric(ctx context.Context, ev types.SerialSessionOpened) { + if r == nil || r.uiConn == nil { + return + } + // Only one Fabric session may own the UART rings. A fresh HAL session_opened + // event replaces the previous session explicitly. + r.stopFabricLink() + + rx := shmring.Get(shmring.Handle(ev.RXHandle)) + tx := shmring.Get(shmring.Handle(ev.TXHandle)) + if rx == nil || tx == nil { + log.Println(fabricLogPrefix + "fabric session missing rings") + return + } + + tr := fabric.NewShmringTransportWithBuffers(rx, tx, &fabricBuffers) + fabricConn := r.uiConn.NewChildConnection("fabric") + if fabricConn == nil { + _ = tr.Close() + log.Println(fabricLogPrefix + "fabric session missing bus") + return + } + + stageController := r.fabricStageController() + transferMode := fabricTransferMode() + + fabricCtx, cancel := context.WithCancel(ctx) + done := make(chan struct{}) + r.fabricCancel = cancel + r.fabricDone = done + r.fabricSessionOpen = true + + log.Println(fabricLogPrefix+"fabric session opening node=mcu peer=bigbox-cm5 link=mcu-uart0 transfer=", transferMode) + go func() { + defer close(done) + defer tr.Close() + // The transfer policy is selected by build tag. The default firmware build + // uses a rejecting controller, so an unexpected xfer_begin cannot enter + // flash staging. 
fabric_uart_hwtest/fabric_stage_enabled explicitly opt in + // to the updater-owned stage controller. + fabric.RunWithOptions(fabricCtx, tr, fabricConn, "mcu", "bigbox-cm5", fabric.DefaultLinkConfig(), fabric.RunOptions{Buffers: &fabricBuffers, StageController: stageController}) + }() + log.Println(fabricLogPrefix+"fabric session opened node=mcu peer=bigbox-cm5 link=mcu-uart0 transfer=", transferMode) +} + +func (r *Reactor) stopFabricLink() { + if r == nil || r.fabricCancel == nil { + return + } + done := r.fabricDone + r.fabricCancel() + r.fabricCancel = nil + r.fabricDone = nil + r.fabricSessionOpen = false + if !waitFabricDone(done, fabricStopWaitTimeout) { + log.Println(fabricLogPrefix + "fabric session stop timed out") + } +} diff --git a/services/reactor/fabric_selftest_disabled.go b/services/reactor/fabric_selftest_disabled.go new file mode 100644 index 0000000..5163b6d --- /dev/null +++ b/services/reactor/fabric_selftest_disabled.go @@ -0,0 +1,5 @@ +//go:build !qa_reactor && (!fabric_uart_selftest || !fabric_uart_hwtest) + +package reactor + +func (r *Reactor) addFabricSelfTestChild() {} diff --git a/services/reactor/fabric_selftest_enabled.go b/services/reactor/fabric_selftest_enabled.go new file mode 100644 index 0000000..eacb26e --- /dev/null +++ b/services/reactor/fabric_selftest_enabled.go @@ -0,0 +1,36 @@ +//go:build !qa_reactor && fabric_uart_selftest && fabric_uart_hwtest + +package reactor + +import ( + "context" + "time" + + "devicecode-go/services/fabric" +) + +func (r *Reactor) addFabricSelfTestChild() { + if r == nil || r.uiConn == nil || r.updaterSvc == nil { + return + } + conn := r.uiConn.NewChildConnection("fabric-selftest") + if conn == nil { + return + } + r.children.Add("fabric-selftest", func(ctx context.Context) { + log.Println("[fabric-selftest] starting in-process UART cross-wire transfer") + res, err := fabric.RunUARTSelfTest(ctx, fabric.UARTSelfTestOptions{ + Conn: conn, + StageController: r.updaterSvc, + PayloadSize: 1024, + 
ChunkSize: 256, + Timeout: 10 * time.Second, + }) + if err != nil { + log.Println("[fabric-selftest] failed err=", err.Error()) + } else { + log.Println("[fabric-selftest] ok xfer=", res.XferID, " bytes=", int(res.PayloadSize), " chunk=", int(res.ChunkSize), " digest=", res.Digest) + } + <-ctx.Done() + }) +} diff --git a/services/reactor/fabric_stage_disabled.go b/services/reactor/fabric_stage_disabled.go new file mode 100644 index 0000000..185d81f --- /dev/null +++ b/services/reactor/fabric_stage_disabled.go @@ -0,0 +1,54 @@ +//go:build !qa_reactor && !fabric_uart_hwtest && !fabric_stage_enabled + +package reactor + +import ( + "errors" + + "devicecode-go/services/fabric" +) + +// rejectingFabricStageController is the default firmware transfer policy for +// this integration slice. It makes the Fabric transfer boundary explicit while +// guaranteeing that an unexpected xfer_begin cannot enter the TinyGo flash +// prestage path. Hardware cross-wire tests opt in to the updater-owned stage +// controller with the fabric_uart_hwtest build tag; production firmware can do +// the same later with fabric_stage_enabled once the flash path is ready. 
+type rejectingFabricStageController struct{} + +func fabricTransferMode() string { return "stage-disabled" } + +func (r *Reactor) fabricStageController() fabric.StageController { + return rejectingFabricStageController{} +} + +func (rejectingFabricStageController) BeginStreamedStage(xferID string, size uint32) (uint64, error) { + _ = xferID + _ = size + return 0, errors.New("stage_disabled") +} + +func (rejectingFabricStageController) WriteStreamedStage(xferID string, generation uint64, data []byte) error { + _ = xferID + _ = generation + _ = data + return errors.New("stage_disabled") +} + +func (rejectingFabricStageController) CommitStreamedStage(xferID string, generation uint64) (uint32, error) { + _ = xferID + _ = generation + return 0, errors.New("stage_disabled") +} + +func (rejectingFabricStageController) AbortStreamedStage(xferID string, generation uint64, reason string) { + _ = xferID + _ = generation + _ = reason +} + +func (rejectingFabricStageController) CancelStreamedStage(xferID string, generation uint64, reason string) { + _ = xferID + _ = generation + _ = reason +} diff --git a/services/reactor/fabric_stage_flash.go b/services/reactor/fabric_stage_flash.go new file mode 100644 index 0000000..5faba80 --- /dev/null +++ b/services/reactor/fabric_stage_flash.go @@ -0,0 +1,14 @@ +//go:build !qa_reactor && !fabric_uart_hwtest && fabric_stage_enabled + +package reactor + +import "devicecode-go/services/fabric" + +func fabricTransferMode() string { return "stage-controller:flash-stage" } + +func (r *Reactor) fabricStageController() fabric.StageController { + if r == nil { + return nil + } + return r.updaterSvc +} diff --git a/services/reactor/fabric_stage_hwtest.go b/services/reactor/fabric_stage_hwtest.go new file mode 100644 index 0000000..315e085 --- /dev/null +++ b/services/reactor/fabric_stage_hwtest.go @@ -0,0 +1,14 @@ +//go:build !qa_reactor && fabric_uart_hwtest + +package reactor + +import "devicecode-go/services/fabric" + +func 
fabricTransferMode() string { return "stage-controller:hwtest" } + +func (r *Reactor) fabricStageController() fabric.StageController { + if r == nil { + return nil + } + return r.updaterSvc +} diff --git a/services/reactor/fabric_uart_policy_default.go b/services/reactor/fabric_uart_policy_default.go new file mode 100644 index 0000000..590c6fb --- /dev/null +++ b/services/reactor/fabric_uart_policy_default.go @@ -0,0 +1,5 @@ +//go:build !qa_reactor && !fabric_uart_selftest + +package reactor + +func useHardwareFabricUART() bool { return true } diff --git a/services/reactor/fabric_uart_policy_selftest.go b/services/reactor/fabric_uart_policy_selftest.go new file mode 100644 index 0000000..f7906c3 --- /dev/null +++ b/services/reactor/fabric_uart_policy_selftest.go @@ -0,0 +1,5 @@ +//go:build !qa_reactor && fabric_uart_selftest + +package reactor + +func useHardwareFabricUART() bool { return false } diff --git a/services/reactor/qa_reactor.go b/services/reactor/qa_reactor.go index 5b4770c..813779f 100644 --- a/services/reactor/qa_reactor.go +++ b/services/reactor/qa_reactor.go @@ -128,11 +128,11 @@ const ( ) type Reactor struct { - bus *bus.Bus uiConn *bus.Connection // UART jsonOut *shmring.Ring // telemetry (JSON UART TX) + // Logger UART1 already handled by global logger (see SetUART1) // inputs (latest) vin_mV, vbat_mV int32 @@ -165,26 +165,15 @@ type Reactor struct { // telemetry drop counters (bytes) droppedUART0Bytes int - bootBuyRC int32 } -type Options struct { - BootBuyRC int32 -} - -func NewReactor(b *bus.Bus, uiConn *bus.Connection) *Reactor { - return NewReactorWithOptions(b, uiConn, Options{}) -} - -func NewReactorWithOptions(b *bus.Bus, uiConn *bus.Connection, opts Options) *Reactor { +func NewReactor(uiConn *bus.Connection) *Reactor { return &Reactor{ - bus: b, - uiConn: uiConn, - levelUp: true, - state: stateOff, - now: time.Now(), - bootBuyRC: opts.BootBuyRC, - ledTick: 0, + uiConn: uiConn, + levelUp: true, + state: stateOff, + now: time.Now(), + 
ledTick: 0, } } diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index 1cb0b3a..8065982 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -8,8 +8,6 @@ import ( "time" "devicecode-go/bus" - "devicecode-go/services/fabric" - "devicecode-go/services/telemetry" "devicecode-go/services/updater" "devicecode-go/types" "devicecode-go/utilities" @@ -17,78 +15,6 @@ import ( "devicecode-go/x/strconvx" ) -// FirmwareVersion/FirmwareBuild/FirmwareImageID are the stamps the updater -// publishes via state/self/software. main may override them before the reactor -// starts; defaults are development sentinels. -var ( - FirmwareVersion = "0.0.0-dev" - FirmwareBuild = "local" - FirmwareImageID = "img-dev" -) - -func firmwareIdentity() updater.Identity { - return updater.Identity{ - Version: FirmwareVersion, - Build: FirmwareBuild, - ImageID: FirmwareImageID, - } -} - -const ( - fabricWaitLogInterval = 2 * time.Second - fabricStopWaitTimeout = 500 * time.Millisecond -) - -func waitFabricDone(done <-chan struct{}, timeout time.Duration) bool { - if done == nil { - return true - } - timer := time.NewTimer(timeout) - defer timer.Stop() - select { - case <-done: - return true - case <-timer.C: - return false - } -} - -func waitForUpdaterCriticalFacts(ctx context.Context, conn *bus.Connection) bool { - if conn == nil { - return false - } - swSub := conn.Subscribe(updater.TopicSoftwareFact) - defer conn.Unsubscribe(swSub) - upSub := conn.Subscribe(updater.TopicUpdaterFact) - defer conn.Unsubscribe(upSub) - healthSub := conn.Subscribe(updater.TopicHealthFact) - defer conn.Unsubscribe(healthSub) - - softwareReady := false - updaterReady := false - healthReady := false - - for !(softwareReady && updaterReady && healthReady) { - select { - case <-ctx.Done(): - return false - case msg, ok := <-swSub.Channel(): - if ok && msg != nil && msg.Payload != nil { - softwareReady = true - } - case msg, ok := <-upSub.Channel(): - if ok && msg != nil && 
msg.Payload != nil { - updaterReady = true - } - case msg, ok := <-healthSub.Channel(): - if ok && msg != nil && msg.Payload != nil { - healthReady = true - } - } - } - return true -} - // ----------------------------------------------------------------------------- // Thresholds & timing // ----------------------------------------------------------------------------- @@ -171,6 +97,13 @@ func tSessClosed(name string) bus.Topic { return bus.T("hal", "cap", "io", "serial", name, "event", "session_closed") } +func subscriptionChannel(sub *bus.Subscription) <-chan *bus.Message { + if sub == nil { + return nil + } + return sub.Channel() +} + // ----------------------------------------------------------------------------- // Rail order (pre-gap semantics) // ----------------------------------------------------------------------------- @@ -203,9 +136,11 @@ const ( ) type Reactor struct { - bus *bus.Bus uiConn *bus.Connection + // UART + // Fabric uses uart0; human-readable logs are mirrored to uart1. + // inputs (latest) vin_mV, vbat_mV int32 iin_mA, ibat_mA int32 @@ -233,30 +168,31 @@ type Reactor struct { ledTick int // throttles breathe commands // misc - now time.Time - bootBuyRC int32 + now time.Time - // updater service handle used by the post-hello_ack republish hook. - updater *updater.Service -} + // telemetry drop counters (bytes) + droppedUART0Bytes int -type Options struct { - BootBuyRC int32 -} + // supervised children. The Reactor owns only lifecycle; child + // services own their own event loops and models. + children childSupervisor + updaterSvc *updater.Service -func NewReactor(b *bus.Bus, uiConn *bus.Connection) *Reactor { - return NewReactorWithOptions(b, uiConn, Options{}) + // Fabric link lifecycle. Fabric owns its protocol reactor; this top-level + // Reactor only opens/closes the HAL UART session and cancels the active + // Fabric session when the HAL session is replaced or closed. 
+ fabricCancel context.CancelFunc + fabricDone chan struct{} + fabricSessionOpen bool } -func NewReactorWithOptions(b *bus.Bus, uiConn *bus.Connection, opts Options) *Reactor { +func NewReactor(uiConn *bus.Connection) *Reactor { return &Reactor{ - bus: b, - uiConn: uiConn, - levelUp: true, - state: stateOff, - now: time.Now(), - bootBuyRC: opts.BootBuyRC, - ledTick: 0, + uiConn: uiConn, + levelUp: true, + state: stateOff, + now: time.Now(), + ledTick: 0, } } @@ -456,23 +392,29 @@ func (r *Reactor) OnCharger(v types.ChargerValue) { r.vin_mV = v.VIN_mV r.iin_mA = v.IIn_mA r.tsVIN = r.now + + // JSON: {"power/charger/internal/vin":..,"vsys":..,"iin":..} } func (r *Reactor) OnBattery(v types.BatteryValue) { r.vbat_mV = v.PackMilliV r.ibat_mA = v.IBatMilliA r.tsVBAT = r.now + + // JSON: {"power/battery/internal/vbat":..,"ibat":..} } -func (r *Reactor) OnTempDeciC(label string, deci int, _ string) { +func (r *Reactor) OnTempDeciC(label string, deci int, jsonKey string) { log.Deci(label, deci) } -// ---- memory snapshot (every ~3 s in main loop) ---- +// ---- memory snapshot telemetry (every ~2 s in main loop) ---- func (r *Reactor) emitMemSnapshot() { var ms runtime.MemStats + runtime.GC() runtime.ReadMemStats(&ms) + // log line log.Println( "[mem] ", "alloc:", int(ms.Alloc), " ", @@ -480,37 +422,13 @@ func (r *Reactor) emitMemSnapshot() { "mallocs:", int(ms.Mallocs), " ", "frees:", int(ms.Frees), ) + // JSON (minimal to keep overhead low) } func (r *Reactor) Run(ctx context.Context) { - // Updater service: state machine + updater prepare/commit RPC - // RPC handlers + updater/main staging + retained state/self/{software, - // updater, health} facts. Started early so the initial fact retains - // land before fabric establishes — that way the first hello_ack - // observer sees a populated retain store. 
- updaterConn := r.bus.NewConnection("updater") - identity := firmwareIdentity() - updaterSvc := updater.New(updater.Options{ - Conn: updaterConn, - Verifier: updater.SignedImageVerifier(), - Applier: updater.ProductionApplier(), - Identity: identity, - BootBuyRC: r.bootBuyRC, - }) - go updaterSvc.Run(ctx) - r.updater = updaterSvc - if !waitForUpdaterCriticalFacts(ctx, r.bus.NewConnection("updater-ready")) { - return - } - - // Telemetry service: subscribes to HAL value topics and republishes - // at state/self/* with integer engineering units; runs the charger - // alert FSM and emits event/self/power/charger/alert on bit-set - // transitions. Started after the updater so the initial software/ - // updater retains land first. - telemetryConn := r.bus.NewConnection("telemetry") - telemetrySvc := telemetry.New(telemetryConn) - go telemetrySvc.Run(ctx) + r.startCoreChildren(ctx) + defer r.stopCoreChildren() + defer r.stopFabricLink() // Subscriptions (env + power) log.Println("[main] subscribing env + power …") @@ -521,34 +439,28 @@ func (r *Reactor) Run(ctx context.Context) { stSub := r.uiConn.Subscribe(stTopic) evSub := r.uiConn.Subscribe(evTopic) - // UART session for the CM5 Fabric link on proto_1 hardware. - const uartFabric = "uart1" - subSessOpenFabric := r.uiConn.Subscribe(tSessOpened(uartFabric)) - subSessClosedFabric := r.uiConn.Subscribe(tSessClosed(uartFabric)) - r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) - - // Retry back-off guards - var retryFabricAt time.Time - - // Fabric session lifecycle state - var fabricCancel context.CancelFunc - var fabricDone chan struct{} - var fabricSessionOpen bool - nextFabricWaitLog := time.Now() + // UART sessions. uart0 is the CM5 Fabric/message-bus link; uart1 is + // reserved for human-readable diagnostics. Legacy JSON telemetry is not + // emitted on either UART. 
+ const uartLog = "uart1" + subSessOpenLog := r.uiConn.Subscribe(tSessOpened(uartLog)) + subSessClosedLog := r.uiConn.Subscribe(tSessClosed(uartLog)) + var subSessOpenFabric *bus.Subscription + var subSessClosedFabric *bus.Subscription + if useHardwareFabricUART() { + subSessOpenFabric = r.uiConn.Subscribe(tSessOpened(fabricUART)) + subSessClosedFabric = r.uiConn.Subscribe(tSessClosed(fabricUART)) + } - stopFabricSession := func() { - if fabricCancel == nil { - return - } - done := fabricDone - fabricCancel() - fabricCancel = nil - fabricDone = nil - if !waitFabricDone(done, fabricStopWaitTimeout) { - log.Println("[uart1] fabric session stop timed out") - } + // Kick open requests (fire-and-forget; events carry handles). + r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartLog), nil, false)) + if useHardwareFabricUART() { + r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(fabricUART), nil, false)) } + // Retry back-off guards. + var retryLogAt, retryFabricAt time.Time + // Supervisory ticker ticker := time.NewTicker(TICK) defer ticker.Stop() @@ -558,40 +470,32 @@ func (r *Reactor) Run(ctx context.Context) { for { select { // ---- UART session opened/closed ---- - case m := <-subSessOpenFabric.Channel(): + case m := <-subSessOpenLog.Channel(): if ev, ok := m.Payload.(types.SerialSessionOpened); ok { - // Tear down any previous fabric session before starting a new one. 
- stopFabricSession() - rx := shmring.Get(shmring.Handle(ev.RXHandle)) - tx := shmring.Get(shmring.Handle(ev.TXHandle)) - tr := fabric.NewShmringTransport(rx, tx) - fabricConn := r.bus.NewConnection("fabric") - fabricCtx, cancel := context.WithCancel(ctx) - done := make(chan struct{}) - fabricCancel = cancel - fabricDone = done - fabricSessionOpen = true - log.Println("[uart1] fabric session opening node=mcu peer=bigbox-cm5 link=mcu-uart0") - go func() { - defer close(done) - fabric.Run(fabricCtx, tr, fabricConn, "mcu", "bigbox-cm5", fabric.DefaultLinkConfig()) - }() - log.Println("[uart1] fabric session opened node=mcu peer=bigbox-cm5 link=mcu-uart0") + log.SetUART1(shmring.Get(shmring.Handle(ev.TXHandle))) + log.Println("[uart1] log session opened") } - case <-subSessClosedFabric.Channel(): - // Ignore stale close events — the open handler already tears down - // the previous session before starting a new one. - if !fabricSessionOpen { - continue + case m := <-subscriptionChannel(subSessOpenFabric): + if ev, ok := m.Payload.(types.SerialSessionOpened); ok { + r.startPassiveFabric(ctx, ev) + } + case <-subSessClosedLog.Channel(): + log.SetUART1(nil) + log.Println("[uart1] log session closed") + // Auto-reopen with back-off + if time.Now().After(retryLogAt) { + r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartLog), nil, false)) + retryLogAt = time.Now().Add(2 * time.Second) } - stopFabricSession() - fabricSessionOpen = false - nextFabricWaitLog = time.Now() - log.Println("[uart1] fabric session closed") + case <-subscriptionChannel(subSessClosedFabric): + r.stopFabricLink() + log.Println(fabricLogPrefix + "fabric session closed") + // Auto-reopen with back-off if time.Now().After(retryFabricAt) { - r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) + r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(fabricUART), nil, false)) retryFabricAt = time.Now().Add(2 * time.Second) } + // ---- Env prints ---- case m := <-tempSub.Channel(): if v, ok := 
m.Payload.(types.TemperatureValue); ok { @@ -607,6 +511,7 @@ func (r *Reactor) Run(ctx context.Context) { case m := <-humidSub.Channel(): if v, ok := m.Payload.(types.HumidityValue); ok { log.Hundredths("[value] env/humidity/core %RH=", int(v.RHx100)) + // JSON } // ---- Die Temp Backup ---- @@ -641,16 +546,16 @@ func (r *Reactor) Run(ctx context.Context) { case m := <-evSub.Channel(): printCapEvent(m) + // JSON: {"///event":""} + + // ---- Child service lifecycle ---- + case ev := <-r.children.Done(): + r.children.HandleExit(ev) // ---- Supervisory tick ---- case <-ticker.C: r.now = time.Now() - if !fabricSessionOpen && !r.now.Before(nextFabricWaitLog) { - log.Println("[main] waiting for fabric connection start") - nextFabricWaitLog = r.now.Add(fabricWaitLogInterval) - } - // 1) Run FSM (includes symmetric reversal) r.stepFSM() @@ -671,6 +576,10 @@ func (r *Reactor) Run(ctx context.Context) { } } +// ----------------------------------------------------------------------------- +// Centralised UART write helpers (handle partial writes) +// ----------------------------------------------------------------------------- + // ----------------------------------------------------------------------------- // Printing helpers (via Logger) // ----------------------------------------------------------------------------- diff --git a/services/reactor/reactor_test.go b/services/reactor/reactor_test.go deleted file mode 100644 index 34a7389..0000000 --- a/services/reactor/reactor_test.go +++ /dev/null @@ -1,106 +0,0 @@ -//go:build !qa_reactor - -package reactor - -import ( - "context" - "testing" - "time" - - "devicecode-go/bus" - "devicecode-go/services/updater" -) - -func TestWaitFabricDoneNil(t *testing.T) { - if !waitFabricDone(nil, time.Millisecond) { - t.Fatal("nil fabric done channel should be treated as stopped") - } -} - -func TestWaitFabricDoneClosed(t *testing.T) { - done := make(chan struct{}) - close(done) - - if !waitFabricDone(done, 50*time.Millisecond) { - 
t.Fatal("closed fabric done channel should report stopped") - } -} - -func TestWaitFabricDoneTimeout(t *testing.T) { - done := make(chan struct{}) - start := time.Now() - - if waitFabricDone(done, 10*time.Millisecond) { - t.Fatal("open fabric done channel should time out") - } - if elapsed := time.Since(start); elapsed > 250*time.Millisecond { - t.Fatalf("timeout wait took too long: %s", elapsed) - } -} - -func TestNewReactorDefaultsBootBuyRCZero(t *testing.T) { - r := NewReactor(nil, nil) - if r.bootBuyRC != 0 { - t.Fatalf("bootBuyRC = %d, want 0", r.bootBuyRC) - } -} - -func TestNewReactorWithOptionsStoresBootBuyRC(t *testing.T) { - r := NewReactorWithOptions(nil, nil, Options{BootBuyRC: -42}) - if r.bootBuyRC != -42 { - t.Fatalf("bootBuyRC = %d, want -42", r.bootBuyRC) - } -} - -func TestWaitForUpdaterCriticalFactsRequiresAllThreeFacts(t *testing.T) { - b := bus.NewBus(16, "+", "#") - waitConn := b.NewConnection("wait") - pubConn := b.NewConnection("pub") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - done := make(chan bool, 1) - go func() { - done <- waitForUpdaterCriticalFacts(ctx, waitConn) - }() - - pubConn.Publish(pubConn.NewMessage( - updater.TopicSoftwareFact, - updater.SoftwareFact{ImageID: "img", Version: "1.0", BootID: "boot"}, - true, - )) - pubConn.Publish(pubConn.NewMessage( - updater.TopicUpdaterFact, - updater.UpdaterFact{State: updater.StateRunning}, - true, - )) - select { - case got := <-done: - t.Fatalf("wait returned %t before health fact", got) - case <-time.After(20 * time.Millisecond): - } - - pubConn.Publish(pubConn.NewMessage( - updater.TopicHealthFact, - updater.HealthFact{State: "ok"}, - true, - )) - select { - case got := <-done: - if !got { - t.Fatal("wait returned false after all critical facts") - } - case <-time.After(time.Second): - t.Fatal("timeout waiting for critical facts") - } -} - -func TestWaitForUpdaterCriticalFactsStopsOnContextCancel(t *testing.T) { - b := bus.NewBus(16, "+", "#") - ctx, 
cancel := context.WithCancel(context.Background()) - cancel() - - if waitForUpdaterCriticalFacts(ctx, b.NewConnection("wait")) { - t.Fatal("wait returned true after context cancellation") - } -} diff --git a/services/reactor/updater_policy_apply.go b/services/reactor/updater_policy_apply.go new file mode 100644 index 0000000..6f6d342 --- /dev/null +++ b/services/reactor/updater_policy_apply.go @@ -0,0 +1,18 @@ +//go:build !qa_reactor && fabric_stage_enabled && fabric_apply_enabled && !fabric_uart_hwtest && !fabric_uart_selftest + +package reactor + +import ( + "devicecode-go/bus" + "devicecode-go/services/updater" +) + +func updaterRuntimeMode() string { return "production-applier:commit-reboots" } + +func updaterServiceOptions(conn *bus.Connection) updater.Options { + return updater.Options{ + Conn: conn, + Identity: firmwareIdentity(), + Applier: updater.ProductionApplier(), + } +} diff --git a/services/reactor/updater_policy_default.go b/services/reactor/updater_policy_default.go new file mode 100644 index 0000000..82bdd72 --- /dev/null +++ b/services/reactor/updater_policy_default.go @@ -0,0 +1,17 @@ +//go:build !qa_reactor && (!fabric_stage_enabled || !fabric_apply_enabled || fabric_uart_hwtest || fabric_uart_selftest) + +package reactor + +import ( + "devicecode-go/bus" + "devicecode-go/services/updater" +) + +func updaterRuntimeMode() string { return "safe-defaults:apply-disabled" } + +func updaterServiceOptions(conn *bus.Connection) updater.Options { + return updater.Options{ + Conn: conn, + Identity: firmwareIdentity(), + } +} diff --git a/services/telemetry/telemetry.go b/services/telemetry/telemetry.go index 3c15b04..46fde2d 100644 --- a/services/telemetry/telemetry.go +++ b/services/telemetry/telemetry.go @@ -241,6 +241,14 @@ type linkObservation struct { LocalSID string } +// fabricLinkObserver is implemented by services/fabric's retained link-state +// payload. 
Keeping this as a tiny structural interface avoids JSON reflection +// in the common in-process TinyGo path while still tolerating map/JSON payloads +// in host-side tests. +type fabricLinkObserver interface { + FabricLinkObservation() (ready bool, peerSID string, localSID string) +} + func linkReadyEdgeReason(prev, cur linkObservation, hadPrev bool) string { if !cur.Ready { return "" @@ -278,6 +286,9 @@ func decodeLinkReady(msg *bus.Message) (string, linkObservation) { obs.PeerSID, _ = p["peer_sid"].(string) obs.LocalSID, _ = p["local_sid"].(string) return id, obs + case fabricLinkObserver: + obs.Ready, obs.PeerSID, obs.LocalSID = p.FabricLinkObservation() + return id, obs } // Probe via JSON for the typed-struct payload fabric publishes. b, err := json.Marshal(msg.Payload) diff --git a/services/updater/abupdate_diag_host.go b/services/updater/abupdate_diag_host.go index cfda0a4..7bc88b2 100644 --- a/services/updater/abupdate_diag_host.go +++ b/services/updater/abupdate_diag_host.go @@ -6,17 +6,32 @@ import "sync" var abupdateDiagMu sync.Mutex var abupdateDiagActive bool +var abupdateDiagXferID string +var abupdateDiagGeneration uint64 func installABUpdateDiagHook(xferID string, generation uint64) { - _, _ = xferID, generation abupdateDiagMu.Lock() abupdateDiagActive = true + abupdateDiagXferID = xferID + abupdateDiagGeneration = generation abupdateDiagMu.Unlock() } func clearABUpdateDiagHook() { abupdateDiagMu.Lock() abupdateDiagActive = false + abupdateDiagXferID = "" + abupdateDiagGeneration = 0 + abupdateDiagMu.Unlock() +} + +func clearABUpdateDiagHookFor(xferID string, generation uint64) { + abupdateDiagMu.Lock() + if abupdateDiagActive && abupdateDiagXferID == xferID && abupdateDiagGeneration == generation { + abupdateDiagActive = false + abupdateDiagXferID = "" + abupdateDiagGeneration = 0 + } abupdateDiagMu.Unlock() } diff --git a/services/updater/abupdate_diag_tinygo.go b/services/updater/abupdate_diag_tinygo.go index c789aee..0cd9a41 100644 --- 
a/services/updater/abupdate_diag_tinygo.go +++ b/services/updater/abupdate_diag_tinygo.go @@ -41,6 +41,13 @@ func clearABUpdateDiagHook() { abupdateDiagGeneration = 0 } +func clearABUpdateDiagHookFor(xferID string, generation uint64) { + if abupdateDiagXferID != xferID || abupdateDiagGeneration != generation { + return + } + clearABUpdateDiagHook() +} + func emitABUpdateDiag(event string, fields ...otadiag.Field) { var out [10]otadiag.Field n := 0 diff --git a/services/updater/applier_host.go b/services/updater/applier_host.go index 1633fd1..8cbc3fc 100644 --- a/services/updater/applier_host.go +++ b/services/updater/applier_host.go @@ -2,18 +2,36 @@ package updater +import ( + "time" + + "devicecode-go/services/otadiag" +) + // ProductionApplier returns the applier the reactor wires by default. // On host builds (tests, dev environments without a flash slot to // reboot into) this stays the safe-default RefusingApplier — commit -// returns apply_unavailable. Real reboot wiring lives in +// returns commit_failed. Real reboot wiring lives in // applier_tinygo.go. 
func ProductionApplier() Applier { return RefusingApplier() } func scheduleArmReboot(a Applier, d StagedDescriptor, results chan<- applyRebootResult) { - if err := a.ArmReboot(d); err != nil { - select { - case results <- applyRebootResult{desc: d, err: err}: - default: + const replyFlushDelay = 750 * time.Millisecond + otadiag.Event( + "[updater-commit]", "arm_reboot_scheduled", otadiag.XferNone, + otadiag.KV("image_id", d.ImageID), + otadiag.KV("slot", int(d.Slot)), + otadiag.KV("delay_ms", int(replyFlushDelay/time.Millisecond)), + ) + go func() { + time.Sleep(replyFlushDelay) + otadiag.Event("[updater-commit]", "arm_reboot_start", otadiag.XferNone, otadiag.KV("image_id", d.ImageID), otadiag.KV("slot", int(d.Slot))) + if err := a.ArmReboot(d); err != nil { + otadiag.Event("[updater-commit]", "arm_reboot_return", otadiag.XferNone, otadiag.KV("err", err.Error()), otadiag.KV("image_id", d.ImageID), otadiag.KV("slot", int(d.Slot))) + select { + case results <- applyRebootResult{desc: d, err: err}: + default: + } } - } + }() } diff --git a/services/updater/applier_tinygo.go b/services/updater/applier_tinygo.go index 8df4b86..7d152e4 100644 --- a/services/updater/applier_tinygo.go +++ b/services/updater/applier_tinygo.go @@ -5,6 +5,8 @@ package updater import ( "errors" "time" + + "devicecode-go/services/otadiag" ) // abupdateApplier reboots into the slot the abupdateSink staged into. 
@@ -24,7 +26,7 @@ const postCommitReplyFlushDelay = 750 * time.Millisecond func (abupdateApplier) CanApply(d StagedDescriptor) error { _ = d if !sharedUpdaterInit { - return errFromRC("apply_unavailable_uninited", 0) + return errors.New(ErrApplyUnavailable) } return nil } @@ -40,13 +42,22 @@ func (abupdateApplier) ArmReboot(d StagedDescriptor) error { } func scheduleArmReboot(a Applier, d StagedDescriptor, results chan<- applyRebootResult) { + otadiag.Event( + "[updater-commit]", "arm_reboot_scheduled", otadiag.XferNone, + otadiag.KV("delay_ms", postCommitReplyFlushDelay), + otadiag.KV("image_id", d.ImageID), + otadiag.KV("version", d.Version), + otadiag.KV("slot", int(d.Slot)), + ) go func() { // handleCommit has only replied on the local bus. The fabric // session still needs a scheduler turn to marshal and write the // wire reply (and the state=rebooting retain) back to CM5 before // RebootIntoSlot stops the process. time.Sleep(postCommitReplyFlushDelay) + otadiag.Event("[updater-commit]", "arm_reboot_start", otadiag.XferNone, otadiag.KV("image_id", d.ImageID), otadiag.KV("slot", int(d.Slot))) if err := a.ArmReboot(d); err != nil { + otadiag.Event("[updater-commit]", "arm_reboot_return", otadiag.XferNone, otadiag.KV("err", err.Error()), otadiag.KV("image_id", d.ImageID), otadiag.KV("slot", int(d.Slot))) select { case results <- applyRebootResult{desc: d, err: err}: default: diff --git a/services/updater/prestage_host.go b/services/updater/prestage_host.go index 20f88fc..4d0acf1 100644 --- a/services/updater/prestage_host.go +++ b/services/updater/prestage_host.go @@ -2,7 +2,11 @@ package updater -import "errors" +import ( + "errors" + "io" + "os" +) type streamedStage struct { Version string @@ -12,27 +16,101 @@ type streamedStage struct { PayloadSHA256 string } +var hostStreamedStage struct { + file *os.File + path string + desc streamedStage + ready bool +} + func startStreamedStage(xferID string, generation uint64, size uint32) error { _, _, _ = xferID, 
generation, size + abortStreamedStage() + f, err := os.CreateTemp("", "dcgo-streamed-stage-*") + if err != nil { + return err + } + hostStreamedStage.file = f + hostStreamedStage.path = f.Name() return nil } func writeStreamedStage(xferID string, generation uint64, data []byte) error { - _, _, _ = xferID, generation, data - return errors.New("streamed_stage_not_supported") + _, _ = xferID, generation + if len(data) == 0 { + return errors.New("empty_chunk") + } + if hostStreamedStage.file == nil { + return errors.New("streamed_stage_not_started") + } + _, err := hostStreamedStage.file.Write(data) + return err } -func commitStreamedStage(xferID string, generation uint64) (streamedStage, error) { +func commitStreamedStage(svc *Service, xferID string, generation uint64) (streamedStage, error) { _, _ = xferID, generation - return streamedStage{}, errors.New("streamed_stage_not_supported") + f := hostStreamedStage.file + if f == nil { + return streamedStage{}, errors.New("streamed_stage_not_started") + } + if _, err := f.Seek(0, io.SeekStart); err != nil { + abortStreamedStage() + return streamedStage{}, err + } + if svc == nil { + abortStreamedStage() + return streamedStage{}, errors.New("updater_not_running") + } + sink, err := newSlotSink(0) + if err != nil { + abortStreamedStage() + return streamedStage{}, err + } + manifest, err := svc.verifier.Verify(f, sink) + if err != nil { + abortStreamedStage() + return streamedStage{}, err + } + desc := streamedStage{ + Version: manifest.Version, + BuildID: manifest.BuildID, + ImageID: manifest.ImageID, + Length: manifest.PayloadLength, + PayloadSHA256: manifest.PayloadSHA256, + } + hostStreamedStage.desc = desc + hostStreamedStage.ready = true + _ = f.Close() + _ = os.Remove(hostStreamedStage.path) + hostStreamedStage.file = nil + hostStreamedStage.path = "" + return desc, nil } -func abortStreamedStage() {} +func abortStreamedStage() { + if hostStreamedStage.file != nil { + _ = hostStreamedStage.file.Close() + } + if 
hostStreamedStage.path != "" { + _ = os.Remove(hostStreamedStage.path) + } + hostStreamedStage.file = nil + hostStreamedStage.path = "" + hostStreamedStage.desc = streamedStage{} + hostStreamedStage.ready = false +} func consumeStreamedStageResult() (streamedStage, bool) { - return streamedStage{}, false + if !hostStreamedStage.ready { + return streamedStage{}, false + } + out := hostStreamedStage.desc + hostStreamedStage.desc = streamedStage{} + hostStreamedStage.ready = false + return out, true } func discardStreamedStageResult() { + abortStreamedStage() clearABUpdateDiagHook() } diff --git a/services/updater/prestage_hwtest_tinygo.go b/services/updater/prestage_hwtest_tinygo.go new file mode 100644 index 0000000..4cbc761 --- /dev/null +++ b/services/updater/prestage_hwtest_tinygo.go @@ -0,0 +1,108 @@ +//go:build tinygo && rp2350 && fabric_uart_hwtest + +package updater + +import ( + "errors" + + "devicecode-go/x/xxhash" +) + +// Hardware UART/Fabric interconnection tests must exercise the Fabric receiver +// without writing the Pico2's inactive A/B slot. Under fabric_uart_hwtest the +// streamed stage is therefore a fixed-state digest/count sink. The production +// rp2350 prestage path remains in prestage_tinygo.go and is excluded by the +// build tag above. 
+var hwtestStreamedStage struct { + active bool + ready bool + xferID string + generation uint64 + declared uint32 + written uint32 + hasher xxhash.Hasher + desc streamedStage +} + +func startStreamedStage(xferID string, generation uint64, size uint32) error { + hwtestStreamedStage.active = true + hwtestStreamedStage.ready = false + hwtestStreamedStage.xferID = xferID + hwtestStreamedStage.generation = generation + hwtestStreamedStage.declared = size + hwtestStreamedStage.written = 0 + hwtestStreamedStage.hasher = *xxhash.New(0) + hwtestStreamedStage.desc = streamedStage{} + return nil +} + +func writeStreamedStage(xferID string, generation uint64, data []byte) error { + if !hwtestStreamedStage.active { + return errors.New("streamed_stage_not_started") + } + if hwtestStreamedStage.xferID != xferID || hwtestStreamedStage.generation != generation { + return errors.New("streamed_stage_generation_mismatch") + } + if len(data) == 0 { + return errors.New("empty_chunk") + } + hwtestStreamedStage.written += uint32(len(data)) + _, _ = hwtestStreamedStage.hasher.Write(data) + return nil +} + +func commitStreamedStage(svc *Service, xferID string, generation uint64) (streamedStage, error) { + _ = svc + if !hwtestStreamedStage.active { + return streamedStage{}, errors.New("streamed_stage_not_started") + } + if hwtestStreamedStage.xferID != xferID || hwtestStreamedStage.generation != generation { + return streamedStage{}, errors.New("streamed_stage_generation_mismatch") + } + if hwtestStreamedStage.written != hwtestStreamedStage.declared { + hwtestStreamedStage.active = false + return streamedStage{}, errors.New("streamed_stage_size_mismatch") + } + desc := streamedStage{ + Version: "uart-crosswire-test", + BuildID: "fabric-uart-crosswire", + ImageID: "hwtest-image", + Length: hwtestStreamedStage.written, + PayloadSHA256: "xxhash32:" + hwtestXXHashHex(hwtestStreamedStage.hasher.Sum32()), + } + hwtestStreamedStage.desc = desc + hwtestStreamedStage.ready = true + 
hwtestStreamedStage.active = false + return desc, nil +} + +func abortStreamedStage() { + hwtestStreamedStage.active = false + hwtestStreamedStage.ready = false + hwtestStreamedStage.desc = streamedStage{} +} + +func consumeStreamedStageResult() (streamedStage, bool) { + if !hwtestStreamedStage.ready { + return streamedStage{}, false + } + out := hwtestStreamedStage.desc + hwtestStreamedStage.ready = false + hwtestStreamedStage.desc = streamedStage{} + return out, true +} + +func discardStreamedStageResult() { + abortStreamedStage() + clearABUpdateDiagHook() +} + +func hwtestXXHashHex(v uint32) string { + const digits = "0123456789abcdef" + var buf [8]byte + for i := 7; i >= 0; i-- { + buf[i] = digits[v&0xf] + v >>= 4 + } + return string(buf[:]) +} diff --git a/services/updater/prestage_hwtest_types_tinygo.go b/services/updater/prestage_hwtest_types_tinygo.go new file mode 100644 index 0000000..93281c3 --- /dev/null +++ b/services/updater/prestage_hwtest_types_tinygo.go @@ -0,0 +1,16 @@ +//go:build tinygo && rp2350 && fabric_uart_hwtest + +package updater + +// streamedStage is shared by the production RP2350 prestage path and the +// fabric_uart_hwtest prestage sink. The production definition lives in +// prestage_tinygo.go, which is deliberately excluded under fabric_uart_hwtest +// so that the hardware UART/Fabric interconnection test does not write the +// inactive A/B flash slot. 
+type streamedStage struct { + Version string + BuildID string + ImageID string + Length uint32 + PayloadSHA256 string +} diff --git a/services/updater/prestage_tinygo.go b/services/updater/prestage_tinygo.go index 88c3d6d..56f215a 100644 --- a/services/updater/prestage_tinygo.go +++ b/services/updater/prestage_tinygo.go @@ -1,4 +1,4 @@ -//go:build tinygo && rp2350 +//go:build tinygo && rp2350 && !fabric_uart_hwtest package updater @@ -96,8 +96,8 @@ func writeStreamedStage(xferID string, generation uint64, data []byte) error { return err } -func commitStreamedStage(xferID string, generation uint64) (streamedStage, error) { - _, _ = xferID, generation +func commitStreamedStage(svc *Service, xferID string, generation uint64) (streamedStage, error) { + _, _, _ = svc, xferID, generation if streamedVerifier == nil { return streamedStage{}, errors.New("streamed_stage_not_started") } diff --git a/services/updater/receiver.go b/services/updater/receiver.go index 58efb25..c6c7d2e 100644 --- a/services/updater/receiver.go +++ b/services/updater/receiver.go @@ -1,10 +1,10 @@ package updater import ( - "bytes" "errors" "devicecode-go/bus" + "devicecode-go/services/otadiag" ) // The SlotSink used during verification is created via newSlotSink, @@ -12,6 +12,17 @@ import ( // tinygo+rp2350 returns an abupdate-backed sink that streams into the // inactive A/B slot (sink_tinygo.go). +func (s *Service) replyStage(msg *bus.Message, payload StagePayload, reply StageReply) { + otadiag.Event("[updater-stream]", "stage_reply", payload.XferID, + otadiag.KV("ok", reply.OK), + otadiag.KV("err", reply.Err), + otadiag.KV("stage", reply.Stage), + otadiag.KV("generation", payload.Generation), + otadiag.KV("size", payload.Size), + ) + s.reply(msg, reply) +} + // handleStage runs the verifier-gated staging path. Triggered by fabric // after xfer_commit; the reply gates whether fabric sends xfer_done or // xfer_abort. 
@@ -24,119 +35,60 @@ import ( func (s *Service) handleStage(msg *bus.Message) { payload, ok := jsonDecode[StagePayload](msg.Payload) if !ok { - s.reply(msg, StageReply{OK: false, Err: "bad_payload"}) + s.replyStage(msg, StagePayload{}, StageReply{OK: false, Err: "bad_payload"}) return } if payload.Target != TargetUpdaterMain { - s.reply(msg, StageReply{OK: false, Err: "unsupported_target"}) + s.replyStage(msg, payload, StageReply{OK: false, Err: "unsupported_target"}) return } if payload.DigestAlg != "" && payload.DigestAlg != DigestAlgXXHash32 { - s.reply(msg, StageReply{OK: false, Err: "unsupported_digest_alg"}) + s.replyStage(msg, payload, StageReply{OK: false, Err: "unsupported_digest_alg"}) return } if err := s.checkStreamedStageLease(payload.XferID, payload.Generation, true); err != nil { - s.reply(msg, StageReply{OK: false, Err: err.Error()}) - return - } - - if len(payload.Artefact) == 0 { - staged, ok := consumeStreamedStageResult() - if !ok { - s.failStage(payload, "artefact_missing") - s.reply(msg, StageReply{OK: false, Err: "artefact_missing"}) - return - } - if err := s.checkStreamedStageLease(payload.XferID, payload.Generation, true); err != nil { - s.failLateStage(payload, err) - s.reply(msg, StageReply{OK: false, Err: err.Error()}) - return - } - desc := StagedDescriptor{ - Version: staged.Version, - BuildID: staged.BuildID, - ImageID: staged.ImageID, - Length: staged.Length, - Slot: 0, - PayloadSHA256: staged.PayloadSHA256, - } - if err := s.metadataWrite.WriteStagedDescriptor(desc); err != nil { - s.failStage(payload, "metadata_write_failed:"+err.Error()) - s.reply(msg, StageReply{OK: false, Err: "metadata_write_failed"}) - return - } - if err := s.checkStreamedStageLease(payload.XferID, payload.Generation, true); err != nil { - s.failLateStage(payload, err) - s.reply(msg, StageReply{OK: false, Err: err.Error()}) - return - } - if !s.releaseStreamedStageLease(payload.XferID, payload.Generation) { - err := errors.New("stage_cancelled") - 
s.failLateStage(payload, err) - s.reply(msg, StageReply{OK: false, Err: err.Error()}) - return - } - s.setStagedImage(desc.ImageID, desc.Version) - s.transitionTo(StateStaged, "", desc.Version) - s.reply(msg, StageReply{OK: true, Stage: "staged"}) + s.replyStage(msg, payload, StageReply{OK: false, Err: err.Error()}) return } - sink, err := newSlotSink(uint32(len(payload.Artefact))) - if err != nil { - s.failStage(payload, "sink_init_failed:"+err.Error()) - s.reply(msg, StageReply{OK: false, Err: "sink_init_failed"}) - return - } - if err := s.checkStreamedStageLease(payload.XferID, payload.Generation, true); err != nil { - _ = sink.Abort() - s.failLateStage(payload, err) - s.reply(msg, StageReply{OK: false, Err: err.Error()}) - return - } - manifest, err := s.verifier.Verify(bytes.NewReader(payload.Artefact), sink) - if err != nil { - // Verifier rejected the artefact. Clear any prior descriptor so a - // following commit cannot apply stale firmware from an older stage. - s.failStage(payload, err.Error()) - s.reply(msg, StageReply{OK: false, Err: err.Error()}) + staged, ok := s.consumeStreamedStageResult() + if !ok { + s.failStage(payload, "artefact_missing") + s.replyStage(msg, payload, StageReply{OK: false, Err: "artefact_missing"}) return } if err := s.checkStreamedStageLease(payload.XferID, payload.Generation, true); err != nil { s.failLateStage(payload, err) - s.reply(msg, StageReply{OK: false, Err: err.Error()}) + s.replyStage(msg, payload, StageReply{OK: false, Err: err.Error()}) return } desc := StagedDescriptor{ - Version: manifest.Version, - BuildID: manifest.BuildID, - ImageID: manifest.ImageID, - Length: manifest.PayloadLength, - Slot: 0, // slot-pick comes from abupdate when hardware apply is wired - PayloadSHA256: manifest.PayloadSHA256, + Version: staged.Version, + BuildID: staged.BuildID, + ImageID: staged.ImageID, + Length: staged.Length, + Slot: 0, + PayloadSHA256: staged.PayloadSHA256, } if err := s.metadataWrite.WriteStagedDescriptor(desc); err 
!= nil { s.failStage(payload, "metadata_write_failed:"+err.Error()) - s.reply(msg, StageReply{OK: false, Err: "metadata_write_failed"}) + s.replyStage(msg, payload, StageReply{OK: false, Err: "metadata_write_failed"}) return } if err := s.checkStreamedStageLease(payload.XferID, payload.Generation, true); err != nil { s.failLateStage(payload, err) - s.reply(msg, StageReply{OK: false, Err: err.Error()}) + s.replyStage(msg, payload, StageReply{OK: false, Err: err.Error()}) return } - if !s.releaseStreamedStageLease(payload.XferID, payload.Generation) { err := errors.New("stage_cancelled") s.failLateStage(payload, err) - s.reply(msg, StageReply{OK: false, Err: err.Error()}) + s.replyStage(msg, payload, StageReply{OK: false, Err: err.Error()}) return } - s.setStagedImage(desc.ImageID, manifest.Version) - s.transitionTo(StateStaged, "", manifest.Version) - // Do not republish the software fact here: PayloadSHA256 describes the - // running image, while this descriptor describes the staged image. 
- s.reply(msg, StageReply{OK: true, Stage: "staged"}) + s.setStagedImage(desc.ImageID, desc.Version) + s.transitionTo(StateStaged, "", desc.Version) + s.replyStage(msg, payload, StageReply{OK: true, Stage: "staged"}) } func (s *Service) failStage(payload StagePayload, reason string) { diff --git a/services/updater/rpc.go b/services/updater/rpc.go index 825d0f2..2b96441 100644 --- a/services/updater/rpc.go +++ b/services/updater/rpc.go @@ -15,8 +15,8 @@ func (s *Service) handlePrepare(msg *bus.Message) { prepareAt := time.Now() req, ok := jsonDecode[PrepareRequest](msg.Payload) if !ok { - otadiag.Event("[updater-stream]", "prepare_reject", otadiag.XferNone, otadiag.KV("reason", "bad_request")) - s.reply(msg, Reply{OK: false, Error: "bad_request"}) + otadiag.Event("[updater-stream]", "prepare_reject", otadiag.XferNone, otadiag.KV("reason", ErrInvalidRequest)) + s.reply(msg, Reply{OK: false, Error: ErrInvalidRequest}) return } otadiag.Event( @@ -26,8 +26,8 @@ func (s *Service) handlePrepare(msg *bus.Message) { otadiag.KV("expected_image_id", req.ExpectedImageID), ) if req.Target != "" && req.Target != PrepareTargetMCU { - otadiag.Event("[updater-stream]", "prepare_reject", otadiag.XferNone, otadiag.KV("reason", ErrTargetMismatch)) - s.reply(msg, Reply{OK: false, Error: ErrTargetMismatch}) + otadiag.Event("[updater-stream]", "prepare_reject", otadiag.XferNone, otadiag.KV("reason", ErrUnsupportedTarget)) + s.reply(msg, Reply{OK: false, Error: ErrUnsupportedTarget}) return } @@ -103,23 +103,37 @@ func (s *Service) handlePrepare(msg *bus.Message) { func (s *Service) handleCommit(msg *bus.Message) { req, ok := jsonDecode[CommitRequest](msg.Payload) if !ok { - s.reply(msg, Reply{OK: false, Error: "bad_request"}) + otadiag.Event("[updater-commit]", "reject", otadiag.XferNone, otadiag.KV("reason", ErrInvalidRequest)) + s.reply(msg, Reply{OK: false, Error: ErrInvalidRequest}) return } + otadiag.Event( + "[updater-commit]", "rx", otadiag.XferNone, + otadiag.KV("job_id", 
req.JobID), + otadiag.KV("expected_image_id", req.ExpectedImageID), + ) desc, present := s.metadata.StagedDescriptor() s.mu.Lock() stagedInState := s.state == StateStaged + state := s.state pendingImageID := s.pendingImageID streamActive := s.streamLeaseActive s.mu.Unlock() if streamActive { + otadiag.Event("[updater-commit]", "reject", otadiag.XferNone, otadiag.KV("reason", ErrBusy), otadiag.KV("stream_active", true)) s.reply(msg, Reply{OK: false, Error: ErrBusy}) return } if !present || !stagedInState { - s.reply(msg, Reply{OK: false, Error: ErrNothingStaged}) + otadiag.Event( + "[updater-commit]", "reject", otadiag.XferNone, + otadiag.KV("reason", ErrNoStagedImage), + otadiag.KV("staged_present", present), + otadiag.KV("state", string(state)), + ) + s.reply(msg, Reply{OK: false, Error: ErrNoStagedImage}) return } expectedImageID := pendingImageID @@ -127,20 +141,42 @@ func (s *Service) handleCommit(msg *bus.Message) { expectedImageID = req.ExpectedImageID } if expectedImageID != "" && desc.ImageID != expectedImageID { - s.reply(msg, Reply{OK: false, Error: ErrTargetMismatch}) + otadiag.Event( + "[updater-commit]", "reject", otadiag.XferNone, + otadiag.KV("reason", ErrImageIDMismatch), + otadiag.KV("staged_image_id", desc.ImageID), + otadiag.KV("expected_image_id", expectedImageID), + ) + s.reply(msg, Reply{OK: false, Error: ErrImageIDMismatch}) return } // Validate the apply path before publishing committing/rebooting or // replying accepted. The default Applier refuses in non-hardware tests. 
if err := s.applier.CanApply(desc); err != nil { - s.reply(msg, Reply{OK: false, Error: err.Error()}) + otadiag.Event( + "[updater-commit]", "reject", otadiag.XferNone, + otadiag.KV("reason", ErrApplyUnavailable), + otadiag.KV("err", err.Error()), + otadiag.KV("image_id", desc.ImageID), + otadiag.KV("version", desc.Version), + ) + s.reply(msg, Reply{OK: false, Error: ErrApplyUnavailable}) return } + otadiag.Event( + "[updater-commit]", "accepted", otadiag.XferNone, + otadiag.KV("job_id", req.JobID), + otadiag.KV("image_id", desc.ImageID), + otadiag.KV("version", desc.Version), + otadiag.KV("length", desc.Length), + otadiag.KV("slot", int(desc.Slot)), + ) s.transitionTo(StateCommitting, "", desc.Version) s.reply(msg, CommitReply{Accepted: true, RebootRequired: true}) s.transitionTo(StateRebooting, "", desc.Version) + otadiag.Event("[updater-commit]", "state", otadiag.XferNone, otadiag.KV("state", string(StateRebooting)), otadiag.KV("image_id", desc.ImageID)) scheduleArmReboot(s.applier, desc, s.applyResults) } diff --git a/services/updater/stream_lease.go b/services/updater/stream_lease.go index 905e80b..4f326b2 100644 --- a/services/updater/stream_lease.go +++ b/services/updater/stream_lease.go @@ -1,182 +1,525 @@ package updater import ( + "context" "errors" - "sync" "time" "devicecode-go/services/otadiag" ) -var ( - activeServiceMu sync.Mutex - activeService *Service +type streamedStageCommandKind uint8 + +const ( + streamedStageCommandBegin streamedStageCommandKind = iota + 1 + streamedStageCommandWrite + streamedStageCommandCommit + streamedStageCommandAbort + streamedStageCommandCancel ) -func registerActiveService(s *Service) func() { - activeServiceMu.Lock() - activeService = s - activeServiceMu.Unlock() - return func() { - activeServiceMu.Lock() - if activeService == s { - activeService = nil +type streamedStageCommand struct { + kind streamedStageCommandKind + xferID string + generation uint64 + size uint32 + data []byte + reason string + reply chan 
streamedStageCommandResult +} + +type streamedStageCommandResult struct { + generation uint64 + written uint32 + err error +} + +type streamedStageWorkerCommand struct { + kind streamedStageCommandKind + xferID string + generation uint64 + size uint32 + data []byte + reason string +} + +type streamedStageWorkerResult struct { + kind streamedStageCommandKind + xferID string + generation uint64 + staged streamedStage + err error +} + +// BeginStreamedStage submits a transfer-begin operation to the updater reactor. +// The updater loop owns the lease/state decision; the stage worker owns any +// verifier or flash setup needed to accept the stream. +func (s *Service) BeginStreamedStage(xferID string, size uint32) (uint64, error) { + res := s.submitStreamedStageCommand(streamedStageCommand{ + kind: streamedStageCommandBegin, + xferID: xferID, + size: size, + }) + return res.generation, res.err +} + +func (s *Service) WriteStreamedStage(xferID string, generation uint64, data []byte) error { + res := s.submitStreamedStageCommand(streamedStageCommand{ + kind: streamedStageCommandWrite, + xferID: xferID, + generation: generation, + data: data, + }) + return res.err +} + +func (s *Service) CommitStreamedStage(xferID string, generation uint64) (uint32, error) { + res := s.submitStreamedStageCommand(streamedStageCommand{ + kind: streamedStageCommandCommit, + xferID: xferID, + generation: generation, + }) + return res.written, res.err +} + +func (s *Service) AbortStreamedStage(xferID string, generation uint64, reason string) { + _ = s.submitStreamedStageCommand(streamedStageCommand{ + kind: streamedStageCommandAbort, + xferID: xferID, + generation: generation, + reason: reason, + }) +} + +func (s *Service) CancelStreamedStage(xferID string, generation uint64, reason string) { + _ = s.submitStreamedStageCommand(streamedStageCommand{ + kind: streamedStageCommandCancel, + xferID: xferID, + generation: generation, + reason: reason, + }) +} + +func (s *Service) 
submitStreamedStageCommand(cmd streamedStageCommand) streamedStageCommandResult { + if updaterTraceEnabled { + println("[updater-trace]", "submit", "kind", int(cmd.kind), "xfer", cmd.xferID, "generation", cmd.generation, "size", cmd.size, "data_len", len(cmd.data)) + } + if s == nil { + return streamedStageCommandResult{err: errors.New("updater_not_running")} + } + select { + case <-s.stageStopped: + return streamedStageCommandResult{err: errors.New("updater_not_running")} + default: + } + select { + case <-s.stageReady: + default: + return streamedStageCommandResult{err: errors.New("updater_not_running")} + } + cmd.reply = make(chan streamedStageCommandResult, 1) + select { + case s.stageCommands <- cmd: + case <-s.stageStopped: + return streamedStageCommandResult{err: errors.New("updater_not_running")} + } + select { + case res := <-cmd.reply: + if updaterTraceEnabled { + errText := "" + if res.err != nil { + errText = res.err.Error() + } + println("[updater-trace]", "reply", "kind", int(cmd.kind), "xfer", cmd.xferID, "generation", res.generation, "written", res.written, "err", errText) + } + return res + case <-s.stageStopped: + return streamedStageCommandResult{err: errors.New("updater_not_running")} + } +} + +func (s *Service) runStreamedStageWorker(ctx context.Context) { + for { + select { + case <-ctx.Done(): + abortStreamedStage() + s.clearActiveABUpdateDiagHook() + return + case cmd, ok := <-s.stageWorkerCommands: + if !ok { + abortStreamedStage() + s.clearActiveABUpdateDiagHook() + return + } + if updaterTraceEnabled { + println("[updater-trace]", "worker_start", "kind", int(cmd.kind), "xfer", cmd.xferID, "generation", cmd.generation, "size", cmd.size, "data_len", len(cmd.data)) + } + res := streamedStageWorkerResult{kind: cmd.kind, xferID: cmd.xferID, generation: cmd.generation} + switch cmd.kind { + case streamedStageCommandBegin: + res.err = startStreamedStage(cmd.xferID, cmd.generation, cmd.size) + case streamedStageCommandWrite: + res.err = 
writeStreamedStage(cmd.xferID, cmd.generation, cmd.data) + case streamedStageCommandCommit: + res.staged, res.err = commitStreamedStage(s, cmd.xferID, cmd.generation) + case streamedStageCommandAbort, streamedStageCommandCancel: + abortStreamedStage() + clearABUpdateDiagHookFor(cmd.xferID, cmd.generation) + default: + res.err = errors.New("bad_stage_command") + } + if updaterTraceEnabled { + errText := "" + if res.err != nil { + errText = res.err.Error() + } + println("[updater-trace]", "worker_done", "kind", int(cmd.kind), "xfer", cmd.xferID, "generation", cmd.generation, "err", errText) + } + select { + case s.stageWorkerResults <- res: + case <-ctx.Done(): + abortStreamedStage() + s.clearActiveABUpdateDiagHook() + return + } } - activeServiceMu.Unlock() } } -func currentService() *Service { - activeServiceMu.Lock() - defer activeServiceMu.Unlock() - return activeService +func (s *Service) handleStreamedStageCommand(cmd streamedStageCommand) { + if updaterTraceEnabled { + println("[updater-trace]", "handle_cmd", "kind", int(cmd.kind), "xfer", cmd.xferID, "generation", cmd.generation, "pending", s.pendingStageCommand != nil) + } + if cmd.reply == nil { + return + } + if s.pendingStageCommand != nil { + switch cmd.kind { + case streamedStageCommandAbort, streamedStageCommandCancel: + s.cancelPendingStreamedStage(cmd) + default: + cmd.reply <- streamedStageCommandResult{err: errors.New(ErrBusy)} + } + return + } + switch cmd.kind { + case streamedStageCommandBegin: + s.startStreamedStageBegin(cmd) + case streamedStageCommandWrite: + s.startStreamedStageWrite(cmd) + case streamedStageCommandCommit: + s.startStreamedStageCommit(cmd) + case streamedStageCommandAbort, streamedStageCommandCancel: + s.startStreamedStageAbort(cmd) + default: + cmd.reply <- streamedStageCommandResult{err: errors.New("bad_stage_command")} + } } -// BeginStreamedStage acquires the updater-owned staging lease opened by the -// last successful prepare-update call. 
Fabric calls this from xfer_begin before -// any sink mutates flash or buffers transfer state. -func BeginStreamedStage(xferID string, size uint32) (uint64, error) { - beginAt := time.Now() - otadiag.SetActiveXfer(xferID) - otadiag.Event("[updater-stream]", "begin_entry", xferID, otadiag.KV("size", size)) - s := currentService() - if s == nil { - otadiag.Event( - "[updater-stream]", "begin_error", xferID, - otadiag.KV("err", "updater_not_running"), - otadiag.KV("dur_ms", int(time.Since(beginAt)/time.Millisecond)), - ) - otadiag.StopUpdateWindow("updater_not_running") - return 0, errors.New("updater_not_running") +func (s *Service) cancelPendingStreamedStage(cmd streamedStageCommand) { + if cmd.reason == "" { + cmd.reason = "abort" } - gen, err := s.beginStreamedStageLease(xferID) + // The updater reactor owns logical cancellation even while a flash/verifier + // worker command is in progress. The worker cannot be interrupted inside a + // bounded operation, but its eventual result will be rejected by the lease + // checks because the lease is cancelled here first. 
+ s.cancelStreamedStageLease(cmd.xferID, cmd.generation, cmd.reason) + clearABUpdateDiagHookFor(cmd.xferID, cmd.generation) + s.queueStageWorkerAbort(cmd.xferID, cmd.generation, cmd.reason) + cmd.reply <- streamedStageCommandResult{} +} + +func (s *Service) startStreamedStageBegin(cmd streamedStageCommand) { + if updaterTraceEnabled { + println("[updater-trace]", "begin_cmd", "xfer", cmd.xferID, "size", cmd.size) + } + beginAt := time.Now() + otadiag.SetActiveXfer(cmd.xferID) + otadiag.Event("[updater-stream]", "begin_entry", cmd.xferID, otadiag.KV("size", cmd.size)) + gen, err := s.beginStreamedStageLease(cmd.xferID) if err != nil { otadiag.Event( - "[updater-stream]", "lease_error", xferID, + "[updater-stream]", "lease_error", cmd.xferID, otadiag.KV("err", err.Error()), otadiag.KV("dur_ms", int(time.Since(beginAt)/time.Millisecond)), ) - return 0, err + cmd.reply <- streamedStageCommandResult{err: err} + return } - otadiag.Event("[updater-stream]", "lease_ok", xferID, otadiag.KV("generation", gen)) - installABUpdateDiagHook(xferID, gen) - startAt := time.Now() - otadiag.Event( - "[updater-stream]", "start_entry", xferID, - otadiag.KV("generation", gen), - otadiag.KV("size", size), - ) - if err := startStreamedStage(xferID, gen, size); err != nil { - otadiag.Event( - "[updater-stream]", "start_error", xferID, - otadiag.KV("generation", gen), - otadiag.KV("err", err.Error()), - otadiag.KV("dur_ms", int(time.Since(startAt)/time.Millisecond)), - ) - clearABUpdateDiagHook() - s.cancelStreamedStageLease(xferID, gen, err.Error()) + otadiag.Event("[updater-stream]", "lease_ok", cmd.xferID, otadiag.KV("generation", gen)) + installABUpdateDiagHook(cmd.xferID, gen) + cmd.generation = gen + s.pendingStageCommand = &cmd + s.sendStageWorkerCommand(streamedStageWorkerCommand{ + kind: streamedStageCommandBegin, + xferID: cmd.xferID, + generation: gen, + size: cmd.size, + }) +} + +func (s *Service) startStreamedStageWrite(cmd streamedStageCommand) { + if updaterTraceEnabled { + 
println("[updater-trace]", "write_cmd", "xfer", cmd.xferID, "generation", cmd.generation, "data_len", len(cmd.data)) + } + if err := s.checkStreamedStageLease(cmd.xferID, cmd.generation, false); err != nil { + cmd.reply <- streamedStageCommandResult{err: err} + return + } + s.pendingStageCommand = &cmd + s.sendStageWorkerCommand(streamedStageWorkerCommand{ + kind: streamedStageCommandWrite, + xferID: cmd.xferID, + generation: cmd.generation, + data: cmd.data, + }) +} + +func (s *Service) startStreamedStageCommit(cmd streamedStageCommand) { + if updaterTraceEnabled { + println("[updater-trace]", "commit_cmd", "xfer", cmd.xferID, "generation", cmd.generation) + } + if err := s.checkStreamedStageLease(cmd.xferID, cmd.generation, false); err != nil { + cmd.reply <- streamedStageCommandResult{err: err} + return + } + s.pendingStageCommand = &cmd + s.sendStageWorkerCommand(streamedStageWorkerCommand{ + kind: streamedStageCommandCommit, + xferID: cmd.xferID, + generation: cmd.generation, + }) +} + +func (s *Service) startStreamedStageAbort(cmd streamedStageCommand) { + if cmd.reason == "" { + cmd.reason = "abort" + } + // Mark the logical lease cancelled in the updater reactor first. The + // worker then performs the storage/verifier abort at its next safe point. 
+ if cmd.kind == streamedStageCommandCancel { + s.cancelStreamedStageLease(cmd.xferID, cmd.generation, cmd.reason) + } else { + s.cancelStreamedStageLease(cmd.xferID, cmd.generation, cmd.reason) + } + clearABUpdateDiagHookFor(cmd.xferID, cmd.generation) + s.pendingStageCommand = &cmd + s.sendStageWorkerCommand(streamedStageWorkerCommand{ + kind: cmd.kind, + xferID: cmd.xferID, + generation: cmd.generation, + reason: cmd.reason, + }) +} + +func (s *Service) sendStageWorkerCommand(cmd streamedStageWorkerCommand) { + if updaterTraceEnabled { + println("[updater-trace]", "worker_queue", "kind", int(cmd.kind), "xfer", cmd.xferID, "generation", cmd.generation, "size", cmd.size, "data_len", len(cmd.data)) + } + select { + case s.stageWorkerCommands <- cmd: + default: + // The updater reactor admits only one pending command at a time, so the + // worker queue should not fill. If it does, fail and cancel the logical + // lease from the reactor rather than blocking it. + if s.pendingStageCommand != nil && s.pendingStageCommand.reply != nil { + pending := s.pendingStageCommand + s.pendingStageCommand = nil + if pending.generation != 0 { + s.cancelStreamedStageLease(pending.xferID, pending.generation, ErrBusy) + clearABUpdateDiagHookFor(pending.xferID, pending.generation) + } + pending.reply <- streamedStageCommandResult{err: errors.New(ErrBusy)} + } + } +} + +func (s *Service) handleStreamedStageWorkerResult(res streamedStageWorkerResult) { + if updaterTraceEnabled { + errText := "" + if res.err != nil { + errText = res.err.Error() + } + println("[updater-trace]", "worker_result", "kind", int(res.kind), "xfer", res.xferID, "generation", res.generation, "err", errText) + } + cmd := s.pendingStageCommand + if cmd == nil || cmd.xferID != res.xferID || cmd.generation != res.generation || cmd.kind != res.kind { + // Stale worker result from an already-cancelled generation. The updater + // reactor is authoritative, so ignore it. 
+ return + } + s.pendingStageCommand = nil + switch res.kind { + case streamedStageCommandBegin: + s.finishStreamedStageBegin(*cmd, res) + case streamedStageCommandWrite: + s.finishStreamedStageWrite(*cmd, res) + case streamedStageCommandCommit: + s.finishStreamedStageCommit(*cmd, res) + case streamedStageCommandAbort, streamedStageCommandCancel: + clearABUpdateDiagHookFor(cmd.xferID, cmd.generation) + cmd.reply <- streamedStageCommandResult{} + default: + cmd.reply <- streamedStageCommandResult{err: errors.New("bad_stage_command")} + } +} + +func (s *Service) finishStreamedStageBegin(cmd streamedStageCommand, res streamedStageWorkerResult) { + if updaterTraceEnabled { + errText := "" + if res.err != nil { + errText = res.err.Error() + } + println("[updater-trace]", "finish_begin", "xfer", cmd.xferID, "generation", cmd.generation, "err", errText) + } + beginAt := time.Now() + if res.err != nil { + clearABUpdateDiagHookFor(cmd.xferID, cmd.generation) + s.cancelStreamedStageLease(cmd.xferID, cmd.generation, res.err.Error()) otadiag.Event( - "[updater-stream]", "begin_error", xferID, - otadiag.KV("err", err.Error()), - otadiag.KV("generation", gen), + "[updater-stream]", "begin_error", cmd.xferID, + otadiag.KV("err", res.err.Error()), + otadiag.KV("generation", cmd.generation), otadiag.KV("dur_ms", int(time.Since(beginAt)/time.Millisecond)), ) otadiag.StopUpdateWindow("start_streamed_stage_error") - return 0, err + cmd.reply <- streamedStageCommandResult{err: res.err} + return } - otadiag.Event( - "[updater-stream]", "start_exit", xferID, - otadiag.KV("generation", gen), - otadiag.KV("dur_ms", int(time.Since(startAt)/time.Millisecond)), - ) - markAt := time.Now() - otadiag.Event("[updater-stream]", "mark_receiving_entry", xferID, otadiag.KV("generation", gen)) - if err := s.markStreamedStageReceiving(xferID, gen); err != nil { + if err := s.markStreamedStageReceiving(cmd.xferID, cmd.generation); err != nil { + clearABUpdateDiagHookFor(cmd.xferID, cmd.generation) + 
s.cancelStreamedStageLease(cmd.xferID, cmd.generation, err.Error()) + s.queueStageWorkerAbort(cmd.xferID, cmd.generation, err.Error()) otadiag.Event( - "[updater-stream]", "mark_receiving_error", xferID, - otadiag.KV("generation", gen), + "[updater-stream]", "begin_error", cmd.xferID, otadiag.KV("err", err.Error()), - otadiag.KV("dur_ms", int(time.Since(markAt)/time.Millisecond)), - ) - abortStreamedStage() - clearABUpdateDiagHook() - s.cancelStreamedStageLease(xferID, gen, err.Error()) - otadiag.Event( - "[updater-stream]", "begin_error", xferID, - otadiag.KV("err", err.Error()), - otadiag.KV("generation", gen), + otadiag.KV("generation", cmd.generation), otadiag.KV("dur_ms", int(time.Since(beginAt)/time.Millisecond)), ) otadiag.StopUpdateWindow("mark_receiving_error") - return 0, err + cmd.reply <- streamedStageCommandResult{err: err} + return } otadiag.Event( - "[updater-stream]", "mark_receiving_exit", xferID, - otadiag.KV("generation", gen), - otadiag.KV("dur_ms", int(time.Since(markAt)/time.Millisecond)), - ) - otadiag.Event( - "[updater-stream]", "begin_exit", xferID, - otadiag.KV("generation", gen), + "[updater-stream]", "begin_exit", cmd.xferID, + otadiag.KV("generation", cmd.generation), otadiag.KV("dur_ms", int(time.Since(beginAt)/time.Millisecond)), ) - return gen, nil + cmd.reply <- streamedStageCommandResult{generation: cmd.generation} } -func WriteStreamedStage(xferID string, generation uint64, data []byte) error { - s := currentService() - if s == nil { - return errors.New("updater_not_running") +func (s *Service) finishStreamedStageWrite(cmd streamedStageCommand, res streamedStageWorkerResult) { + if updaterTraceEnabled { + errText := "" + if res.err != nil { + errText = res.err.Error() + } + println("[updater-trace]", "finish_write", "xfer", cmd.xferID, "generation", cmd.generation, "err", errText) + } + if res.err != nil { + s.cancelStreamedStageLease(cmd.xferID, cmd.generation, res.err.Error()) + clearABUpdateDiagHookFor(cmd.xferID, 
cmd.generation) + s.queueStageWorkerAbort(cmd.xferID, cmd.generation, res.err.Error()) + cmd.reply <- streamedStageCommandResult{err: res.err} + return } - if err := s.checkStreamedStageLease(xferID, generation, false); err != nil { - return err + if err := s.checkStreamedStageLease(cmd.xferID, cmd.generation, false); err != nil { + clearABUpdateDiagHookFor(cmd.xferID, cmd.generation) + s.queueStageWorkerAbort(cmd.xferID, cmd.generation, err.Error()) + cmd.reply <- streamedStageCommandResult{err: err} + return } - return writeStreamedStage(xferID, generation, data) + cmd.reply <- streamedStageCommandResult{} } -func CommitStreamedStage(xferID string, generation uint64) (uint32, error) { - s := currentService() - if s == nil { - return 0, errors.New("updater_not_running") +func (s *Service) finishStreamedStageCommit(cmd streamedStageCommand, res streamedStageWorkerResult) { + if updaterTraceEnabled { + errText := "" + if res.err != nil { + errText = res.err.Error() + } + println("[updater-trace]", "finish_commit", "xfer", cmd.xferID, "generation", cmd.generation, "err", errText, "staged_len", res.staged.Length) } - if err := s.checkStreamedStageLease(xferID, generation, false); err != nil { - return 0, err + clearABUpdateDiagHookFor(cmd.xferID, cmd.generation) + if res.err != nil { + otadiag.Event("[updater-stream]", "commit_result", cmd.xferID, + otadiag.KV("ok", false), + otadiag.KV("err", res.err.Error()), + otadiag.KV("generation", cmd.generation), + ) + s.cancelStreamedStageLease(cmd.xferID, cmd.generation, res.err.Error()) + cmd.reply <- streamedStageCommandResult{err: res.err} + return } - staged, err := commitStreamedStage(xferID, generation) - clearABUpdateDiagHook() - if err != nil { - s.cancelStreamedStageLease(xferID, generation, err.Error()) - return 0, err + if err := s.markStreamedStageCommitted(cmd.xferID, cmd.generation); err != nil { + otadiag.Event("[updater-stream]", "commit_result", cmd.xferID, + otadiag.KV("ok", false), + otadiag.KV("err", 
err.Error()), + otadiag.KV("generation", cmd.generation), + otadiag.KV("written", res.staged.Length), + ) + s.queueStageWorkerAbort(cmd.xferID, cmd.generation, err.Error()) + cmd.reply <- streamedStageCommandResult{err: err} + return } - if err := s.markStreamedStageCommitted(xferID, generation); err != nil { - abortStreamedStage() - return 0, err + s.setStreamedStageResult(res.staged) + otadiag.Event("[updater-stream]", "commit_result", cmd.xferID, + otadiag.KV("ok", true), + otadiag.KV("generation", cmd.generation), + otadiag.KV("written", res.staged.Length), + otadiag.KV("image_id", res.staged.ImageID), + otadiag.KV("version", res.staged.Version), + ) + cmd.reply <- streamedStageCommandResult{written: res.staged.Length} +} + +func (s *Service) queueStageWorkerAbort(xferID string, generation uint64, reason string) { + if reason == "" { + reason = "abort" + } + select { + case s.stageWorkerCommands <- streamedStageWorkerCommand{kind: streamedStageCommandAbort, xferID: xferID, generation: generation, reason: reason}: + default: } - return staged.Length, nil } -func CommitBufferedStage(xferID string, generation uint64) error { - s := currentService() +func (s *Service) setStreamedStageResult(staged streamedStage) { + s.mu.Lock() + s.streamStageResult = staged + s.streamStageResultOK = true + s.mu.Unlock() +} + +func (s *Service) clearActiveABUpdateDiagHook() { if s == nil { - return errors.New("updater_not_running") + return } - if err := s.markStreamedStageCommitted(xferID, generation); err != nil { - return err + s.mu.Lock() + xferID := s.streamXferID + generation := s.stageGeneration + s.mu.Unlock() + if xferID == "" || generation == 0 { + return } - clearABUpdateDiagHook() - return nil + clearABUpdateDiagHookFor(xferID, generation) } -func AbortStreamedStage(xferID string, generation uint64, reason string) { - abortStreamedStage() - clearABUpdateDiagHook() - if s := currentService(); s != nil { - s.cancelStreamedStageLease(xferID, generation, reason) +func (s 
*Service) consumeStreamedStageResult() (streamedStage, bool) { + s.mu.Lock() + defer s.mu.Unlock() + if !s.streamStageResultOK { + return streamedStage{}, false } + out := s.streamStageResult + s.streamStageResult = streamedStage{} + s.streamStageResultOK = false + return out, true } -func CancelStreamedStage(xferID string, generation uint64, reason string) { - AbortStreamedStage(xferID, generation, reason) +func (s *Service) discardStreamedStageResultLocked() { + s.streamStageResult = streamedStage{} + s.streamStageResultOK = false } func (s *Service) openStageGenerationLocked() uint64 { @@ -188,7 +531,7 @@ func (s *Service) openStageGenerationLocked() uint64 { s.streamXferID = "" s.streamCancelled = false s.streamCommitted = false - discardStreamedStageResult() + s.discardStreamedStageResultLocked() return s.stageGeneration } @@ -214,6 +557,7 @@ func (s *Service) beginStreamedStageLease(xferID string) (uint64, error) { s.streamXferID = xferID s.streamCancelled = false s.streamCommitted = false + s.discardStreamedStageResultLocked() snap := s.diagSnapshotLocked() gen := s.stageGeneration s.mu.Unlock() @@ -289,6 +633,7 @@ func (s *Service) cancelStreamedStageLease(xferID string, generation uint64, rea s.streamXferID = "" s.stagedImageID = "" s.pendingVersion = "" + s.discardStreamedStageResultLocked() if s.state == StateReady || s.state == StateReceiving || s.state == StateStaged { s.state = StateFailed } diff --git a/services/updater/stream_stage_actor_test.go b/services/updater/stream_stage_actor_test.go new file mode 100644 index 0000000..4ad8952 --- /dev/null +++ b/services/updater/stream_stage_actor_test.go @@ -0,0 +1,153 @@ +package updater + +import ( + "io" + "strings" + "testing" + "time" +) + +type actorBlockingVerifier struct { + entered chan struct{} + release chan struct{} + manifest Manifest +} + +func (v *actorBlockingVerifier) Verify(r io.Reader, sink SlotSink) (Manifest, error) { + select { + case <-v.entered: + default: + close(v.entered) + } + 
<-v.release + if _, err := io.Copy(sink, r); err != nil { + return Manifest{}, err + } + if err := sink.Commit(); err != nil { + return Manifest{}, err + } + return v.manifest, nil +} + +func TestStreamedStageActorRejectsConcurrentCommandWhileWorkerBusy(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + verif := &actorBlockingVerifier{ + entered: make(chan struct{}), + release: make(chan struct{}), + manifest: Manifest{ + Version: "9.9.9", + BuildID: "build-9.9.9", + ImageID: "mcu-dev-9.9.9", + PayloadSHA256: strings.Repeat("a", 64), + PayloadLength: 4, + }, + } + + svc, cancel := runService(t, b, Options{Conn: conn, Verifier: verif}) + defer cancel() + prepareUpdaterForLease(t, caller) + gen, err := svc.BeginStreamedStage("xfer-actor-busy", 4) + if err != nil { + t.Fatalf("BeginStreamedStage: %v", err) + } + if err := svc.WriteStreamedStage("xfer-actor-busy", gen, []byte("blob")); err != nil { + t.Fatalf("WriteStreamedStage: %v", err) + } + + commitErr := make(chan error, 1) + go func() { + _, err := svc.CommitStreamedStage("xfer-actor-busy", gen) + commitErr <- err + }() + select { + case <-verif.entered: + case <-time.After(2 * time.Second): + t.Fatal("verifier did not enter") + } + + if err := svc.WriteStreamedStage("xfer-actor-busy", gen, []byte("more")); err == nil || err.Error() != ErrBusy { + t.Fatalf("WriteStreamedStage while commit pending err = %v, want busy", err) + } + + close(verif.release) + select { + case err := <-commitErr: + if err != nil { + t.Fatalf("CommitStreamedStage after release: %v", err) + } + case <-time.After(2 * time.Second): + t.Fatal("commit did not complete") + } +} + +func TestStreamedStageActorCancelWhileWorkerBusyRejectsLateWorkerSuccess(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + observer := b.NewConnection("observer") + upSub := observer.Subscribe(TopicUpdaterFact) + defer 
observer.Unsubscribe(upSub) + memMD := NewMemoryMetadata() + verif := &actorBlockingVerifier{ + entered: make(chan struct{}), + release: make(chan struct{}), + manifest: Manifest{ + Version: "9.9.9", + BuildID: "build-9.9.9", + ImageID: "mcu-dev-9.9.9", + PayloadSHA256: strings.Repeat("b", 64), + PayloadLength: 4, + }, + } + + svc, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: verif, + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancel() + prepareUpdaterForLease(t, caller) + gen, err := svc.BeginStreamedStage("xfer-cancel-busy", 4) + if err != nil { + t.Fatalf("BeginStreamedStage: %v", err) + } + if err := svc.WriteStreamedStage("xfer-cancel-busy", gen, []byte("blob")); err != nil { + t.Fatalf("WriteStreamedStage: %v", err) + } + + commitErr := make(chan error, 1) + go func() { + _, err := svc.CommitStreamedStage("xfer-cancel-busy", gen) + commitErr <- err + }() + select { + case <-verif.entered: + case <-time.After(2 * time.Second): + t.Fatal("verifier did not enter") + } + + svc.CancelStreamedStage("xfer-cancel-busy", gen, "outer_timeout") + failed := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateFailed }) + if got := strValue(failed.LastError); got != "outer_timeout" { + t.Fatalf("last_error = %q, want outer_timeout", got) + } + if _, ok := memMD.StagedDescriptor(); ok { + t.Fatal("descriptor persisted before blocked verifier was released") + } + + close(verif.release) + select { + case err := <-commitErr: + if err == nil { + t.Fatal("CommitStreamedStage succeeded after cancellation") + } + case <-time.After(2 * time.Second): + t.Fatal("commit did not return after verifier release") + } + if _, ok := memMD.StagedDescriptor(); ok { + t.Fatal("late worker success persisted descriptor after cancellation") + } +} diff --git a/services/updater/trace_disabled.go b/services/updater/trace_disabled.go new file mode 100644 index 0000000..3790bb1 --- /dev/null +++ b/services/updater/trace_disabled.go @@ -0,0 
+1,5 @@ +//go:build !updater_trace + +package updater + +const updaterTraceEnabled = false diff --git a/services/updater/trace_enabled.go b/services/updater/trace_enabled.go new file mode 100644 index 0000000..c40f634 --- /dev/null +++ b/services/updater/trace_enabled.go @@ -0,0 +1,5 @@ +//go:build updater_trace + +package updater + +const updaterTraceEnabled = true diff --git a/services/updater/types.go b/services/updater/types.go index 095d8ea..d396469 100644 --- a/services/updater/types.go +++ b/services/updater/types.go @@ -32,10 +32,10 @@ const ( PrepareTargetMCU = "mcu" TargetUpdaterMain = "updater/main" DigestAlgXXHash32 = "xxhash32" - // DefaultMaxChunkSize is the safe RP2350 Fabric OTA limit currently - // advertised by prepare-update. It is a target pacing limit, not a - // Fabric protocol maximum. - DefaultMaxChunkSize uint32 = 512 + // DefaultMaxChunkSize is the fabric-jsonl/1 v1 initial raw chunk size. + // The CM5 sender chooses the actual chunk size, but the MCU must accept + // at least 2048-byte chunks. + DefaultMaxChunkSize uint32 = 2048 ) // PrepareRequest mirrors the current prepare-update payload. @@ -48,11 +48,12 @@ type PrepareRequest struct { Metadata any `json:"metadata,omitempty"` } -// CommitRequest mirrors commit-update. +// CommitRequest mirrors the strict commit-update payload. Commit is deliberately +// minimal: the MCU decides from its staged descriptor and the optional expected +// image id. Arbitrary metadata belongs to prepare/stage, not commit. type CommitRequest struct { JobID string `json:"job_id,omitempty"` ExpectedImageID string `json:"expected_image_id,omitempty"` - Metadata any `json:"metadata,omitempty"` } type PrepareReply struct { @@ -76,16 +77,19 @@ type Reply struct { // Refusal error strings — the Lua side compares against these. 
const ( - ErrBusy = "busy" - ErrNothingStaged = "nothing_staged" - ErrTargetMismatch = "target_mismatch" - ErrABUpdateBuyFailed = "abupdate_buy_failed" + ErrBusy = "busy" + ErrInvalidRequest = "invalid_request" + ErrUnsupportedTarget = "unsupported_target" + ErrStorageUnavailable = "storage_unavailable" + ErrNoStagedImage = "no_staged_image" + ErrImageIDMismatch = "image_id_mismatch" + ErrABUpdateBuyFailed = "abupdate_buy_failed" // ErrApplyUnavailable is returned when the commit RPC sees a valid // staged descriptor but no Applier is wired to actually trigger // the slot-switch + reboot. Refusing by default means we never lie // to the CM5 about apply success when the hardware apply path is not // wired. - ErrApplyUnavailable = "apply_unavailable" + ErrApplyUnavailable = "commit_failed" ) // SoftwareFact is the retained payload at state/self/software. @@ -132,10 +136,10 @@ type StagedDescriptor struct { PayloadSHA256 string `json:"payload_sha256"` } -// StagePayload is the local updater/main staging RPC invoked by fabric -// after xfer_commit has verified size and transfer digest. It replaces -// the older meta.receiver/raw-member receive path; the CM5 supplies only -// target="updater/main" on the wire. +// StagePayload is the local updater/main staging RPC invoked by Fabric after +// xfer_commit has verified size and transfer digest and committed the streamed +// staging lease. The payload carries only metadata and the lease generation; it +// must never carry the whole artefact as a []byte on MCU builds. 
type StagePayload struct { LinkID string `json:"link_id"` XferID string `json:"xfer_id"` @@ -145,7 +149,6 @@ type StagePayload struct { DigestAlg string `json:"digest_alg"` Digest string `json:"digest"` Meta any `json:"meta,omitempty"` - Artefact []byte `json:"artefact,omitempty"` } type StageReply struct { diff --git a/services/updater/updater.go b/services/updater/updater.go index 5923234..a54f11c 100644 --- a/services/updater/updater.go +++ b/services/updater/updater.go @@ -160,11 +160,22 @@ type Service struct { preparing bool bootBuyRC int32 - stageGeneration uint64 - streamLeaseActive bool - streamXferID string - streamCancelled bool - streamCommitted bool + stageGeneration uint64 + streamLeaseActive bool + streamXferID string + streamCancelled bool + streamCommitted bool + streamStageResult streamedStage + streamStageResultOK bool + + stageCommands chan streamedStageCommand + stageWorkerCommands chan streamedStageWorkerCommand + stageWorkerResults chan streamedStageWorkerResult + pendingStageCommand *streamedStageCommand + stageReady chan struct{} + stageStopped chan struct{} + stageReadyOnce sync.Once + stageStoppedOnce sync.Once applyResults chan applyRebootResult @@ -245,15 +256,20 @@ func New(opts Options) *Service { mw = noopMetadataWriter{} } s := &Service{ - conn: opts.Conn, - verifier: v, - applier: a, - identity: opts.Identity, - metadata: mr, - metadataWrite: mw, - state: StateRunning, - bootBuyRC: opts.BootBuyRC, - applyResults: make(chan applyRebootResult, 1), + conn: opts.Conn, + verifier: v, + applier: a, + identity: opts.Identity, + metadata: mr, + metadataWrite: mw, + state: StateRunning, + bootBuyRC: opts.BootBuyRC, + stageCommands: make(chan streamedStageCommand, 1), + stageWorkerCommands: make(chan streamedStageWorkerCommand, 1), + stageWorkerResults: make(chan streamedStageWorkerResult, 1), + stageReady: make(chan struct{}), + stageStopped: make(chan struct{}), + applyResults: make(chan applyRebootResult, 1), criticalRepublish: 
normalizeCriticalRepublishConfig( opts.CriticalRepublish, ), @@ -280,8 +296,9 @@ func (noopMetadataWriter) ClearStagedDescriptor() error { // surface, and watches the fabric link-state retain for ready-true // edges. Blocks until ctx is cancelled. func (s *Service) Run(ctx context.Context) { - unregister := registerActiveService(s) - defer unregister() + s.stageReadyOnce.Do(func() { close(s.stageReady) }) + go s.runStreamedStageWorker(ctx) + defer s.stageStoppedOnce.Do(func() { close(s.stageStopped) }) defer otadiag.StopUpdateWindow("updater_stop") prepareSub := s.conn.Subscribe(TopicPrepareRPC) @@ -383,6 +400,10 @@ func (s *Service) Run(ctx context.Context) { continue } s.handleStage(msg) + case cmd := <-s.stageCommands: + s.handleStreamedStageCommand(cmd) + case result := <-s.stageWorkerResults: + s.handleStreamedStageWorkerResult(result) case result := <-s.applyResults: s.failRebootIfCurrent(result.desc, result.err) case now := <-criticalTimerC: @@ -549,6 +570,14 @@ type linkObservation struct { LocalSID string } +// fabricLinkObserver is implemented by services/fabric's retained link-state +// payload. Keeping this as a tiny structural interface avoids JSON reflection +// in the common in-process TinyGo path while still tolerating map/JSON payloads +// in host-side tests. +type fabricLinkObserver interface { + FabricLinkObservation() (ready bool, peerSID string, localSID string) +} + func republishReason(prev, cur linkObservation, hadPrev bool) string { if !cur.Ready { return "" @@ -594,6 +623,9 @@ func decodeLinkState(msg *bus.Message) (string, linkObservation) { obs.PeerSID, _ = p["peer_sid"].(string) obs.LocalSID, _ = p["local_sid"].(string) return linkID, obs + case fabricLinkObserver: + obs.Ready, obs.PeerSID, obs.LocalSID = p.FabricLinkObservation() + return linkID, obs } // Fall back to JSON probe for the typed-struct payload that // fabric publishes via its linkStatePayload type. 
diff --git a/services/updater/updater_test.go b/services/updater/updater_test.go index 931050c..1a8670e 100644 --- a/services/updater/updater_test.go +++ b/services/updater/updater_test.go @@ -115,7 +115,7 @@ func (f *failingClearMetadata) ClearStagedDescriptor() error { // fakeApplier always succeeds — used by tests that need the commit RPC // to drive the state machine through committing/rebooting without // actually rebooting (production wiring uses RefusingApplier so the -// commit RPC returns apply_unavailable until the real abupdate-backed +// commit RPC returns commit_failed until the real abupdate-backed // implementation is supplied). // // canCalls and rebootCalls are kept separate so tests can verify the commit @@ -298,11 +298,10 @@ func testStagePayload(id string, artefact []byte) StagePayload { Size: uint32(len(artefact)), DigestAlg: DigestAlgXXHash32, Digest: "deadbeef", - Artefact: artefact, } } -func preparedStagePayload(t *testing.T, caller *bus.Connection, svc *Service, id string, artefact []byte) StagePayload { +func preparedStreamedStageLease(t *testing.T, caller *bus.Connection, svc *Service, id string, artefact []byte) (StagePayload, uint64) { t.Helper() req := caller.NewMessage(TopicPrepareRPC, PrepareRequest{Target: PrepareTargetMCU}, false) sub := caller.Request(req) @@ -325,15 +324,26 @@ func preparedStagePayload(t *testing.T, caller *bus.Connection, svc *Service, id case <-time.After(2 * time.Second): t.Fatal("timeout waiting for prepare reply") } - generation, err := svc.beginStreamedStageLease(id) + generation, err := svc.BeginStreamedStage(id, uint32(len(artefact))) if err != nil { - t.Fatalf("begin stage lease: %v", err) + t.Fatalf("begin streamed stage: %v", err) } - if err := svc.markStreamedStageCommitted(id, generation); err != nil { - t.Fatalf("commit stage lease: %v", err) + if len(artefact) > 0 { + if err := svc.WriteStreamedStage(id, generation, artefact); err != nil { + t.Fatalf("write streamed stage: %v", err) + } } payload 
:= testStagePayload(id, artefact) payload.Generation = generation + return payload, generation +} + +func preparedStagePayload(t *testing.T, caller *bus.Connection, svc *Service, id string, artefact []byte) StagePayload { + t.Helper() + payload, generation := preparedStreamedStageLease(t, caller, svc, id, artefact) + if _, err := svc.CommitStreamedStage(id, generation); err != nil { + t.Fatalf("commit streamed stage: %v", err) + } return payload } @@ -584,14 +594,23 @@ func requestUpdaterReply(t *testing.T, caller *bus.Connection, topic bus.Topic, return nil } +func TestStreamedStageControllerRequiresUpdaterRun(t *testing.T) { + b := newTestBus() + svc := New(Options{Conn: b.NewConnection("updater")}) + + if gen, err := svc.BeginStreamedStage("xfer-not-running", 4); err == nil || err.Error() != "updater_not_running" || gen != 0 { + t.Fatalf("BeginStreamedStage before Run = gen=%d err=%v, want updater_not_running", gen, err) + } +} + func TestBeginStreamedStageBeforePrepareReturnsStageNotPrepared(t *testing.T) { b := newTestBus() conn := b.NewConnection("updater") - _, cancel := runService(t, b, Options{Conn: conn}) + svc, cancel := runService(t, b, Options{Conn: conn}) defer cancel() - if gen, err := BeginStreamedStage("xfer-before-prepare", 4); err == nil || err.Error() != "stage_not_prepared" || gen != 0 { + if gen, err := svc.BeginStreamedStage("xfer-before-prepare", 4); err == nil || err.Error() != "stage_not_prepared" || gen != 0 { t.Fatalf("BeginStreamedStage before prepare = gen=%d err=%v, want stage_not_prepared", gen, err) } } @@ -605,7 +624,7 @@ func TestPrepareOpensSingleReceivingStreamLeaseAndClearsStaleDescriptor(t *testi memMD := NewMemoryMetadata() _ = memMD.WriteStagedDescriptor(StagedDescriptor{Version: "old", ImageID: "old-image", PayloadSHA256: "old"}) - _, cancel := runService(t, b, Options{Conn: conn, Metadata: memMD, MetadataWrite: memMD}) + svc, cancel := runService(t, b, Options{Conn: conn, Metadata: memMD, MetadataWrite: memMD}) defer 
cancel() prepareUpdaterForLease(t, caller) @@ -613,30 +632,30 @@ func TestPrepareOpensSingleReceivingStreamLeaseAndClearsStaleDescriptor(t *testi t.Fatal("prepare did not clear stale staged descriptor") } - gen, err := BeginStreamedStage("xfer-lease", 4) + gen, err := svc.BeginStreamedStage("xfer-lease", 4) if err != nil { t.Fatalf("BeginStreamedStage: %v", err) } if gen == 0 { t.Fatal("BeginStreamedStage returned generation 0") } - defer CancelStreamedStage("xfer-lease", gen, "test_done") + defer svc.CancelStreamedStage("xfer-lease", gen, "test_done") up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateReceiving }) if strValue(up.LastError) != "" { t.Fatalf("receiving last_error = %q, want empty", strValue(up.LastError)) } - if _, err := BeginStreamedStage("xfer-second", 4); err == nil || err.Error() != ErrBusy { + if _, err := svc.BeginStreamedStage("xfer-second", 4); err == nil || err.Error() != ErrBusy { t.Fatalf("second BeginStreamedStage err = %v, want busy", err) } - if err := CommitBufferedStage("wrong-xfer", gen); err == nil || err.Error() != "stage_generation_mismatch" { - t.Fatalf("wrong xfer CommitBufferedStage err = %v, want generation mismatch", err) + if err := svc.markStreamedStageCommitted("wrong-xfer", gen); err == nil || err.Error() != "stage_generation_mismatch" { + t.Fatalf("wrong xfer markStreamedStageCommitted err = %v, want generation mismatch", err) } - if err := CommitBufferedStage("xfer-lease", gen+1); err == nil || err.Error() != "stage_generation_mismatch" { - t.Fatalf("wrong generation CommitBufferedStage err = %v, want generation mismatch", err) + if err := svc.markStreamedStageCommitted("xfer-lease", gen+1); err == nil || err.Error() != "stage_generation_mismatch" { + t.Fatalf("wrong generation markStreamedStageCommitted err = %v, want generation mismatch", err) } - if err := CommitBufferedStage("xfer-lease", gen); err != nil { - t.Fatalf("matching CommitBufferedStage: %v", err) + if err := 
svc.markStreamedStageCommitted("xfer-lease", gen); err != nil { + t.Fatalf("matching markStreamedStageCommitted: %v", err) } } @@ -645,14 +664,14 @@ func TestPrepareAndCommitRejectWhileStreamLeaseActive(t *testing.T) { conn := b.NewConnection("updater") caller := b.NewConnection("caller") - _, cancel := runService(t, b, Options{Conn: conn}) + svc, cancel := runService(t, b, Options{Conn: conn}) defer cancel() prepareUpdaterForLease(t, caller) - gen, err := BeginStreamedStage("xfer-active", 4) + gen, err := svc.BeginStreamedStage("xfer-active", 4) if err != nil { t.Fatalf("BeginStreamedStage: %v", err) } - defer CancelStreamedStage("xfer-active", gen, "test_done") + defer svc.CancelStreamedStage("xfer-active", gen, "test_done") prepPayload := requestUpdaterReply(t, caller, TopicPrepareRPC, PrepareRequest{Target: PrepareTargetMCU}) prepReply, ok := prepPayload.(Reply) @@ -667,26 +686,40 @@ func TestPrepareAndCommitRejectWhileStreamLeaseActive(t *testing.T) { } } -func TestStreamedStageDiagHookClearsOnBufferedCommit(t *testing.T) { +func TestStreamedStageDiagHookClearsOnCommittedStage(t *testing.T) { b := newTestBus() conn := b.NewConnection("updater") caller := b.NewConnection("caller") - _, cancel := runService(t, b, Options{Conn: conn}) + svc, cancel := runService(t, b, Options{Conn: conn}) defer cancel() prepareUpdaterForLease(t, caller) - gen, err := BeginStreamedStage("xfer-hook-commit", 4) + gen, err := svc.BeginStreamedStage("xfer-hook-commit", 4) if err != nil { t.Fatalf("BeginStreamedStage: %v", err) } if !abupdateDiagHookActiveForTest() { t.Fatal("diagnostic hook inactive after BeginStreamedStage") } - if err := CommitBufferedStage("xfer-hook-commit", gen); err != nil { - t.Fatalf("CommitBufferedStage: %v", err) + if err := svc.markStreamedStageCommitted("xfer-hook-commit", gen); err != nil { + t.Fatalf("markStreamedStageCommitted: %v", err) + } + clearABUpdateDiagHook() + if abupdateDiagHookActiveForTest() { + t.Fatal("diagnostic hook still active after 
committed stage") + } +} + +func TestStreamedStageDiagHookClearIsLeaseScoped(t *testing.T) { + clearABUpdateDiagHook() + installABUpdateDiagHook("current-xfer", 2) + clearABUpdateDiagHookFor("stale-xfer", 1) + if !abupdateDiagHookActiveForTest() { + t.Fatal("stale generation cleared current diagnostic hook") } + clearABUpdateDiagHookFor("current-xfer", 2) if abupdateDiagHookActiveForTest() { - t.Fatal("diagnostic hook still active after buffered commit") + t.Fatal("matching generation did not clear diagnostic hook") } } @@ -695,17 +728,17 @@ func TestStreamedStageDiagHookClearsOnAbort(t *testing.T) { conn := b.NewConnection("updater") caller := b.NewConnection("caller") - _, cancel := runService(t, b, Options{Conn: conn}) + svc, cancel := runService(t, b, Options{Conn: conn}) defer cancel() prepareUpdaterForLease(t, caller) - gen, err := BeginStreamedStage("xfer-hook-abort", 4) + gen, err := svc.BeginStreamedStage("xfer-hook-abort", 4) if err != nil { t.Fatalf("BeginStreamedStage: %v", err) } if !abupdateDiagHookActiveForTest() { t.Fatal("diagnostic hook inactive after BeginStreamedStage") } - AbortStreamedStage("xfer-hook-abort", gen, "test_abort") + svc.AbortStreamedStage("xfer-hook-abort", gen, "test_abort") if abupdateDiagHookActiveForTest() { t.Fatal("diagnostic hook still active after abort") } @@ -716,17 +749,17 @@ func TestStreamedStageDiagHookClearsOnCommitError(t *testing.T) { conn := b.NewConnection("updater") caller := b.NewConnection("caller") - _, cancel := runService(t, b, Options{Conn: conn}) + svc, cancel := runService(t, b, Options{Conn: conn}) defer cancel() prepareUpdaterForLease(t, caller) - gen, err := BeginStreamedStage("xfer-hook-commit-error", 4) + gen, err := svc.BeginStreamedStage("xfer-hook-commit-error", 4) if err != nil { t.Fatalf("BeginStreamedStage: %v", err) } if !abupdateDiagHookActiveForTest() { t.Fatal("diagnostic hook inactive after BeginStreamedStage") } - if _, err := CommitStreamedStage("xfer-hook-commit-error", gen); err == 
nil { + if _, err := svc.CommitStreamedStage("xfer-hook-commit-error", gen); err == nil { t.Fatal("CommitStreamedStage returned nil error, want host streamed_stage_not_supported") } if abupdateDiagHookActiveForTest() { @@ -765,7 +798,7 @@ func TestCancelStreamedStagePreventsLateStageSuccess(t *testing.T) { PayloadLength: 4, }} - _, cancel := runService(t, b, Options{ + svc, cancel := runService(t, b, Options{ Conn: conn, Verifier: verif, Metadata: memMD, @@ -773,14 +806,14 @@ func TestCancelStreamedStagePreventsLateStageSuccess(t *testing.T) { }) defer cancel() prepareUpdaterForLease(t, caller) - gen, err := BeginStreamedStage("xfer-cancel", 4) + gen, err := svc.BeginStreamedStage("xfer-cancel", 4) if err != nil { t.Fatalf("BeginStreamedStage: %v", err) } - if err := CommitBufferedStage("xfer-cancel", gen); err != nil { - t.Fatalf("CommitBufferedStage: %v", err) + if err := svc.markStreamedStageCommitted("xfer-cancel", gen); err != nil { + t.Fatalf("markStreamedStageCommitted: %v", err) } - CancelStreamedStage("xfer-cancel", gen, "test_cancel") + svc.CancelStreamedStage("xfer-cancel", gen, "test_cancel") stage := testStagePayload("xfer-cancel", []byte("blob")) stage.Generation = gen @@ -810,7 +843,7 @@ func TestReleasedStagedLeaseIgnoresLateCancel(t *testing.T) { PayloadLength: 4, }} - _, cancel := runService(t, b, Options{ + svc, cancel := runService(t, b, Options{ Conn: conn, Verifier: verif, Applier: app, @@ -819,12 +852,15 @@ func TestReleasedStagedLeaseIgnoresLateCancel(t *testing.T) { }) defer cancel() prepareUpdaterForLease(t, caller) - gen, err := BeginStreamedStage("xfer-released", 4) + gen, err := svc.BeginStreamedStage("xfer-released", 4) if err != nil { t.Fatalf("BeginStreamedStage: %v", err) } - if err := CommitBufferedStage("xfer-released", gen); err != nil { - t.Fatalf("CommitBufferedStage: %v", err) + if err := svc.WriteStreamedStage("xfer-released", gen, []byte("blob")); err != nil { + t.Fatalf("WriteStreamedStage: %v", err) + } + if _, err := 
svc.CommitStreamedStage("xfer-released", gen); err != nil { + t.Fatalf("CommitStreamedStage: %v", err) } stage := testStagePayload("xfer-released", []byte("blob")) @@ -839,7 +875,7 @@ func TestReleasedStagedLeaseIgnoresLateCancel(t *testing.T) { t.Fatal("stage did not persist descriptor") } - CancelStreamedStage("xfer-released", gen, "late_cancel") + svc.CancelStreamedStage("xfer-released", gen, "late_cancel") if _, ok := memMD.StagedDescriptor(); !ok { t.Fatal("late cancel cleared released staged descriptor") } @@ -863,7 +899,7 @@ func TestStaleGenerationAndWrongXferCannotMutateStreamedStage(t *testing.T) { PayloadLength: 4, }} - _, cancel := runService(t, b, Options{ + svc, cancel := runService(t, b, Options{ Conn: conn, Verifier: verif, Metadata: memMD, @@ -871,20 +907,23 @@ func TestStaleGenerationAndWrongXferCannotMutateStreamedStage(t *testing.T) { }) defer cancel() prepareUpdaterForLease(t, caller) - gen, err := BeginStreamedStage("xfer-current", 4) + gen, err := svc.BeginStreamedStage("xfer-current", 4) if err != nil { t.Fatalf("BeginStreamedStage: %v", err) } - defer CancelStreamedStage("xfer-current", gen, "test_done") + defer svc.CancelStreamedStage("xfer-current", gen, "test_done") - if err := WriteStreamedStage("wrong-xfer", gen, []byte("data")); err == nil || err.Error() != "stage_generation_mismatch" { + if err := svc.WriteStreamedStage("wrong-xfer", gen, []byte("data")); err == nil || err.Error() != "stage_generation_mismatch" { t.Fatalf("wrong xfer WriteStreamedStage err = %v, want generation mismatch", err) } - if _, err := CommitStreamedStage("xfer-current", gen+1); err == nil || err.Error() != "stage_generation_mismatch" { + if _, err := svc.CommitStreamedStage("xfer-current", gen+1); err == nil || err.Error() != "stage_generation_mismatch" { t.Fatalf("stale generation CommitStreamedStage err = %v, want generation mismatch", err) } - if err := CommitBufferedStage("xfer-current", gen); err != nil { - t.Fatalf("CommitBufferedStage: %v", err) + if 
err := svc.WriteStreamedStage("xfer-current", gen, []byte("data")); err != nil { + t.Fatalf("WriteStreamedStage: %v", err) + } + if _, err := svc.CommitStreamedStage("xfer-current", gen); err != nil { + t.Fatalf("CommitStreamedStage: %v", err) } for _, tc := range []struct { @@ -931,8 +970,8 @@ func TestCommitWithoutStagedReturnsNothingStaged(t *testing.T) { if reply.OK { t.Fatalf("commit unexpectedly OK without staged image: %+v", reply) } - if reply.Error != ErrNothingStaged { - t.Fatalf("commit error = %q, want %q", reply.Error, ErrNothingStaged) + if reply.Error != ErrNoStagedImage { + t.Fatalf("commit error = %q, want %q", reply.Error, ErrNoStagedImage) } case <-time.After(2 * time.Second): t.Fatal("timeout waiting for commit reply") @@ -961,8 +1000,8 @@ func TestCommitWithoutStagedStateRefusesEvenWithDescriptor(t *testing.T) { select { case msg := <-replySub.Channel(): reply, _ := msg.Payload.(Reply) - if reply.OK || reply.Error != ErrNothingStaged { - t.Fatalf("commit reply = %+v, want refusal=nothing_staged", reply) + if reply.OK || reply.Error != ErrNoStagedImage { + t.Fatalf("commit reply = %+v, want refusal=no_staged_image", reply) } case <-time.After(2 * time.Second): t.Fatal("timeout waiting for commit reply") @@ -999,8 +1038,8 @@ func TestCommitUsesPreparedExpectedImageOverCommitPayload(t *testing.T) { payload := requestUpdaterReply(t, caller, TopicCommitRPC, CommitRequest{ExpectedImageID: "image-B"}) reply, ok := payload.(Reply) - if !ok || reply.OK || reply.Error != ErrTargetMismatch { - t.Fatalf("commit reply = %#v, want target mismatch", payload) + if !ok || reply.OK || reply.Error != ErrImageIDMismatch { + t.Fatalf("commit reply = %#v, want image id mismatch", payload) } canCalls, rebootCalls := app.callCounts() if canCalls != 0 || rebootCalls != 0 { @@ -1049,7 +1088,7 @@ func TestCommitWithoutApplierReturnsApplyUnavailable(t *testing.T) { case msg := <-csub.Channel(): reply, _ := msg.Payload.(Reply) if reply.OK || reply.Error != 
ErrApplyUnavailable { - t.Fatalf("commit reply = %+v, want refusal=apply_unavailable", reply) + t.Fatalf("commit reply = %+v, want refusal=commit_failed", reply) } case <-time.After(2 * time.Second): t.Fatal("timeout waiting for commit reply") @@ -1223,21 +1262,9 @@ func TestStageStubVerifierPublishesFailed(t *testing.T) { svc, cancel := runService(t, b, Options{Conn: conn, Verifier: StubVerifier()}) defer cancel() - req := caller.NewMessage(TopicStageRPC, preparedStagePayload(t, caller, svc, "xfer-1", []byte("blob")), false) - replySub := caller.Request(req) - defer caller.Unsubscribe(replySub) - - select { - case msg := <-replySub.Channel(): - reply, ok := msg.Payload.(StageReply) - if !ok || reply.OK { - t.Fatalf("stage unexpectedly OK with stub: %+v", reply) - } - if !strings.Contains(reply.Err, "verifier_stub") { - t.Fatalf("stage err = %q, want stub sentinel", reply.Err) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for stage reply") + _, generation := preparedStreamedStageLease(t, caller, svc, "xfer-1", []byte("blob")) + if _, err := svc.CommitStreamedStage("xfer-1", generation); err == nil || !strings.Contains(err.Error(), "verifier_stub") { + t.Fatalf("commit streamed stage err = %v, want stub sentinel", err) } up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateFailed }) @@ -1401,11 +1428,13 @@ func TestStageFakeAcceptWritesStagedDescriptor(t *testing.T) { func TestStageFailureClearsStaleStagedDescriptor(t *testing.T) { // A (stage A) -> (prepare for B) -> (stage B fails) flow must not leave - // descriptor A persisted. The next commit should return nothing_staged + // descriptor A persisted. The next commit should return no_staged_image // rather than committing stale firmware. 
b := newTestBus() conn := b.NewConnection("updater") caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) // Pre-stage: a real descriptor sitting in metadata from an earlier // successful flow. @@ -1423,22 +1452,19 @@ func TestStageFailureClearsStaleStagedDescriptor(t *testing.T) { }) defer cancel() - // Drive updater/main staging to failure. - rreq := caller.NewMessage(TopicStageRPC, preparedStagePayload(t, caller, svc, "x", []byte("blob")), false) - rsub := caller.Request(rreq) - defer caller.Unsubscribe(rsub) - select { - case <-rsub.Channel(): - case <-time.After(2 * time.Second): - t.Fatal("timeout") + // Drive updater/main streamed staging to verifier failure. + _, generation := preparedStreamedStageLease(t, caller, svc, "x", []byte("blob")) + if _, err := svc.CommitStreamedStage("x", generation); err == nil || err.Error() != "bad_signature" { + t.Fatalf("commit streamed stage err = %v, want bad_signature", err) } + _ = waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateFailed }) // The stale descriptor must have been cleared. if _, ok := memMD.StagedDescriptor(); ok { t.Fatalf("stale staged descriptor survived receiver failure") } - // Commit must refuse with nothing_staged rather than commit the + // Commit must refuse with no_staged_image rather than commit the // stale image. 
creq := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) csub := caller.Request(creq) @@ -1446,8 +1472,8 @@ func TestStageFailureClearsStaleStagedDescriptor(t *testing.T) { select { case msg := <-csub.Channel(): reply, _ := msg.Payload.(Reply) - if reply.OK || reply.Error != ErrNothingStaged { - t.Fatalf("commit reply = %+v, want refusal=nothing_staged", reply) + if reply.OK || reply.Error != ErrNoStagedImage { + t.Fatalf("commit reply = %+v, want refusal=no_staged_image", reply) } case <-time.After(2 * time.Second): t.Fatal("timeout") @@ -1595,21 +1621,9 @@ func TestStageFakeRejectPublishesFailed(t *testing.T) { svc, cancel := runService(t, b, Options{Conn: conn, Verifier: verif}) defer cancel() - req := caller.NewMessage(TopicStageRPC, preparedStagePayload(t, caller, svc, "xfer-3", []byte("blob")), false) - replySub := caller.Request(req) - defer caller.Unsubscribe(replySub) - - select { - case msg := <-replySub.Channel(): - reply, ok := msg.Payload.(StageReply) - if !ok || reply.OK { - t.Fatalf("stage unexpectedly OK: %+v", reply) - } - if reply.Err != "manifest_check_failed" { - t.Fatalf("stage err = %q, want manifest_check_failed", reply.Err) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for stage reply") + _, generation := preparedStreamedStageLease(t, caller, svc, "xfer-3", []byte("blob")) + if _, err := svc.CommitStreamedStage("xfer-3", generation); err == nil || err.Error() != "manifest_check_failed" { + t.Fatalf("commit streamed stage err = %v, want manifest_check_failed", err) } up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateFailed }) diff --git a/services/updater/verifier.go b/services/updater/verifier.go index bf376dd..d6a25a1 100644 --- a/services/updater/verifier.go +++ b/services/updater/verifier.go @@ -79,7 +79,7 @@ type Applier interface { // refusingApplier is the production default. 
CanApply always returns // ErrApplyUnavailable so commit refuses with -// `error: "apply_unavailable"` and never reaches ArmReboot. +// `error: "commit_failed"` and never reaches ArmReboot. type refusingApplier struct{} // RefusingApplier returns the safe-default Applier for this branch. diff --git a/third_party/tinygo-uartx/README.md b/third_party/tinygo-uartx/README.md deleted file mode 100644 index 83d52c5..0000000 --- a/third_party/tinygo-uartx/README.md +++ /dev/null @@ -1,239 +0,0 @@ -# tinygo-uartx - -Interrupt-driven UART for TinyGo with **blocking `io.Reader`/`io.Writer` semantics**, explicit non-blocking operations, and practical flush. Designed and tested on RP2040/RP2350 (PL011), with stubs for other TinyGo targets. - -> **Compatibility notes** -> * `uartx` **breaks** with TinyGo’s `machine.UART` behaviour: **`Read(p)` blocks** until at least one byte is available. If you need non-blocking reads, use `TryRead`. See “API differences” below. -> * **TinyGo 0.39:** this currently requires the single-core scheduler. Build with `-scheduler tasks`. - ---- - -## Why use `uartx`? - -* **Correct, event-driven TX/RX**: uses hardware FIFOs and IRQs; foreground code does not poll during normal operation. -* **Clear Go semantics**: implements `io.Reader`, `io.Writer`, `io.ByteReader`, `io.ByteWriter` (via `ReadByte`/`WriteByte`) and a simple `Flusher` (`Flush()`). -* **Concurrent composability**: coalesced readiness channels `Readable()` and `Writable()` integrate naturally with `select` for fast, back-pressure-aware pacing. -* **On-the-wire completion**: `Flush()` waits for software buffer empty **and** hardware FIFO empty **and** the transmitter to go idle. -* **Production-oriented**: liveness at TX start, no foreground/ISR races, correct ordering of RX error handling, and minimal timed polling only where the hardware offers no interrupt (TX idle). - ---- - -## Supported targets - -* RP2040 / RP2350 (`go:build rp2040 || rp2350`) with PL011. 
- Other arches are build-gated but not implemented here. - ---- - -## Install - -```bash -go get github.com/jangala-dev/tinygo-uartx/uartx -``` - ---- - -## Quick start (RP2040) - -```go -package main - -import ( - "machine" - "time" - - "github.com/jangala-dev/tinygo-uartx/uartx" -) - -func main() { - u0 := uartx.UART0 - u1 := uartx.UART1 - - // Wire: U0 TX=GP0 -> U1 RX=GP5, U1 TX=GP4 -> U0 RX=GP1 - _ = u0.Configure(uartx.UARTConfig{BaudRate: 230400, TX: machine.Pin(0), RX: machine.Pin(1)}) - _ = u1.Configure(uartx.UARTConfig{BaudRate: 230400, TX: machine.Pin(4), RX: machine.Pin(5)}) - - // Writer: block until bytes are accepted by driver (SW TX ring and/or HW FIFO). - msg := []byte("hello, world\n") - _, _ = u0.Write(msg) - - // Reader: Read blocks until at least 1 byte is available, returns n>0, nil. - buf := make([]byte, 64) - n, _ := u1.Read(buf) - _, _ = u1.Write(buf[:n]) // echo back - - // Ensure everything went on the wire. - _ = u0.Flush() - _ = u1.Flush() - - for { time.Sleep(time.Second) } -} -``` - ---- - -## API overview - -### Blocking I/O - -* `Read(p []byte) (int, error)` - Blocks until **at least one byte** is available; returns `n>0, nil`. Does **not** return `io.EOF` for an idle UART. - -* `Write(p []byte) (int, error)` - Blocks until **all bytes are accepted** by the driver (software TX buffer and/or HW FIFO). Does **not** wait for the line to drain; see `Flush`. - -* `ReadByte() (byte, error)` - Non-blocking single-byte read from the software RX buffer. Returns `errUARTBufferEmpty` if no data is available. - -* `WriteByte(b byte) error` - Blocks until the byte is accepted by the driver. - -### Non-blocking helpers - -* `TryRead(p []byte) int` - Returns immediately with up to `len(p)` bytes from the RX buffer. `0` means “no data now”. -* `TryWrite(p []byte) int` - Returns immediately with `0..len(p)` bytes accepted into HW FIFO and/or SW TX buffer. `0` means “no space now”. 
- -### Readiness (for `select`) - -* `Readable() <-chan struct{}` - Coalesced notification: a receive interrupt that enqueues ≥1 byte sends a token. You **must re-check** state after waking (level→edge coalescer). -* `Writable() <-chan struct{}` - Coalesced notification: TX progress/space. Sent when bytes move SW→HW or space appears. Also level-coalesced; re-check state after waking. - -### Flush - -* `Flush() error` - Blocks until software TX buffer is empty, the HW TX FIFO is empty, **and** the transmitter is not busy. - Note: PL011 does not raise an interrupt for the final “idle” edge, so `Flush` uses a short timed poll (scaled to baud) in addition to readiness wakes. - -### Buffer introspection - -* `Buffered() int` – bytes in the RX ring. -* `TxFree() int` – free space in the SW TX ring. - -### Interfaces satisfied - -* `io.Reader`, `io.Writer`, `io.ByteReader`, `io.ByteWriter` -* `Flusher` (package-local `Flush() error`) - ---- - -## API differences vs TinyGo `machine.UART` - -| Behaviour | `machine.UART` (TinyGo) | `uartx` | -| ---------------------- | ------------------------ | --------------------------------------------- | -| `Read(p)` | **Non-blocking** | **Blocking** until ≥1 byte | -| Non-blocking read | `Read(p)` | `TryRead(p) int` | -| Non-blocking write | implementation-dependent | `TryWrite(p) int` | -| Event readiness | varied | `Readable()`, `Writable()` coalesced channels | -| On-the-wire completion | `Write(p)` | `Flush()` (FIFO empty **and** line idle) | -| Internals | polling/IRQ mix | **IRQ-driven**; HW FIFOs; minimal timed poll | - -If you migrate from `machine.UART`, audit any paths that relied on `Read` being non-blocking. Use `TryRead` or `Readable()` with `select` for non-blocking behaviour. 
- ---- - -## Concurrent patterns - -### Producer with pacing - -```go -func writeAll(u *uartx.UART, p []byte) { - sent := 0 - for sent < len(p) { - if n := u.TryWrite(p[sent:]); n > 0 { - sent += n - continue - } - <-u.Writable() // wait for space/progress; then re-check - } -} -``` - -### Consumer with timeout - -```go -func readSome(ctx context.Context, u *uartx.UART, p []byte) (int, error) { - if n := u.TryRead(p); n > 0 { return n, nil } - for { - select { - case <-u.Readable(): - if n := u.TryRead(p); n > 0 { return n, nil } - case <-ctx.Done(): - return 0, ctx.Err() - } - } -} -``` - -### Duplex with `select` - -```go -func pump(uIn, uOut *uartx.UART, buf []byte) { - for { - select { - case <-uIn.Readable(): - if n := uIn.TryRead(buf); n > 0 { - writeAll(uOut, buf[:n]) - } - case <-uOut.Writable(): - // optional: send pending application data - } - } -} -``` - ---- - -## Behavioural notes (RP2040/RP2350) - -* **Interrupt model**: RX uses level/timeout; TX uses level. Steady-state writes to the HW FIFO are performed in the ISR. Foreground only seeds the FIFO at TX start or in a guarded “masked kick” corner case; this avoids reordering. -* **Error handling**: framing/parity/overrun bytes are dropped on RX (read clears per-byte flags); sticky error status is cleared after draining. -* **Flush**: requires SW TX empty, FIFO empty and transmitter not busy. The final idle edge is not interrupt-driven on PL011; a short timed poll is used. 
- ---- - -## Example: integrity test (excerpt) - -```go -// Sender -func sendPattern(ctx context.Context, u *uartx.UART, gen func(int) byte, n int) error { - var buf [192]byte - for i := 0; i < n; { - k := len(buf) - if n-i < k { k = n - i } - for j := 0; j < k; j++ { buf[j] = gen(i+j) } - if _, err := sendAll(ctx, u, buf[:k]); err != nil { return err } - i += k - } - return nil -} - -func sendAll(ctx context.Context, u *uartx.UART, p []byte) (int, error) { - sent := 0 - for sent < len(p) { - if n := u.TryWrite(p[sent:]); n > 0 { sent += n; continue } - select { - case <-u.Writable(): - case <-ctx.Done(): return sent, ctx.Err() - } - } - return sent, nil -} -``` - ---- - -## Limitations and future work - -* Only RP2040/RP2350 implementation is included at present. -* RX overflow is dropped silently by default; add counters if required for diagnostics. -* CTS/RTS flow control is enabled only if both pins are configured; application-level tests advised. - ---- - -## Licence - -MIT, see `LICENCE` file. diff --git a/third_party/tinygo-uartx/uartx/rp2_uart.go b/third_party/tinygo-uartx/uartx/rp2_uart.go index 9a6b974..f2c8be6 100644 --- a/third_party/tinygo-uartx/uartx/rp2_uart.go +++ b/third_party/tinygo-uartx/uartx/rp2_uart.go @@ -40,11 +40,106 @@ type UART struct { baud uint32 // last configured baud (for diagnostics, not used by HW) - rxDrops volatile.Register32 - rxOverruns volatile.Register32 - rxBreaks volatile.Register32 - rxParity volatile.Register32 - rxFraming volatile.Register32 + stats uartStatsRegs +} + +// UARTStats is a non-atomic diagnostic snapshot. It is intended for coarse +// attribution while testing embedded UART paths, not for accounting-critical +// decisions. Counters may be sampled while the ISR is updating them. 
+type UARTStats struct { + RXIRQ uint32 + RXHWBytes uint32 + RXEnqueued uint32 + RXRingDrops uint32 + RXOverrun uint32 + RXBreak uint32 + RXParity uint32 + RXFraming uint32 + RXRingMax uint32 + RXReadBytes uint32 + RXReadEmpty uint32 + RXNotifyDrop uint32 + + TXIRQ uint32 + TXAccepted uint32 + TXHWBytes uint32 + TXRingFull uint32 + TXRingMax uint32 + TXTryCalls uint32 + TXNotifyDrop uint32 +} + +type uartStatsRegs struct { + rxIRQ volatile.Register32 + rxHWBytes volatile.Register32 + rxEnqueued volatile.Register32 + rxRingDrops volatile.Register32 + rxOverrun volatile.Register32 + rxBreak volatile.Register32 + rxParity volatile.Register32 + rxFraming volatile.Register32 + rxRingMax volatile.Register32 + rxReadBytes volatile.Register32 + rxReadEmpty volatile.Register32 + rxNotifyDrop volatile.Register32 + + txIRQ volatile.Register32 + txAccepted volatile.Register32 + txHWBytes volatile.Register32 + txRingFull volatile.Register32 + txRingMax volatile.Register32 + txTryCalls volatile.Register32 + txNotifyDrop volatile.Register32 +} + +func (uart *UART) inc(reg *volatile.Register32, n uint32) { reg.Set(reg.Get() + n) } + +func (uart *UART) observeRXRingUsed() { + u := uint32(uart.Buffer.Used()) + for { + old := uart.stats.rxRingMax.Get() + if u <= old { + return + } + uart.stats.rxRingMax.Set(u) + return + } +} + +func (uart *UART) observeTXRingUsed() { + u := uint32(uart.TxBuffer.Used()) + for { + old := uart.stats.txRingMax.Get() + if u <= old { + return + } + uart.stats.txRingMax.Set(u) + return + } +} + +// Stats returns a diagnostic snapshot of UART ISR and foreground counters. 
+func (uart *UART) Stats() UARTStats { + return UARTStats{ + RXIRQ: uart.stats.rxIRQ.Get(), RXHWBytes: uart.stats.rxHWBytes.Get(), RXEnqueued: uart.stats.rxEnqueued.Get(), RXRingDrops: uart.stats.rxRingDrops.Get(), RXOverrun: uart.stats.rxOverrun.Get(), RXBreak: uart.stats.rxBreak.Get(), RXParity: uart.stats.rxParity.Get(), RXFraming: uart.stats.rxFraming.Get(), RXRingMax: uart.stats.rxRingMax.Get(), RXReadBytes: uart.stats.rxReadBytes.Get(), RXReadEmpty: uart.stats.rxReadEmpty.Get(), RXNotifyDrop: uart.stats.rxNotifyDrop.Get(), + TXIRQ: uart.stats.txIRQ.Get(), TXAccepted: uart.stats.txAccepted.Get(), TXHWBytes: uart.stats.txHWBytes.Get(), TXRingFull: uart.stats.txRingFull.Get(), TXRingMax: uart.stats.txRingMax.Get(), TXTryCalls: uart.stats.txTryCalls.Get(), TXNotifyDrop: uart.stats.txNotifyDrop.Get(), + } +} + +// NoteRXRead records bytes drained from the UARTX software RX ring by a +// foreground consumer. It is diagnostic only; it deliberately does not alter +// UART data state. +func (uart *UART) NoteRXRead(n int) { + if n > 0 { + uart.inc(&uart.stats.rxReadBytes, uint32(n)) + return + } + uart.inc(&uart.stats.rxReadEmpty, 1) +} + +// ClearStats resets diagnostic counters. It does not alter UART data state. +func (uart *UART) ClearStats() { + uart.stats = uartStatsRegs{} } // Configure sets up the PL011, its pins and interrupts. It leaves RXIM/RTIM @@ -168,21 +263,6 @@ func (uart *UART) SetFormat(databits, stopbits uint8, parity UARTParity) error { return nil } -// RXDropCount reports bytes dropped because the software RX ring was full. -func (uart *UART) RXDropCount() uint32 { return uart.rxDrops.Get() } - -// RXOverrunCount reports PL011 RX overrun error bytes seen by the ISR. -func (uart *UART) RXOverrunCount() uint32 { return uart.rxOverruns.Get() } - -// RXBreakCount reports PL011 RX break error bytes seen by the ISR. 
-func (uart *UART) RXBreakCount() uint32 { return uart.rxBreaks.Get() } - -// RXParityCount reports PL011 RX parity error bytes seen by the ISR. -func (uart *UART) RXParityCount() uint32 { return uart.rxParity.Get() } - -// RXFramingCount reports PL011 RX framing error bytes seen by the ISR. -func (uart *UART) RXFramingCount() uint32 { return uart.rxFraming.Get() } - // initUART asserts and releases the peripheral reset for the selected PL011. func initUART(uart *UART) { var resetVal uint32 @@ -230,6 +310,7 @@ func (uart *UART) attemptSend(p []byte) int { break } uart.Bus.UARTDR.Set(uint32(b)) + uart.inc(&uart.stats.txHWBytes, 1) } // Arm TX interrupts; ISR takes over steady-state. uart.Bus.UARTIMSC.SetBits(rp.UART0_UARTIMSC_TXIM) @@ -252,6 +333,7 @@ func (uart *UART) attemptSend(p []byte) int { break } uart.Bus.UARTDR.Set(uint32(b)) + uart.inc(&uart.stats.txHWBytes, 1) } // Re-enable TX level interrupts. Next drop to/under IFLS will raise TX IRQ. @@ -286,6 +368,7 @@ func (uart *UART) tryWriteHW(p []byte) int { i := 0 for i < len(p) && !uart.Bus.UARTFR.HasBits(rp.UART0_UARTFR_TXFF) { uart.Bus.UARTDR.Set(uint32(p[i])) + uart.inc(&uart.stats.txHWBytes, 1) i++ } return i @@ -296,9 +379,11 @@ func (uart *UART) enqueueTX(p []byte) int { i := 0 for i < len(p) { if ok := uart.TxBuffer.Put(p[i]); !ok { + uart.inc(&uart.stats.txRingFull, 1) break } i++ + uart.observeTXRingUsed() } return i } @@ -318,22 +403,36 @@ func (uart *UART) handleInterrupt(interrupt.Interrupt) { // RX path (RX level or RX timeout). if (mis & (rp.UART0_UARTMIS_RXMIS | rp.UART0_UARTMIS_RTMIS)) != 0 { + uart.inc(&uart.stats.rxIRQ, 1) // In the ISR, only notify if at least one byte was enqueued. 
enq := 0 for !uart.Bus.UARTFR.HasBits(rp.UART0_UARTFR_RXFE) { r := uart.Bus.UARTDR.Get() - errs := r & (rp.UART0_UARTDR_OE | rp.UART0_UARTDR_BE | - rp.UART0_UARTDR_PE | rp.UART0_UARTDR_FE) - if errs != 0 { - uart.noteRXErrors(errs) + uart.inc(&uart.stats.rxHWBytes, 1) + if (r & (rp.UART0_UARTDR_OE | rp.UART0_UARTDR_BE | + rp.UART0_UARTDR_PE | rp.UART0_UARTDR_FE)) != 0 { + if (r & rp.UART0_UARTDR_OE) != 0 { + uart.inc(&uart.stats.rxOverrun, 1) + } + if (r & rp.UART0_UARTDR_BE) != 0 { + uart.inc(&uart.stats.rxBreak, 1) + } + if (r & rp.UART0_UARTDR_PE) != 0 { + uart.inc(&uart.stats.rxParity, 1) + } + if (r & rp.UART0_UARTDR_FE) != 0 { + uart.inc(&uart.stats.rxFraming, 1) + } // Drop errored byte; reading DR clears the per-byte error flags. continue } if uart.Buffer.Put(byte(r & 0xFF)) { enq++ + uart.inc(&uart.stats.rxEnqueued, 1) + uart.observeRXRingUsed() } else { - incrementRegister32(&uart.rxDrops) + uart.inc(&uart.stats.rxRingDrops, 1) } } @@ -346,12 +445,14 @@ func (uart *UART) handleInterrupt(interrupt.Interrupt) { select { case uart.notify <- struct{}{}: default: + uart.inc(&uart.stats.rxNotifyDrop, 1) } } } // TX path (TX level). if mis&rp.UART0_UARTMIS_TXMIS != 0 { + uart.inc(&uart.stats.txIRQ, 1) // Move bytes from SW buffer to HW FIFO. for !uart.Bus.UARTFR.HasBits(rp.UART0_UARTFR_TXFF) { @@ -360,12 +461,14 @@ func (uart *UART) handleInterrupt(interrupt.Interrupt) { break } uart.Bus.UARTDR.Set(uint32(b)) + uart.inc(&uart.stats.txHWBytes, 1) } // Coalesce a Writable notification (space/progress). select { case uart.txNotify <- struct{}{}: default: + uart.inc(&uart.stats.txNotifyDrop, 1) } // If SW buffer empty, manage the tail. 
@@ -384,22 +487,3 @@ func (uart *UART) handleInterrupt(interrupt.Interrupt) { uart.Bus.UARTICR.Set(rp.UART0_UARTICR_TXIC) } } - -func (uart *UART) noteRXErrors(errs uint32) { - if errs&rp.UART0_UARTDR_OE != 0 { - incrementRegister32(&uart.rxOverruns) - } - if errs&rp.UART0_UARTDR_BE != 0 { - incrementRegister32(&uart.rxBreaks) - } - if errs&rp.UART0_UARTDR_PE != 0 { - incrementRegister32(&uart.rxParity) - } - if errs&rp.UART0_UARTDR_FE != 0 { - incrementRegister32(&uart.rxFraming) - } -} - -func incrementRegister32(r *volatile.Register32) { - r.Set(r.Get() + 1) -} diff --git a/tools/fabric_uart_xfer_smoke.py b/tools/fabric_uart_xfer_smoke.py new file mode 100755 index 0000000..4c8ea16 --- /dev/null +++ b/tools/fabric_uart_xfer_smoke.py @@ -0,0 +1,359 @@ +#!/usr/bin/env python3 +"""Direct fabric-jsonl/1 UART smoke test for the MCU updater/main transfer path. + +This is a host-side diagnostic helper. It speaks the CM5 side of the MCU Fabric +link directly over a serial TTY, without starting the Lua services. It is meant +for the MCU build tagged `fabric_uart_hwtest`, where updater/main staging is a +safe digest/count sink rather than the production A/B flash writer. + +Example: + + python3 tools/fabric_uart_xfer_smoke.py /dev/cu.usbserial-110 --size 1024 + +The script performs: + + hello -> hello_ack + prepare-update RPC + xfer_begin / xfer_chunk* / xfer_commit to updater/main + waits for xfer_done and, where visible, state/self/updater=staged + +It deliberately does not require a successful commit-update. In the default +hardware-test build the applier is still refusing, so commit-update should be +left to a later gate. 
+""" + +from __future__ import annotations + +import argparse +import base64 +import json +import os +import select +import sys +import termios +import time +import tty +from dataclasses import dataclass +from typing import Any, Dict, Iterable, List, Optional + +PROTO = "fabric-jsonl/1" +DEFAULT_NODE = "bigbox-cm5" +DEFAULT_PEER = "mcu" +DEFAULT_TARGET = "updater/main" +DEFAULT_EXPECTED_IMAGE = "hwtest-image" +DEFAULT_BAUD = 115200 + +# Reflected polynomial constants for xxHash32, seed 0. +P1 = 0x9E3779B1 +P2 = 0x85EBCA77 +P3 = 0xC2B2AE3D +P4 = 0x27D4EB2F +P5 = 0x165667B1 + + +def _u32(v: int) -> int: + return v & 0xFFFFFFFF + + +def _rotl32(x: int, r: int) -> int: + return _u32((x << r) | (x >> (32 - r))) + + +def _round(acc: int, lane: int) -> int: + acc = _u32(acc + lane * P2) + acc = _rotl32(acc, 13) + acc = _u32(acc * P1) + return acc + + +def _read32le(data: bytes, off: int) -> int: + return data[off] | (data[off + 1] << 8) | (data[off + 2] << 16) | (data[off + 3] << 24) + + +def xxhash32(data: bytes, seed: int = 0) -> int: + n = len(data) + p = 0 + if n >= 16: + v1 = _u32(seed + P1 + P2) + v2 = _u32(seed + P2) + v3 = _u32(seed) + v4 = _u32(seed - P1) + limit = n - 16 + while p <= limit: + v1 = _round(v1, _read32le(data, p)); p += 4 + v2 = _round(v2, _read32le(data, p)); p += 4 + v3 = _round(v3, _read32le(data, p)); p += 4 + v4 = _round(v4, _read32le(data, p)); p += 4 + h = _u32(_rotl32(v1, 1) + _rotl32(v2, 7) + _rotl32(v3, 12) + _rotl32(v4, 18)) + else: + h = _u32(seed + P5) + h = _u32(h + n) + while p + 4 <= n: + h = _u32(h + _read32le(data, p) * P3) + h = _u32(_rotl32(h, 17) * P4) + p += 4 + while p < n: + h = _u32(h + data[p] * P5) + h = _u32(_rotl32(h, 11) * P1) + p += 1 + h ^= h >> 15 + h = _u32(h * P2) + h ^= h >> 13 + h = _u32(h * P3) + h ^= h >> 16 + return _u32(h) + + +def digest_hex(data: bytes) -> str: + return f"{xxhash32(data):08x}" + + +def b64url_unpadded(data: bytes) -> str: + return base64.urlsafe_b64encode(data).decode("ascii").rstrip("=") 
+ + +@dataclass +class SerialRawConfig: + old_attrs: List[Any] + + +class FabricTTY: + def __init__(self, path: str, baud: int, verbose: bool = False) -> None: + self.path = path + self.verbose = verbose + self.fd = os.open(path, os.O_RDWR | os.O_NOCTTY | os.O_NONBLOCK) + self._config = self._configure(baud) + self._rx = bytearray() + + def _configure(self, baud: int) -> SerialRawConfig: + old = termios.tcgetattr(self.fd) + attrs = termios.tcgetattr(self.fd) + tty.setraw(self.fd, termios.TCSANOW) + attrs = termios.tcgetattr(self.fd) + speed = getattr(termios, f"B{baud}", None) + if speed is None: + raise RuntimeError(f"unsupported baud {baud} on this platform") + attrs[4] = speed + attrs[5] = speed + attrs[2] |= termios.CLOCAL | termios.CREAD + if hasattr(termios, "CRTSCTS"): + attrs[2] &= ~termios.CRTSCTS + attrs[2] &= ~termios.CSTOPB + attrs[2] &= ~termios.PARENB + attrs[2] &= ~termios.CSIZE + attrs[2] |= termios.CS8 + attrs[6][termios.VMIN] = 0 + attrs[6][termios.VTIME] = 0 + termios.tcsetattr(self.fd, termios.TCSANOW, attrs) + termios.tcflush(self.fd, termios.TCIOFLUSH) + return SerialRawConfig(old_attrs=old) + + def close(self) -> None: + try: + termios.tcsetattr(self.fd, termios.TCSANOW, self._config.old_attrs) + finally: + os.close(self.fd) + + def write_msg(self, msg: Dict[str, Any]) -> None: + line = json.dumps(msg, separators=(",", ":")).encode("utf-8") + b"\n" + if self.verbose: + print(">", line.decode("utf-8").rstrip()) + off = 0 + while off < len(line): + try: + n = os.write(self.fd, line[off:]) + except BlockingIOError: + select.select([], [self.fd], [], 0.25) + continue + if n > 0: + off += n + + def read_msg(self, timeout_s: float) -> Dict[str, Any]: + deadline = time.monotonic() + timeout_s + while True: + newline = self._rx.find(b"\n") + if newline >= 0: + raw = bytes(self._rx[:newline]).strip() + del self._rx[: newline + 1] + if not raw: + continue + try: + msg = json.loads(raw.decode("utf-8")) + except json.JSONDecodeError: + print("! 
ignoring malformed line from peer:", raw.decode("utf-8", "replace"), file=sys.stderr) + continue + if self.verbose: + print("<", json.dumps(msg, separators=(",", ":"))) + return msg + remaining = deadline - time.monotonic() + if remaining <= 0: + raise TimeoutError("timed out waiting for fabric frame") + r, _, _ = select.select([self.fd], [], [], min(0.25, remaining)) + if not r: + continue + try: + chunk = os.read(self.fd, 4096) + except BlockingIOError: + continue + if not chunk: + continue + self._rx.extend(chunk) + + +def wait_for(ttydev: FabricTTY, want_type: str, timeout_s: float, *, want_id: Optional[str] = None) -> Dict[str, Any]: + deadline = time.monotonic() + timeout_s + while True: + remaining = deadline - time.monotonic() + if remaining <= 0: + raise TimeoutError(f"timed out waiting for {want_type}") + msg = ttydev.read_msg(remaining) + mtype = msg.get("type") + if mtype == "ping": + ttydev.write_msg({"type": "pong", "sid": msg.get("sid", "")}) + continue + if mtype == "pub": + topic = "/".join(str(x) for x in msg.get("topic", [])) + payload = msg.get("payload") + if topic in {"state/self/software", "state/self/updater", "state/self/health"}: + print(f"pub {topic}: {json.dumps(payload, separators=(',', ':'))}") + if want_type == "pub" and (want_id is None or topic == want_id): + return msg + continue + if mtype == want_type and (want_id is None or msg.get("id") == want_id or msg.get("xfer_id") == want_id): + return msg + if mtype == "xfer_abort": + raise RuntimeError(f"peer aborted transfer {msg.get('xfer_id')}: {msg.get('err', '')}") + if mtype == "reply" and want_type == "reply" and want_id is not None and msg.get("id") != want_id: + continue + # Other protocol frames are expected during bring-up and retained export. + + +def payload_bytes(size: int) -> bytes: + # Deterministic content with enough variation to exercise chunk digests. 
+ return bytes(((i * 37 + 11) & 0xFF) for i in range(size)) + + +def transfer(ttydev: FabricTTY, xfer_id: str, target: str, payload: bytes, chunk_size: int, timeout_s: float) -> None: + whole = digest_hex(payload) + ttydev.write_msg({ + "type": "xfer_begin", + "xfer_id": xfer_id, + "target": target, + "size": len(payload), + "digest_alg": "xxhash32", + "digest": whole, + "meta": {"source": "tools/fabric_uart_xfer_smoke.py"}, + }) + wait_for(ttydev, "xfer_ready", timeout_s, want_id=xfer_id) + off = 0 + while off < len(payload): + part = payload[off : off + chunk_size] + ttydev.write_msg({ + "type": "xfer_chunk", + "xfer_id": xfer_id, + "offset": off, + "data": b64url_unpadded(part), + "chunk_digest": digest_hex(part), + }) + off += len(part) + need = wait_for(ttydev, "xfer_need", timeout_s, want_id=xfer_id) + nxt = int(need.get("next", -1)) + if nxt != off: + raise RuntimeError(f"unexpected xfer_need next={nxt}, want {off}") + print(f"chunk ack next={off}") + ttydev.write_msg({ + "type": "xfer_commit", + "xfer_id": xfer_id, + "size": len(payload), + "digest_alg": "xxhash32", + "digest": whole, + }) + wait_for(ttydev, "xfer_done", timeout_s, want_id=xfer_id) + + +def main(argv: Optional[Iterable[str]] = None) -> int: + p = argparse.ArgumentParser(description="Direct fabric-jsonl/1 UART updater/main transfer smoke test") + p.add_argument("tty", help="serial device connected to the MCU Fabric UART") + p.add_argument("--baud", type=int, default=DEFAULT_BAUD) + p.add_argument("--size", type=int, default=1024, help="test payload bytes") + p.add_argument("--chunk-size", type=int, default=256, help="raw bytes per xfer_chunk; keep <= 2048") + p.add_argument("--timeout", type=float, default=10.0) + p.add_argument("--node", default=DEFAULT_NODE) + p.add_argument("--peer", default=DEFAULT_PEER) + p.add_argument("--target", default=DEFAULT_TARGET) + p.add_argument("--expected-image", default=DEFAULT_EXPECTED_IMAGE) + p.add_argument("--job-id", default=None) + 
p.add_argument("--verbose", action="store_true") + args = p.parse_args(list(argv) if argv is not None else None) + + if args.chunk_size <= 0 or args.chunk_size > 2048: + p.error("--chunk-size must be in 1..2048") + if args.size <= 0: + p.error("--size must be positive") + + sid = f"cm5-smoke-{int(time.time())}" + job_id = args.job_id or f"smoke-{int(time.time())}" + xfer_id = f"xfer-{job_id}" + payload = payload_bytes(args.size) + + ttydev = FabricTTY(args.tty, args.baud, args.verbose) + try: + print(f"hello sid={sid} node={args.node} peer={args.peer}") + ttydev.write_msg({"type": "hello", "proto": PROTO, "sid": sid, "node": args.node}) + ack = wait_for(ttydev, "hello_ack", args.timeout) + if ack.get("node") != args.peer or ack.get("proto") != PROTO: + raise RuntimeError(f"bad hello_ack: {ack}") + print(f"link up peer_sid={ack.get('sid')}") + + call_id = f"prepare-{job_id}" + ttydev.write_msg({ + "type": "call", + "id": call_id, + "topic": ["cap", "self", "updater", "main", "rpc", "prepare-update"], + "payload": { + "job_id": job_id, + "target": "mcu", + "expected_image_id": args.expected_image, + "metadata": {"source": "fabric_uart_xfer_smoke"}, + }, + }) + reply = wait_for(ttydev, "reply", args.timeout, want_id=call_id) + if not reply.get("ok"): + raise RuntimeError(f"prepare rejected: {reply}") + prep = reply.get("payload") or {} + if prep.get("target") != args.target: + raise RuntimeError(f"prepare returned target {prep.get('target')!r}, want {args.target!r}") + max_chunk = int(prep.get("max_chunk_size") or 0) + if max_chunk and args.chunk_size > max_chunk: + raise RuntimeError(f"chunk-size {args.chunk_size} exceeds prepare max_chunk_size {max_chunk}") + print(f"prepare ok target={prep.get('target')} max_chunk_size={max_chunk or 'unknown'}") + + print(f"transfer xfer_id={xfer_id} size={len(payload)} digest={digest_hex(payload)} chunk_size={args.chunk_size}") + transfer(ttydev, xfer_id, args.target, payload, args.chunk_size, args.timeout) + print("transfer 
done") + + # Give retained state export a brief chance to show staged state. The + # transfer result is authoritative for this smoke test, so this is + # informational rather than a hard requirement. + deadline = time.monotonic() + 3.0 + while time.monotonic() < deadline: + try: + msg = ttydev.read_msg(max(0.1, deadline - time.monotonic())) + except TimeoutError: + break + if msg.get("type") == "ping": + ttydev.write_msg({"type": "pong", "sid": msg.get("sid", "")}) + continue + if msg.get("type") == "pub": + topic = "/".join(str(x) for x in msg.get("topic", [])) + payload_obj = msg.get("payload") + if topic in {"state/self/software", "state/self/updater", "state/self/health"}: + print(f"pub {topic}: {json.dumps(payload_obj, separators=(',', ':'))}") + print("smoke ok") + return 0 + finally: + ttydev.close() + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/x/xxhash/xxhash.go b/x/xxhash/xxhash.go index 9e319c1..ebd517e 100644 --- a/x/xxhash/xxhash.go +++ b/x/xxhash/xxhash.go @@ -3,7 +3,7 @@ // // This package mirrors devicecode-lua/src/shared/hash/xxhash32.lua at // update-migration tip (commit 2c88090). It is used for fabric wire-protocol -// integrity (xfer_begin / xfer_commit checksum field) and for HAL artefact +// integrity (xfer_begin / xfer_commit digest field) and for HAL artefact // hashing. It is not a security primitive. package xxhash