diff --git a/docs/naming.md b/docs/naming.md index 21839bf01..04e59cf16 100644 --- a/docs/naming.md +++ b/docs/naming.md @@ -985,7 +985,7 @@ Raw imported member truth: * `raw/member/mcu/meta` * `raw/member/mcu/status` * `raw/member/mcu/state/...` -* `raw/member/mcu/cap/updater/main/...` +* `cap/self/updater/main/rpc/...` Fabric domain summaries: diff --git a/src/configs/bigbox-v1-cm-2.json b/src/configs/bigbox-v1-cm-2.json index 4152ae879..2db0c537f 100644 --- a/src/configs/bigbox-v1-cm-2.json +++ b/src/configs/bigbox-v1-cm-2.json @@ -1125,13 +1125,11 @@ "peer_id": "mcu", "transport": { "kind": "uart", - "source": "uart", + "source": "uart_uart0", "class": "uart", "id": "uart0", - "terminator": "\n", - "open_opts": { - "baud": 115200 - } + "cap_wait_timeout_s": 60, + "terminator": "\n" }, "session": { "identity_claim": { diff --git a/src/devicecode/main.lua b/src/devicecode/main.lua index 76b9249b2..4c79a1f96 100644 --- a/src/devicecode/main.lua +++ b/src/devicecode/main.lua @@ -59,6 +59,66 @@ local function cleanup_child_scope(child, reason) child:cancel(reason or 'cleanup') end +local function shallow_copy(t) + local out = {} + for k, v in pairs(t or {}) do + out[k] = v + end + return out +end + +local function env_value(getenv, name) + local v = getenv(name) + if v == nil or v == '' then return nil end + return v +end + +local function build_service_opts(base_opts, getenv) + getenv = getenv or os.getenv + local out = shallow_copy(base_opts) + + local ui_password = env_value(getenv, 'DEVICECODE_UI_ADMIN_PASSWORD') + if ui_password == nil then + return out + end + + local ui_opts = shallow_copy(out.ui) + if ui_opts.auth == nil and ui_opts.auth_opts == nil then + local username = env_value(getenv, 'DEVICECODE_UI_ADMIN_USERNAME') or 'admin' + ui_opts.auth_opts = { + users = { + [username] = { + password = ui_password, + principal = authz.user_principal(username, { roles = { 'admin' } }), + }, + }, + } + end + out.ui = ui_opts + + return out +end + +local function service_start_opts(name, env, connect, extra_opts) + local out = shallow_copy(extra_opts) + out.name = name + out.env = env + out.connect = connect + return out +end + +local function auth_user_summary(opts) + local users = opts and opts.auth_opts and opts.auth_opts.users + if type(users) ~= 'table' then return nil, nil end + local count = 0 + local first = nil + for username in pairs(users) do + count = count + 1 + first = first or tostring(username) + end + return count, first +end + local function spawn_service(child, bus, name, mod, env, extra_opts) return child:spawn(function() local conn = bus:connect({ @@ -71,14 +131,7 @@ local function spawn_service(child, bus, name, mod, env, extra_opts) }) end - mod.start(conn, { - name = name, - env = env, - connect = connect_as, - services = extra_opts and extra_opts.services or nil, - run_http = extra_opts and extra_opts.run_http or nil, - verify_login = extra_opts and extra_opts.verify_login or nil, - }) + mod.start(conn, service_start_opts(name, env, connect_as, extra_opts)) error(('service returned unexpectedly: %s'):format(tostring(name)), 0) end) @@ -186,7 +239,7 @@ function M.run(scope, params) local service_loader = params.service_loader or function(name) return require('services.' .. name) end - local service_opts = params.service_opts or {} + local service_opts = build_service_opts(params.service_opts) local main_conn = bus:connect({ principal = authz.service_principal('main'), @@ -215,6 +268,18 @@ function M.run(scope, params) fail_boot(main_conn, name, 'load_failed', lerr) end + if name == 'ui' then + local user_count, first_user = auth_user_summary(service_opts[name]) + if user_count then + main_conn:publish({ 'obs', 'log', 'main', 'info' }, { + what = 'ui_auth_configured', + source = 'service_opts', + users = user_count, + first_user = first_user, + }) + end + end + local child, cerr = scope:child() if not child then fail_boot(main_conn, name, 'child_scope_failed', cerr) @@ -305,4 +370,10 @@ function M.run(scope, params) end end +M._test = { + build_service_opts = build_service_opts, + service_start_opts = service_start_opts, + auth_user_summary = auth_user_summary, +} + return M diff --git a/src/main.lua b/src/main.lua index fe3a9079a..295eee607 100644 --- a/src/main.lua +++ b/src/main.lua @@ -22,6 +22,7 @@ else add_path('../vendor/lua-fibers/src/') add_path('../vendor/lua-bus/src/') add_path('../vendor/lua-trie/src/') + add_path('./') end local fibers = require 'fibers' diff --git a/src/services/device/action_worker.lua b/src/services/device/action_worker.lua index 242aea9b2..0841fa8e1 100644 --- a/src/services/device/action_worker.lua +++ b/src/services/device/action_worker.lua @@ -11,6 +11,7 @@ local scope_mod = require 'fibers.scope' local request_owner = require 'devicecode.support.request_owner' local resource = require 'devicecode.support.resource' local fabric_stage = require 'services.device.fabric_stage' +local store_bus = require 'services.update.artifacts.store_bus' local M = {} @@ -19,6 +20,13 @@ local function request_payload(req) return req.payload end +local function shallow_copy(t) + if type(t) ~= 'table' then return t end + local out = {} + for k, v in pairs(t) do out[k] = v end + return out +end + local function default_call_op(conn, topic, payload, opts) if type(conn) == 'table' and type(conn.call_op) == 'function' then return conn:call_op(topic, payload, opts) @@ -27,6 +35,49 @@ local function default_call_op(conn, topic, payload, opts) return nil, 'connection does not support call_op' end +local function topic_string(topic) + if type(topic) ~= 'table' then return nil end + local out = {} + for i = 1, #topic do out[i] = tostring(topic[i]) end + return table.concat(out, '/') +end + +local function prepare_action(ctx) + return ctx and ctx.action == 'prepare-update' +end + +local function scalar_payload_field(payload, key) + if type(payload) ~= 'table' then return nil end + local v = payload[key] + local tv = type(v) + if tv == 'string' or tv == 'number' or tv == 'boolean' then + return tostring(v) + end + return nil +end + +local function print_action_rpc_diag(ctx, event, payload, fields) + if not prepare_action(ctx) then return end + fields = fields or {} + local action = ctx.action_spec or {} + local parts = { '[device-action-rpc]', 'ev', event } + local function add(key, value) + if value == nil or value == '' then return end + parts[#parts + 1] = key + parts[#parts + 1] = tostring(value) + end + add('component', ctx.component_id) + add('action', ctx.action) + add('request_id', ctx.request_id) + add('topic', topic_string(action.call_topic)) + add('job_id', scalar_payload_field(payload, 'job_id')) + add('expected_image_id', scalar_payload_field(payload, 'expected_image_id')) + add('duration_ms', fields.duration_ms) + add('status', fields.status) + add('err', fields.err) + print(table.concat(parts, ' ')) +end + local function normalise_reply(reply) if type(reply) == 'table' and reply.ok == false then return nil, reply.reason or reply.err or 'action failed' @@ -78,6 +129,9 @@ local function public_reply_payload(result) return sanitise_public_payload(result) end +local open_source_op +local source_terminator + local function finalise_owner(owner, status, primary) if owner:done() then return end if status == 'cancelled' then @@ -89,9 +143,10 @@ local function finalise_owner(owner, status, primary) end end -local function run_rpc_op(ctx, owner) +local function rpc_call_op(ctx, owner, payload) local action = ctx.action_spec - local payload = request_payload(ctx.request) + local started = fibers.now() + print_action_rpc_diag(ctx, 'component_action_rpc_start', payload) local call_ev, cerr = default_call_op(ctx.conn, action.call_topic, payload, { timeout = action.timeout or ctx.timeout, deadline = ctx.deadline, @@ -100,6 +155,11 @@ local function run_rpc_op(ctx, owner) if not call_ev then local reason = cerr or 'rpc_unavailable' owner:fail_once(reason) + print_action_rpc_diag(ctx, 'component_action_rpc_done', payload, { + duration_ms = math.floor(((fibers.now() - started) * 1000) + 0.5), + status = 'unavailable', + err = reason, + }) return op.always(public_result('unavailable', { err = reason })) end @@ -107,25 +167,83 @@ local function run_rpc_op(ctx, owner) if reply == nil then local reason = err or 'rpc_failed' owner:fail_once(reason) + print_action_rpc_diag(ctx, 'component_action_rpc_done', payload, { + duration_ms = math.floor(((fibers.now() - started) * 1000) + 0.5), + status = 'remote_failed', + err = reason, + }) return public_result('remote_failed', { err = reason }) end local value, rerr = normalise_reply(reply) if value == nil and rerr ~= nil then owner:fail_once(rerr) + print_action_rpc_diag(ctx, 'component_action_rpc_done', payload, { + duration_ms = math.floor(((fibers.now() - started) * 1000) + 0.5), + status = 'remote_failed', + err = rerr, + }) return public_result('remote_failed', { err = rerr }) end owner:reply_once(value) + print_action_rpc_diag(ctx, 'component_action_rpc_done', payload, { + duration_ms = math.floor(((fibers.now() - started) * 1000) + 0.5), + status = 'succeeded', + }) return public_result('succeeded', { value = value, reply_payload = value }) end) end +local function rpc_stage_needs_source(ctx, payload) + if ctx.action ~= 'stage-update' or type(payload) ~= 'table' then return false end + if payload.source ~= nil or payload.artifact ~= nil then return false end + return payload.artifact_ref ~= nil or payload.ref ~= nil +end + +local function run_rpc_op(ctx, owner) + local payload = request_payload(ctx.request) + if not rpc_stage_needs_source(ctx, payload) then + return rpc_call_op(ctx, owner, payload) + end + + return fibers.run_scope_op(function (source_scope) + local source, err = fibers.perform(open_source_op(ctx)) + if not source then + owner:fail_once(err or 'source_required') + return public_result('rejected', { err = err or 'source_required' }) + end + + local owned = resource.owned(source, { + terminate = source_terminator(ctx), + label = 'device rpc-stage source termination', + }) + source_scope:finally(function (_, status, primary) + owned:terminate_checked( + primary or status or 'rpc_stage_terminated', + 'device rpc-stage source termination failed' + ) + end) + + local call_payload = shallow_copy(payload) + call_payload.source = owned:value() + return fibers.perform(rpc_call_op(ctx, owner, call_payload)) + end):wrap(function (st, _rep, result_or_primary) + if st == 'ok' then + return result_or_primary + end + + local reason = result_or_primary or st or 'rpc_stage_failed' + owner:fail_once(reason) + return public_result(st == 'cancelled' and 'cancelled' or 'failed', { err = reason }) + end) +end + local function is_op(v) return type(v) == 'table' and getmetatable(v) == op.Op end -local function open_source_op(ctx) +function open_source_op(ctx) local payload = request_payload(ctx.request) or {} local opener = ctx.open_source_op @@ -145,11 +263,17 @@ local function open_source_op(ctx) if payload.source ~= nil then return op.always(payload.source, nil) end if payload.artifact ~= nil then return op.always(payload.artifact, nil) end + if payload.artifact_ref ~= nil or payload.ref ~= nil then + local action = ctx.action_spec or {} + local store_id = payload.artifact_store or action.artifact_store or 'main' + local store = store_bus.new(ctx.conn, { id = store_id }) + return store:open_source_op(payload.artifact_ref or payload.ref) + end return op.always(nil, 'source_required') end -local function source_terminator(ctx) +function source_terminator(ctx) local terminate = ctx.terminate_source if terminate ~= nil and type(terminate) ~= 'function' then error('terminate_source must be a function', 0) @@ -286,7 +410,7 @@ function M.run(scope, ctx) error('device action worker must run in its owning scope', 0) end local req = assert(ctx.request, 'device action worker requires request') - local action_spec = assert(ctx.action_spec, 'device action worker requires action_spec') + assert(ctx.action_spec, 'device action worker requires action_spec') local owner = ctx.request_owner or request_owner.new(req, ctx.request_owner_opts) -- When action work is launched through action_manager, the request owner is diff --git a/src/services/device/catalogue.lua b/src/services/device/catalogue.lua index b330950fe..d457bbe06 100644 --- a/src/services/device/catalogue.lua +++ b/src/services/device/catalogue.lua @@ -11,6 +11,8 @@ local tablex = require 'shared.table' local M = {} +local DEFAULT_FABRIC_STAGE_TIMEOUT_S = 15 * 60 + local copy_array = tablex.array_copy local copy_value = tablex.deep_copy local deep_equal = tablex.deep_equal @@ -168,7 +170,7 @@ local function normalise_actions(actions, where) target = target, chunk_size = opt_pos_int(spec.chunk_size, where .. ': action ' .. action_name .. ' chunk_size', nil), artifact_store = spec.artifact_store or 'main', - timeout = tonumber(spec.timeout_s) or nil, + timeout = tonumber(spec.timeout_s) or DEFAULT_FABRIC_STAGE_TIMEOUT_S, dependency = copy_value(spec.dependency), } else @@ -269,8 +271,9 @@ local function default_components() ['stage-update'] = { kind = 'fabric_stage', target = 'updater/main', - chunk_size = 2048, + chunk_size = 1024, artifact_store = 'main', + timeout_s = DEFAULT_FABRIC_STAGE_TIMEOUT_S, }, ['commit-update'] = { kind = 'rpc', call_topic = topics.raw_member_cap_rpc('mcu', 'updater', 'main', 'commit-update') }, }, diff --git a/src/services/device/model.lua b/src/services/device/model.lua index 461a041b9..16c2760c8 100644 --- a/src/services/device/model.lua +++ b/src/services/device/model.lua @@ -30,6 +30,34 @@ local function empty_snapshot() } end +local function fabric_origin(origin) + local extra = type(origin) == 'table' and origin.extra or nil + local fabric = type(extra) == 'table' and extra.fabric or nil + if type(fabric) ~= 'table' then return nil end + local session = type(fabric.session) == 'table' and fabric.session or {} + local fabric_kind = fabric.kind + local allow_origin_fallback = type(fabric_kind) == 'string' + and fabric_kind:match('^remote_') ~= nil + local out = { + kind = fabric.kind, + link_id = fabric.link_id, + link_generation = fabric.link_generation, + peer_sid = fabric.peer_sid + or session.peer_sid + or (allow_origin_fallback and origin.peer_sid or nil), + session_generation = fabric.session_generation + or session.session_generation + or (allow_origin_fallback and origin.session_generation or nil), + } + local has = false + for _, v in pairs(out) do + if v ~= nil then + has = true + break + end + end + return has and out or nil +end local function recompute_component_status(rec) if type(rec) ~= 'table' then return rec end @@ -168,6 +196,7 @@ function DeviceModel:apply_observation(generation, observation) rec.fact_state[fact] = rec.fact_state[fact] or {} rec.fact_state[fact].seen = observation.payload ~= nil rec.fact_state[fact].updated_at = ts + rec.fact_state[fact].fabric = fabric_origin(observation.origin) rec.source_up = true rec.source_err = nil elseif tag == 'fact_unretained' then @@ -179,6 +208,7 @@ function DeviceModel:apply_observation(generation, observation) rec.fact_state[fact] = rec.fact_state[fact] or {} rec.fact_state[fact].seen = false rec.fact_state[fact].updated_at = ts + rec.fact_state[fact].fabric = nil elseif tag == 'event' or tag == 'event_seen' then local event = observation.event if type(event) ~= 'string' or event == '' then diff --git a/src/services/device/observer.lua b/src/services/device/observer.lua index 041cdba5a..4d21a3a19 100644 --- a/src/services/device/observer.lua +++ b/src/services/device/observer.lua @@ -196,29 +196,30 @@ function M.run(scope, ctx) return { reason = 'fact_closed', fact = item.name } end - if refreshes_freshness(item) then - stale_latched = false - stale_deadline = (type(stale_after_s) == 'number' and stale_after_s > 0) and (fibers.now() + stale_after_s) or nil - end - if item.ev.op == 'retain' then local raw = recv_payload(item.ev) local payload, nerr = normalise_with_component(rec, 'fact', item.name, raw) if nerr ~= nil then emit({ tag = 'source_down', reason = 'bad_fact:' .. item.name .. ':' .. tostring(nerr) }) else - emit({ tag = 'fact_retained', fact = item.name, payload = payload, raw = model.copy_value(raw) }) + emit({ + tag = 'fact_retained', + fact = item.name, + payload = payload, + raw = model.copy_value(raw), + origin = item.ev.origin, + }) + if refreshes_freshness(item) then + stale_latched = false + stale_deadline = (type(stale_after_s) == 'number' and stale_after_s > 0) and (fibers.now() + stale_after_s) or nil + end end elseif item.ev.op == 'unretain' then - emit({ tag = 'fact_unretained', fact = item.name }) + emit({ tag = 'fact_unretained', fact = item.name, origin = item.ev.origin }) + elseif item.ev.op == 'replay_done' then + -- Replay lifecycle metadata is not component state. else - local raw = recv_payload(item.ev) - local payload, nerr = normalise_with_component(rec, 'fact', item.name, raw) - if nerr ~= nil then - emit({ tag = 'source_down', reason = 'bad_fact:' .. item.name .. ':' .. tostring(nerr) }) - else - emit({ tag = 'fact_retained', fact = item.name, payload = payload, raw = model.copy_value(raw) }) - end + emit({ tag = 'source_down', reason = 'unknown_fact_event:' .. item.name .. ':' .. tostring(item.ev.op) }) end elseif item.kind == 'event' then if not item.msg then diff --git a/src/services/device/projection.lua b/src/services/device/projection.lua index d14cca08d..fe6431840 100644 --- a/src/services/device/projection.lua +++ b/src/services/device/projection.lua @@ -108,6 +108,92 @@ local function derive_health(status, updater_state, explicit_health) return 'ok' end +local MCU_CRITICAL_FACTS = { 'software', 'updater', 'health' } + +local function fact_control_state(rec, fact) + local state = type(rec.fact_state) == 'table' and rec.fact_state[fact] or nil + state = type(state) == 'table' and state or {} + return { + seen = state.seen == true, + updated_at = state.updated_at, + fabric = copy(state.fabric), + } +end + +local function fabric_has_session(fabric) + return type(fabric) == 'table' + and type(fabric.peer_sid) == 'string' + and fabric.peer_sid ~= '' + and fabric.session_generation ~= nil + and fabric.session_generation ~= '' +end + +local function control_plane_status(facts) + local missing_facts = {} + local missing_origin = {} + local peer_sid, session_generation, link_id, link_generation + local mixed_sessions = false + local mixed_links = false + + for i = 1, #MCU_CRITICAL_FACTS do + local fact = MCU_CRITICAL_FACTS[i] + local rec = facts[fact] + if not (type(rec) == 'table' and rec.seen == true) then + missing_facts[#missing_facts + 1] = fact + end + local fabric = type(rec) == 'table' and rec.fabric or nil + if not fabric_has_session(fabric) then + missing_origin[#missing_origin + 1] = fact + else + if peer_sid == nil then + peer_sid = fabric.peer_sid + session_generation = fabric.session_generation + elseif peer_sid ~= fabric.peer_sid or session_generation ~= fabric.session_generation then + mixed_sessions = true + end + if fabric.link_id ~= nil and fabric.link_id ~= '' then + if link_id == nil then link_id = fabric.link_id + elseif link_id ~= fabric.link_id then mixed_links = true end + end + if fabric.link_generation ~= nil and fabric.link_generation ~= '' then + if link_generation == nil then link_generation = fabric.link_generation + elseif link_generation ~= fabric.link_generation then mixed_links = true end + end + end + end + + return { + ready = #missing_facts == 0 and #missing_origin == 0 and not mixed_sessions and not mixed_links, + missing_facts = missing_facts, + missing_origin_facts = missing_origin, + mixed_fact_sessions = mixed_sessions, + mixed_fact_links = mixed_links, + peer_sid = peer_sid, + session_generation = session_generation, + link_id = link_id, + link_generation = link_generation, + } +end + +local function component_control_plane(rec) + if not (rec and (rec.subtype == 'mcu' or rec.member_class == 'mcu' or rec.class == 'mcu')) then + return nil + end + local facts = {} + for i = 1, #MCU_CRITICAL_FACTS do + local fact = MCU_CRITICAL_FACTS[i] + facts[fact] = fact_control_state(rec, fact) + end + local status = control_plane_status(facts) + return { + kind = 'mcu_control_plane', + ready = status.ready, + status = status, + facts = facts, + source = derive_source(rec), + } +end + function M.component_view(name, rec, now_ts) rec = rec or {} local base = compose_component(rec) @@ -141,6 +227,7 @@ function M.component_view(name, rec, now_ts) alerts = copy(base.alerts or {}), wired_provider = copy(base.wired_provider), source = derive_source(rec), + control_plane = component_control_plane(rec), last_action = copy(rec.last_action), } end diff --git a/src/services/device/schemas/mcu.lua b/src/services/device/schemas/mcu.lua index 18e142860..b424a2a10 100644 --- a/src/services/device/schemas/mcu.lua +++ b/src/services/device/schemas/mcu.lua @@ -83,6 +83,7 @@ function M.normalise_updater(raw) pending_image_id = raw.pending_image_id, staged_image_id = raw.staged_image_id, job_id = raw.job_id, + boot_buy_rc = raw.boot_buy_rc, } end diff --git a/src/services/device/service.lua b/src/services/device/service.lua index 3bed029a3..b38b71418 100644 --- a/src/services/device/service.lua +++ b/src/services/device/service.lua @@ -24,6 +24,7 @@ local cap_deps_mod = require 'devicecode.support.capability_dependencies' local dep_failure = require 'devicecode.support.dependency_failure' local backpressure = require 'services.device.backpressure' local dependency_mod = require 'services.device.dependencies' +local fabric_topics = require 'services.fabric.topics' local tablex = require 'shared.table' local M = {} @@ -37,6 +38,35 @@ local function new_service_id() return ('device-%d-%d'):format(os.time(), math.random(1, 1000000)) end +local function default_fabric_client(conn) + if type(conn) ~= 'table' or type(conn.call_op) ~= 'function' then return nil end + + return { + send_blob_op = function (_, params, opts) + params = params or {} + opts = opts or {} + local ev, err = conn:call_op(fabric_topics.transfer_manager_rpc('send-blob'), { + link_id = params.link_id, + request_id = params.request_id or params.job_id, + xfer_id = params.xfer_id, + target = params.target, + source_owner = params.source_owner, + size = params.size, + digest_alg = params.digest_alg, + digest = params.digest, + chunk_size = params.chunk_size, + meta = params.meta, + timeout_s = opts.timeout or params.timeout, + }, { timeout = opts.timeout or params.timeout }) + if not ev then return nil, err end + return ev:wrap(function (reply, err) + if reply == nil then return nil, err end + return reply.result or reply, nil + end) + end, + } +end + local function request_publication(state) if state.auto_publish == false then return end if state.publication_requested then return end @@ -837,7 +867,7 @@ local function build_state(scope, params) enable_observers = params.enable_observers, auto_publish = params.auto_publish, emit_events = params.emit_events, - fabric_client = params.fabric_client, + fabric_client = params.fabric_client or default_fabric_client(params.conn), open_source = params.open_source, open_source_op = params.open_source_op, terminate_source = params.terminate_source, @@ -974,5 +1004,6 @@ M.start_generation = start_generation M.cancel_active_generation = cancel_active_generation M.flush_publication = flush_publication M.cleanup_publication_now = cleanup_publication_now +M.default_fabric_client = default_fabric_client return M diff --git a/src/services/fabric.lua b/src/services/fabric.lua index 4fc8b6e11..970a3f50d 100644 --- a/src/services/fabric.lua +++ b/src/services/fabric.lua @@ -123,9 +123,14 @@ local function open_transport_for_link(scope, link_spec, service_caps) end if link_spec.transport ~= nil then + local transport_cfg = link_spec.transport + if type(transport_cfg) == 'table' and link_spec.trace_io ~= nil then + transport_cfg = shallow_copy(transport_cfg) + transport_cfg.trace_io = link_spec.trace_io == true + end local transport, err = hal_transport.open_transport( link_conn(link_spec, service_caps), - link_spec.transport + transport_cfg ) if transport == nil then return nil, transport_open_error(link_spec, err, 'transport_open_failed') diff --git a/src/services/fabric/bridge.lua b/src/services/fabric/bridge.lua index ceb06402d..7da4a563c 100644 --- a/src/services/fabric/bridge.lua +++ b/src/services/fabric/bridge.lua @@ -102,19 +102,45 @@ local function fail_request(req, reason) return false end +local function topic_string(topic) + if type(topic) ~= 'table' then return nil end + local out = {} + for i = 1, #topic do out[i] = tostring(topic[i]) end + return table.concat(out, '/') +end + +local function imported_retained_topic_strings(self) + local out = {} + for _, rec in pairs(self._imported_retained or {}) do + local s = rec and topic_string(rec.topic) + if s ~= nil then out[#out + 1] = s end + end + table.sort(out) + return out +end + local function initial_snapshot(link_id, link_generation) return { link_id = link_id, link_generation = link_generation, state = 'starting', imported_topics = 0, + imported_retained_count = 0, + imported_retained_topics = {}, pending_calls = 0, inbound_calls = 0, frames_sent = 0, frames_received = 0, last_err = nil, session = nil, + session_generation = nil, + peer_sid = nil, session_drop_reason = nil, + last_imported_topic = nil, + last_imported_at = nil, + last_clear_reason = nil, + last_clear_count = 0, + last_outbound_call = nil, } end @@ -144,6 +170,8 @@ local function update_model(self, patch) end s.imported_topics = count_keys(self._imported_retained) + s.imported_retained_count = s.imported_topics + s.imported_retained_topics = imported_retained_topic_strings(self) s.pending_calls = count_keys(self._pending_calls) s.inbound_calls = count_keys(self._inbound_calls) end) @@ -168,6 +196,98 @@ local function has_current_session(self) and self._session.peer_sid ~= '' end +local function scalar_payload_field(payload, key) + if type(payload) ~= 'table' then return nil end + local v = payload[key] + local tv = type(v) + if tv == 'string' or tv == 'number' or tv == 'boolean' then + return tostring(v) + end + return nil +end + +local function prepare_update_topic(topic) + return type(topic) == 'table' + and topic[#topic] == 'prepare-update' + and topic[#topic - 1] == 'rpc' +end + +local function trace_outbound_call_topic(local_topic, remote_topic) + return prepare_update_topic(local_topic) or prepare_update_topic(remote_topic) +end + +local function append_diag_print_field(parts, key, value) + if value == nil or value == '' then return end + parts[#parts + 1] = key + parts[#parts + 1] = tostring(value) +end + +local function print_outbound_call_diag(diag) + local parts = { '[fabric-rpc]', 'ev', tostring(diag.event or 'unknown') } + append_diag_print_field(parts, 'call_id', diag.call_id) + append_diag_print_field(parts, 'local_topic', diag.local_topic) + append_diag_print_field(parts, 'remote_topic', diag.remote_topic) + append_diag_print_field(parts, 'job_id', diag.job_id) + append_diag_print_field(parts, 'expected_image_id', diag.expected_image_id) + append_diag_print_field(parts, 'peer_sid', diag.peer_sid) + append_diag_print_field(parts, 'session_generation', diag.session_generation) + if diag.frame_admitted ~= nil then + append_diag_print_field(parts, 'frame_admitted', tostring(diag.frame_admitted)) + end + if diag.frame_sent ~= nil then + append_diag_print_field(parts, 'frame_sent', tostring(diag.frame_sent)) + end + if diag.reply_routed ~= nil then + append_diag_print_field(parts, 'reply_routed', tostring(diag.reply_routed)) + end + if diag.ok ~= nil then + append_diag_print_field(parts, 'ok', tostring(diag.ok)) + end + append_diag_print_field(parts, 'reason', diag.reason) + append_diag_print_field(parts, 'err', diag.err) + print(table.concat(parts, ' ')) +end + +local function pick_bool(primary, fallback) + if primary ~= nil then return not not primary end + if fallback ~= nil then return not not fallback end + return nil +end + +local function record_outbound_call_diag(self, rec, event, extra) + extra = extra or {} + local local_topic = extra.local_topic or (rec and rec.local_topic) + local remote_topic = extra.remote_topic or (rec and rec.remote_topic) + if not trace_outbound_call_topic(local_topic, remote_topic) then + return + end + + local payload = extra.payload or (rec and rec.payload) + local session = extra.session or (rec and rec.session) or current_session(self) + local diag = { + event = event, + at = fibers.now(), + link_id = self._link_id, + link_generation = self._link_generation, + call_id = extra.call_id or (rec and rec.id), + local_topic = topic_string(local_topic), + remote_topic = topic_string(remote_topic), + job_id = scalar_payload_field(payload, 'job_id'), + expected_image_id = scalar_payload_field(payload, 'expected_image_id'), + peer_sid = session and session.peer_sid or nil, + session_generation = session and session.session_generation or nil, + frame_admitted = pick_bool(extra.frame_admitted, rec and rec.frame_admitted), + frame_sent = pick_bool(extra.frame_sent), + reply_routed = pick_bool(extra.reply_routed, rec and rec.reply_routed), + ok = pick_bool(extra.ok), + reason = extra.reason, + err = extra.err, + } + + update_model(self, { last_outbound_call = diag }) + print_outbound_call_diag(diag) +end + -------------------------------------------------------------------------------- -- Frame output -------------------------------------------------------------------------------- @@ -393,10 +513,18 @@ local function run_outbound_call(call) ok = false, err = send_err or 'send_failed', frame_sent = false, + event = 'send_failed', } end call.mark_frame_admitted() + call.report_diag({ + kind = 'outbound_call_diag', + call_id = call.id, + event = 'sent', + frame_sent = true, + ok = true, + }) if call.reply_policy == 'sent-is-accepted' then local payload = { @@ -412,6 +540,7 @@ local function run_outbound_call(call) ok = true, frame_sent = true, sent_is_accepted = true, + event = 'sent_is_accepted', } end @@ -427,6 +556,8 @@ local function run_outbound_call(call) call_id = call.id, timed_out = true, frame_sent = true, + event = 'timeout', + err = 'timeout', } end @@ -441,6 +572,7 @@ local function run_outbound_call(call) closed = true, err = tostring(why), frame_sent = true, + event = 'closed', } end @@ -452,6 +584,7 @@ local function run_outbound_call(call) ok = true, payload = reply.payload, frame_sent = true, + event = 'reply_ok', } end @@ -463,6 +596,7 @@ local function run_outbound_call(call) ok = false, err = err, frame_sent = true, + event = 'reply_error', } end end @@ -474,12 +608,26 @@ local function start_outbound_call(self, ev) end if not has_current_session(self) then + record_outbound_call_diag(self, nil, 'not_admitted', { + local_topic = checked_topic, + payload = ev.payload, + reason = 'no_session', + frame_admitted = false, + frame_sent = false, + }) fail_request(ev.request or ev, 'no_session') return end local remote_topic, rule = map_local_call(self, checked_topic) if not remote_topic then + record_outbound_call_diag(self, nil, 'not_admitted', { + local_topic = checked_topic, + payload = ev.payload, + reason = 'no_route', + frame_admitted = false, + frame_sent = false, + }) fail_request(ev.request or ev, 'no_route') return end @@ -490,11 +638,29 @@ local function start_outbound_call(self, ev) end if self._pending_calls[id] ~= nil then + record_outbound_call_diag(self, nil, 'not_admitted', { + call_id = id, + local_topic = checked_topic, + remote_topic = remote_topic, + payload = ev.payload, + reason = 'duplicate_call_id', + frame_admitted = false, + frame_sent = false, + }) fail_request(ev.request or ev, 'duplicate_call_id') error('bridge duplicate outbound call id: ' .. id, 0) end if count_keys(self._pending_calls) >= self._max_pending_calls then + record_outbound_call_diag(self, nil, 'not_admitted', { + call_id = id, + local_topic = checked_topic, + remote_topic = remote_topic, + payload = ev.payload, + reason = 'too_many_pending_calls', + frame_admitted = false, + frame_sent = false, + }) fail_request(ev.request or ev, 'too_many_pending_calls') return end @@ -509,6 +675,9 @@ local function start_outbound_call(self, ev) reply_routed = false, frame_admitted = false, session = current_session(self), + local_topic = topics.copy(checked_topic), + remote_topic = topics.copy(remote_topic), + payload = ev.payload, } local call = { @@ -521,6 +690,13 @@ local function start_outbound_call(self, ev) caps = make_bridge_caps(self, rec.session), reply_tx = reply_tx, reply_rx = reply_rx, + report_diag = function (diag) + return queue.try_admit_required( + self._done_tx, + diag, + 'bridge_outbound_call_diag_report_failed' + ) + end, mark_frame_admitted = function () rec.frame_admitted = true @@ -530,6 +706,10 @@ local function start_outbound_call(self, ev) self._pending_calls[id] = rec update_model(self) + record_outbound_call_diag(self, rec, 'queued', { + frame_admitted = false, + frame_sent = false, + }) local handle, start_err = start_bridge_work( self, @@ -548,6 +728,11 @@ local function start_outbound_call(self, ev) self._pending_calls[id] = nil reply_tx:close('outbound_call_start_failed') update_model(self) + record_outbound_call_diag(self, rec, 'not_admitted', { + reason = start_err or 'outbound_call_start_failed', + frame_admitted = false, + frame_sent = false, + }) owner:fail_once(start_err or 'outbound_call_start_failed') error(start_err or 'outbound_call_start_failed', 0) @@ -648,9 +833,11 @@ end local cancel_pending_calls local cancel_inbound_calls -local function clear_imported_retained(self) +local function clear_imported_retained(self, reason) + local count = 0 for key, rec in pairs(self._imported_retained) do self._imported_retained[key] = nil + count = count + 1 if rec and rec.topic then local frame = protocol.unretain(rec.topic) @@ -660,6 +847,9 @@ local function clear_imported_retained(self) end end end + self._last_clear_reason = reason + self._last_clear_count = count + return count end local function clear_peer_session(self, reason, session, opts) @@ -679,7 +869,7 @@ local function clear_peer_session(self, reason, session, opts) return end - clear_imported_retained(self) + clear_imported_retained(self, reason or 'session_dropped') if opts.cancel_calls ~= false then cancel_pending_calls(self, reason or 'session_dropped') @@ -691,7 +881,11 @@ local function clear_peer_session(self, reason, session, opts) update_model(self, { session = nil, + session_generation = nil, + peer_sid = nil, session_drop_reason = self._session_drop_reason, + last_clear_reason = self._last_clear_reason, + last_clear_count = self._last_clear_count, }) end @@ -721,6 +915,8 @@ local function handle_peer_session(self, ev) update_model(self, { session = copy_context(self._session), + session_generation = self._session.session_generation, + peer_sid = self._session.peer_sid, session_drop_reason = nil, }) @@ -751,6 +947,8 @@ local function apply_remote_publish(self, frame, session) payload = frame.payload, session = copy_context(session), } + self._last_imported_topic = topic_string(local_topic) + self._last_imported_at = fibers.now() end local ok, err = bus_publish_import(self, local_topic, frame.payload, local_frame, session) @@ -758,7 +956,10 @@ local function apply_remote_publish(self, frame, session) error(err or 'bridge_bus_publish_import_failed', 0) end - update_model(self) + update_model(self, { + last_imported_topic = self._last_imported_topic, + last_imported_at = self._last_imported_at, + }) end local function apply_remote_unretain(self, frame, session) @@ -797,6 +998,12 @@ local function route_remote_reply(self, frame, session) if ok == true then rec.reply_routed = true + record_outbound_call_diag(self, rec, 'reply_routed', { + reply_routed = true, + frame_sent = true, + ok = frame.ok, + err = frame.err, + }) return end @@ -964,9 +1171,31 @@ local function handle_outbound_call_done(self, ev) record_frame_sent(self) end + local result = ev.result or {} + record_outbound_call_diag(self, rec, result.event or 'done', { + frame_admitted = rec.frame_admitted, + frame_sent = result.frame_sent, + reply_routed = rec.reply_routed, + ok = result.ok, + err = result.err, + reason = result.err, + }) update_model(self) end +local function handle_outbound_call_diag(self, ev) + local rec = self._pending_calls[ev.call_id] + if not rec then return end + record_outbound_call_diag(self, rec, ev.event or 'diag', { + frame_admitted = rec.frame_admitted, + frame_sent = ev.frame_sent, + reply_routed = rec.reply_routed, + ok = ev.ok, + err = ev.err, + reason = ev.reason, + }) +end + local function inbound_reply_frame(ev) local remote_id = ev.remote_call_id @@ -1015,6 +1244,9 @@ local function handle_done(self, ev) if ev.kind == 'outbound_call_done' then handle_outbound_call_done(self, ev) + elseif ev.kind == 'outbound_call_diag' then + handle_outbound_call_diag(self, ev) + elseif ev.kind == 'inbound_call_done' then handle_inbound_call_done(self, ev) @@ -1233,6 +1465,7 @@ local function handle_event(self, ev) handle_frame_event(self, ev) elseif ev.kind == 'outbound_call_done' + or ev.kind == 'outbound_call_diag' or ev.kind == 'inbound_call_done' then handle_done(self, ev) @@ -1310,8 +1543,18 @@ function M.run(scope, params) _done_rx = done_rx, _default_call_timeout = positive_number(params.call_timeout_s, DEFAULT_CALL_TIMEOUT, 'bridge.call_timeout_s', 2), - _max_pending_calls = resolve_nonneg_int(params.max_pending_calls, DEFAULT_MAX_PENDING_CALLS, 'bridge.max_pending_calls', 2), - _max_inbound_calls = resolve_nonneg_int(params.max_inbound_calls, DEFAULT_MAX_INBOUND_CALLS, 'bridge.max_inbound_calls', 2), + _max_pending_calls = resolve_nonneg_int( + params.max_pending_calls, + DEFAULT_MAX_PENDING_CALLS, + 'bridge.max_pending_calls', + 2 + ), + _max_inbound_calls = resolve_nonneg_int( + params.max_inbound_calls, + DEFAULT_MAX_INBOUND_CALLS, + 'bridge.max_inbound_calls', + 2 + ), _import_rules = params.import_rules or {}, _export_publish_rules = params.export_publish_rules or {}, @@ -1332,6 +1575,10 @@ function M.run(scope, params) _frames_sent = 0, _frames_received = 0, + _last_imported_topic = nil, + _last_imported_at = nil, + _last_clear_reason = nil, + _last_clear_count = 0, _event_pending = {}, _next_call_seq = 0, }, Bridge) diff --git a/src/services/fabric/config.lua b/src/services/fabric/config.lua index 3af43ce03..a2b2f0c22 100644 --- a/src/services/fabric/config.lua +++ b/src/services/fabric/config.lua @@ -22,9 +22,11 @@ local M = {} local SCHEMA = 'devicecode.config/fabric/1' local DEFAULTS = { + trace_io = false, reader = { bad_frame_limit = 5, bad_frame_window_s = 10.0, + bad_frame_quiet_s = 2.0, }, session = { hello_interval_s = 2.0, @@ -34,6 +36,7 @@ local DEFAULTS = { writer = { rpc_quota = 4, bulk_quota = 1, + flush_each = true, }, bridge = { max_pending_calls = 64, @@ -54,7 +57,7 @@ local DEFAULTS = { } local ROOT_KEYS = { - schema = true, local_node = true, links = true, + schema = true, local_node = true, links = true, trace_io = true, } local LINK_KEYS = { @@ -66,6 +69,7 @@ local LINK_KEYS = { local TRANSPORT_KEYS = { kind = true, source = true, class = true, id = true, open_verb = true, open_opts = true, terminator = true, + cap_wait_timeout_s = true, } local SESSION_KEYS = { @@ -74,10 +78,10 @@ local SESSION_KEYS = { } local READER_KEYS = { - bad_frame_limit = true, bad_frame_window_s = true, + bad_frame_limit = true, bad_frame_window_s = true, bad_frame_quiet_s = true, } -local WRITER_KEYS = { rpc_quota = true, bulk_quota = true } +local WRITER_KEYS = { rpc_quota = true, bulk_quota = true, flush_each = true } local BRIDGE_KEYS = { imports = true, exports = true, rpc = true, @@ -289,6 +293,8 @@ local function compile_transport(raw, link_id) if e4 then return nil, e4 end local open_verb, e5 = opt_str(raw.open_verb, 'transport.open_verb', 'open') if e5 then return nil, e5 end + local cap_wait_timeout_s, e6 = opt_pos_number(raw.cap_wait_timeout_s, 'transport.cap_wait_timeout_s') + if e6 then return nil, e6 end local terminator = raw.terminator if terminator == nil then terminator = '\n' end @@ -304,6 +310,7 @@ local function compile_transport(raw, link_id) open_verb = open_verb, open_opts = copy_plain(raw.open_opts), terminator = terminator, + cap_wait_timeout_s = cap_wait_timeout_s, }, nil end @@ -338,14 +345,29 @@ local function compile_reader(raw) local ok, err = allowed(raw, READER_KEYS, 'reader') if not ok then return nil, err end - local bad_frame_limit, e1 = pos_int(raw.bad_frame_limit, 'reader.bad_frame_limit', DEFAULTS.reader.bad_frame_limit) + local bad_frame_limit, e1 = pos_int( + raw.bad_frame_limit, + 'reader.bad_frame_limit', + DEFAULTS.reader.bad_frame_limit + ) if e1 then return nil, e1 end - local bad_frame_window_s, e2 = pos_number(raw.bad_frame_window_s, 'reader.bad_frame_window_s', DEFAULTS.reader.bad_frame_window_s) + local bad_frame_window_s, e2 = pos_number( + raw.bad_frame_window_s, + 'reader.bad_frame_window_s', + DEFAULTS.reader.bad_frame_window_s + ) if e2 then return nil, e2 end + local bad_frame_quiet_s, e3 = pos_number( + raw.bad_frame_quiet_s, + 'reader.bad_frame_quiet_s', + DEFAULTS.reader.bad_frame_quiet_s + ) + if e3 then return nil, e3 end return { bad_frame_limit = bad_frame_limit, bad_frame_window_s = bad_frame_window_s, + bad_frame_quiet_s = bad_frame_quiet_s, }, nil end @@ -359,8 +381,10 @@ local function compile_writer(raw) if e1 then return nil, e1 end local bulk_quota, e2 = pos_int(raw.bulk_quota, 'writer.bulk_quota', DEFAULTS.writer.bulk_quota) if e2 then return nil, e2 end + local flush_each, e3 = bool(raw.flush_each, 'writer.flush_each', DEFAULTS.writer.flush_each) + if e3 then return nil, e3 end - return { rpc_quota = rpc_quota, bulk_quota = bulk_quota }, nil + return { rpc_quota = rpc_quota, bulk_quota = bulk_quota, flush_each = flush_each }, nil end local function compile_rule_item(raw, direction, path, keys) @@ -601,6 +625,8 @@ function M.compile(raw) local local_node, nerr = opt_str(raw.local_node, 'fabric.local_node') if nerr then return fail(nerr) end + local trace_io, terr = bool(raw.trace_io, 'fabric.trace_io', DEFAULTS.trace_io) + if terr then return fail(terr) end local links_in, lerr = list_or_empty(raw.links, 'fabric.links') if not links_in then return fail(lerr) end @@ -609,6 +635,7 @@ function M.compile(raw) service = { schema = raw.schema, local_node = local_node, + trace_io = trace_io, }, links = {}, routing = { diff --git a/src/services/fabric/hal_transport.lua b/src/services/fabric/hal_transport.lua index e51968f59..b3defba83 100644 --- a/src/services/fabric/hal_transport.lua +++ b/src/services/fabric/hal_transport.lua @@ -8,14 +8,24 @@ local fibers = require 'fibers' local op = require 'fibers.op' +local sleep = require 'fibers.sleep' local protocol = require 'services.fabric.protocol' local resource = require 'devicecode.support.resource' local cap_sdk = require 'services.hal.sdk.cap' local dep_failure = require 'devicecode.support.dependency_failure' +local xxhash32 = require 'shared.hash.xxhash32' local M = {} +local DEFAULT_CAP_WAIT_TIMEOUT_S = 5.0 +local DEFAULT_DRAIN_MAX_BYTES = 64 * 1024 +local DEFAULT_DRAIN_TOTAL_S = 0.100 +local DEFAULT_DRAIN_QUIET_S = 0.020 +local DEFAULT_DRAIN_READ_S = 0.010 +local BAD_LINE_SNIP_BYTES = 80 +local RESYNC_PREFIX_SCAN_MAX_BYTES = 4096 + -------------------------------------------------------------------------------- -- JSONL transport wrapper -------------------------------------------------------------------------------- @@ -57,6 +67,138 @@ local function write_bytes_result(n, err) return true, nil end +local function log_wire(enabled, event, fields) + if enabled ~= true then return end + local parts = { '[fabric-wire]', tostring(event) } + for k, v in pairs(fields or {}) do + if v ~= nil then + parts[#parts + 1] = tostring(k) + parts[#parts + 1] = tostring(v) + end + end + print(table.concat(parts, ' ')) +end + +local function wire_fields(frame, line) + local out = { + type = type(frame) == 'table' and frame.type or type(frame), + id = type(frame) == 'table' and (frame.xfer_id or frame.sid) or nil, + offset = type(frame) == 'table' and frame.offset or nil, + next = type(frame) == 'table' and frame.next or nil, + size = type(frame) == 'table' and frame.size or nil, + line_len = type(line) == 'string' and #line or nil, + line_xxhash32 = type(line) == 'string' and xxhash32.digest_hex(line) or nil, + } + if type(frame) == 'table' and frame.type == 'xfer_chunk' and type(frame.data) == 'string' then + out.raw_len = #frame.data + out.encoded_len = #protocol.encode_chunk(frame.data) + out.chunk_digest = frame.chunk_digest + end + return out +end + +local function escape_bytes(s) + if type(s) ~= 'string' then return nil end + local out = {} + for i = 1, #s do + local b = s:byte(i) + if b == 9 then + out[#out + 1] = '\\t' + elseif b == 10 then + out[#out + 1] = '\\n' + elseif b == 13 then + out[#out + 1] = '\\r' + elseif b == 92 then + out[#out + 1] = '\\\\' + elseif b >= 32 and b <= 126 then + out[#out + 1] = string.char(b) + else + out[#out + 1] = string.format('\\x%02x', b) + end + end + return table.concat(out) +end + +local function bad_line_diag(line, err) + if type(line) ~= 'string' then + return err + end + local head = line:sub(1, BAD_LINE_SNIP_BYTES) + local tail = line + if #line > BAD_LINE_SNIP_BYTES then + tail = line:sub(#line - BAD_LINE_SNIP_BYTES + 1) + end + return { + err = tostring(err or 'decode_failed'), + last_decode_error = tostring(err or 'decode_failed'), + last_bad_line_len = #line, + last_bad_line_xxhash32 = xxhash32.digest_hex(line), + last_bad_line_head = escape_bytes(head), + last_bad_line_tail = escape_bytes(tail), + } +end + +local function blank_line(line) + return type(line) == 'string' and line:match('^%s*$') ~= nil +end + +local function non_printable_prefix(s) + if type(s) ~= 'string' or s == '' then + return false + end + for i = 1, #s do + local b = s:byte(i) + if b >= 32 and b <= 126 then + return false + end + end + return true +end + +local function resync_diag(line, prefix_len, frame) + return { + line_resync = true, + last_line_resync_prefix_len = prefix_len, + last_line_resync_line_len = #line, + last_line_resync_xxhash32 = xxhash32.digest_hex(line), + last_line_resync_type = type(frame) == 'table' and frame.type or nil, + last_line_resync_peer_sid = type(frame) == 'table' and frame.sid or nil, + last_line_resync_xfer_id = type(frame) == 'table' and frame.xfer_id or nil, + } +end + +local function handshake_frame(frame) + return type(frame) == 'table' + and (frame.type == 'hello' or frame.type == 'hello_ack') +end + +local function decode_line_with_resync(line) + local frame, err = protocol.decode_line(line) + if frame ~= nil then return frame, nil, nil end + + local scan = #line + if scan > RESYNC_PREFIX_SCAN_MAX_BYTES then + scan = RESYNC_PREFIX_SCAN_MAX_BYTES + end + local start = line:find('{', 1, true) + if start ~= nil and start > 1 then + if start > scan then + return nil, err + end + local prefix_len = start - 1 + local prefix = line:sub(1, prefix_len) + if non_printable_prefix(prefix) then + local resynced, rerr = protocol.decode_line(line:sub(start)) + if handshake_frame(resynced) then + return resynced, nil, resync_diag(line, prefix_len, resynced) + end + return nil, rerr or err + end + end + + return nil, err +end + --- Wrap a raw HAL line/stream session as a fabric-jsonl/1 frame transport. --- --- Accepted raw shapes: @@ -84,10 +226,15 @@ function M.wrap_transport(session, opts) return nil, 'invalid_transport_terminator' end + if type(session.set_trace_io) == 'function' then + session:set_trace_io(opts.trace_io == true) + end + return setmetatable({ _session = session, _mode = mode, _terminator = terminator, + _trace_io = opts.trace_io == true, _closed = false, }, Transport), nil end @@ -146,11 +293,32 @@ function Transport:write_line_op(line) end function Transport:read_frame_op() - return self:read_line_op():wrap(function (line, err) - if line == nil then - return nil, err + return fibers.run_scope_op(function () + while true do + local line, err = fibers.perform(self:read_line_op()) + if line == nil then + return nil, err + end + if blank_line(line) then + log_wire(self._trace_io, 'blank_line_ignored', { line_len = #line }) + else + local frame, derr, diag = decode_line_with_resync(line) + if frame == nil then + local bad_diag = bad_line_diag(line, derr) + log_wire(self._trace_io, 'decode_failed', bad_diag) + return nil, bad_diag + end + if diag ~= nil then + log_wire(self._trace_io, 'line_resynced', diag) + end + return frame, nil, diag + end end - return protocol.decode_line(line) + end):wrap(function (status, report, frame, err, diag) + if status ~= 'ok' then + return nil, err or report or 'read_failed' + end + return frame, err, diag end) end @@ -166,16 +334,31 @@ function Transport:write_frame_op(frame) local line, enc_err = protocol.encode_line(checked) if line == nil then + log_wire(self._trace_io, 'encode_failed', { + type = type(frame) == 'table' and frame.type or type(frame), + id = type(frame) == 'table' and (frame.xfer_id or frame.sid) or nil, + err = enc_err, + }) return op.always(nil, enc_err) end - return self:write_line_op(line) + log_wire(self._trace_io, 'write_line_begin', wire_fields(checked, line)) + return self:write_line_op(line):wrap(function (ok, err) + local fields = wire_fields(checked, line) + fields.err = err + if ok == true then + log_wire(self._trace_io, 'write_line_done', fields) + else + log_wire(self._trace_io, 'write_line_failed', fields) + end + return ok, err + end) end) end function Transport:flush_op() return op.guard(function () - local session, closed = self:_active_session_op() + local session = self:_active_session_op() if session == nil then return op.always(true, nil) end @@ -184,12 +367,112 @@ function Transport:flush_op() return op.always(true, nil) end + log_wire(self._trace_io, 'flush_begin', {}) return session:flush_op():wrap(function (ok, err) + if ok == true then + log_wire(self._trace_io, 'flush_done', {}) + else + log_wire(self._trace_io, 'flush_failed', { err = err or 'flush_failed' }) + end return normalise_bool(ok, err, 'flush_failed') end) end) end +local function drain_limits(opts) + opts = opts or {} + local max_bytes = tonumber(opts.max_bytes) or DEFAULT_DRAIN_MAX_BYTES + local total_s = tonumber(opts.total_s) or DEFAULT_DRAIN_TOTAL_S + local quiet_s = tonumber(opts.quiet_s) or DEFAULT_DRAIN_QUIET_S + local read_s = tonumber(opts.read_s) or DEFAULT_DRAIN_READ_S + local chunk_size = tonumber(opts.chunk_size) or 4096 + + if max_bytes < 0 then max_bytes = 0 end + if total_s < 0 then total_s = 0 end + if quiet_s < 0 then quiet_s = 0 end + if read_s <= 0 then read_s = DEFAULT_DRAIN_READ_S end + if chunk_size <= 0 then chunk_size = 4096 end + + return max_bytes, total_s, quiet_s, read_s, chunk_size +end + +function Transport:drain_input_op(opts) + return op.guard(function () + local session, closed = self:_active_session_op() + if session == nil then + return closed + end + if type(session.read_some_op) ~= 'function' then + return op.always({ + bytes = 0, + reads = 0, + reason = 'unsupported', + }, 'drain_unsupported') + end + + return fibers.run_scope_op(function () + local max_bytes, total_s, quiet_s, read_s, chunk_size = drain_limits(opts) + local deadline = fibers.now() + total_s + local quiet_until = fibers.now() + quiet_s + local bytes = 0 + local reads = 0 + local reason = 'quiet' + + while bytes < max_bytes do + local now = fibers.now() + local remaining_total = deadline - now + local remaining_quiet = quiet_until - now + if remaining_total <= 0 then + reason = 'deadline' + break + end + if remaining_quiet <= 0 then + reason = 'quiet' + break + end + + local want = max_bytes - bytes + if want > chunk_size then want = chunk_size end + local timeout_s = math.min(read_s, remaining_total, remaining_quiet) + local which, data, err = fibers.perform(fibers.named_choice { + read = session:read_some_op(want), + timeout = sleep.sleep_op(timeout_s), + }) + + if which ~= 'timeout' then + if data == nil then + reason = err or 'read_closed' + break + elseif data == '' then + reason = 'empty_read' + break + else + local n = #data + bytes = bytes + n + reads = reads + 1 + quiet_until = fibers.now() + quiet_s + if bytes >= max_bytes then + reason = 'max_bytes' + break + end + end + end + end + + return { + bytes = bytes, + reads = reads, + reason = reason, + }, nil + end):wrap(function (status, report, result, err) + if status ~= 'ok' then + return nil, err or report or 'drain_failed' + end + return result, err + end) + end) +end + function Transport:terminate(reason) if self._closed then return true, nil @@ -231,13 +514,39 @@ local function require_transport_cfg(cfg, level) for _, field in ipairs({ 'source', 'class', 'id' }) do if type(cfg[field]) ~= 'string' or cfg[field] == '' then - error('fabric.hal_transport.open_transport_op: transport.' .. field .. ' must be a non-empty string', (level or 1) + 1) + error( + 'fabric.hal_transport.open_transport_op: transport.' .. field .. ' must be a non-empty string', + (level or 1) + 1 + ) end end return cfg end +local function append_field(parts, k, v) + if v == nil then return end + parts[#parts + 1] = tostring(k) + parts[#parts + 1] = tostring(v) +end + +local function log_transport(event, cfg, fields) + if type(cfg) ~= 'table' or cfg.trace_io ~= true then return end + fields = fields or {} + local parts = { '[fabric-transport]', tostring(event) } + if type(cfg) == 'table' then + append_field(parts, 'source', cfg.source) + append_field(parts, 'class', cfg.class) + append_field(parts, 'id', cfg.id) + append_field(parts, 'dependency', cfg.dependency_key) + end + append_field(parts, 'mode', fields.mode) + append_field(parts, 'status', fields.status) + append_field(parts, 'err', fields.err) + append_field(parts, 'detail', fields.detail) + print(table.concat(parts, ' ')) +end + local function transport_open_error(cfg, err, detail) local e = { err = err or 'transport_open_failed', @@ -309,6 +618,18 @@ local function unwrap_open_transport_reply(transport_cfg, reply, err) return session, nil end +local function normalise_open_opts(transport_cfg) + local opts = transport_cfg.open_opts + if transport_cfg.class == 'uart' then + local checked, err = cap_sdk.args.new.UARTOpenOpts(opts) + if not checked then + return nil, err or 'invalid uart open opts' + end + return checked, nil + end + return opts, nil +end + function M.open_transport_op(conn, transport_cfg, transport_session) transport_cfg = require_transport_cfg(transport_cfg, 2) @@ -316,12 +637,15 @@ function M.open_transport_op(conn, transport_cfg, transport_session) if transport_session ~= nil then local transport, terr = M.wrap_transport(transport_session, transport_cfg) if not transport then + log_transport('wrap_failed', transport_cfg, { err = terr or 'transport_wrap_failed' }) return op.always(nil, terr or 'transport_wrap_failed') end + log_transport('wrap_ok', transport_cfg, { mode = transport._mode }) return op.always(transport, nil) end if conn == nil then + log_transport('open_failed', transport_cfg, { err = 'transport_open_requires_bus_connection' }) return op.always(nil, 'transport_open_requires_bus_connection') end @@ -331,28 +655,113 @@ function M.open_transport_op(conn, transport_cfg, transport_session) transport_cfg.class, transport_cfg.id ) + local open_opts, oerr = normalise_open_opts(transport_cfg) + if open_opts == nil and oerr ~= nil then + log_transport('open_failed', transport_cfg, { err = oerr }) + return op.always(nil, oerr) + end + log_transport('open_start', transport_cfg) return cap:call_control_op( transport_cfg.open_verb or 'open', - transport_cfg.open_opts + open_opts ):wrap(function (reply, err) local session, uerr = unwrap_open_transport_reply(transport_cfg, reply, err) if not session then + log_transport('open_failed', transport_cfg, { + err = reason_text(uerr), + detail = reason_text(err), + }) return nil, uerr end local transport, terr = M.wrap_transport(session, transport_cfg) if not transport then + log_transport('wrap_failed', transport_cfg, { err = terr or 'transport_wrap_failed' }) return nil, terr or 'transport_wrap_failed' end + log_transport('open_ok', transport_cfg, { mode = transport._mode }) return transport, nil end) end) end +local function wait_for_transport_cap(conn, transport_cfg) + local listener = cap_sdk.new_raw_host_cap_listener( + conn, + transport_cfg.source, + transport_cfg.class, + transport_cfg.id + ) + + local timeout_s = transport_cfg.cap_wait_timeout_s or DEFAULT_CAP_WAIT_TIMEOUT_S + local which, cap, err = fibers.perform(op.named_choice { + cap = listener:wait_for_cap_op(), + timeout = sleep.sleep_op(timeout_s), + }) + listener:close() + + if which == 'cap' then + if cap == nil then + return nil, err or 'transport_capability_unavailable' + end + return cap, nil + end + + return nil, ('transport_capability_timeout source=%s class=%s id=%s timeout_s=%s'):format( + tostring(transport_cfg.source), + tostring(transport_cfg.class), + tostring(transport_cfg.id), + tostring(timeout_s) + ) +end + function M.open_transport(conn, transport_cfg, transport_session) - return fibers.perform(M.open_transport_op(conn, transport_cfg, transport_session)) + transport_cfg = require_transport_cfg(transport_cfg, 2) + + if transport_session ~= nil then + return fibers.perform(M.open_transport_op(conn, transport_cfg, transport_session)) + end + + if conn == nil then + log_transport('open_failed', transport_cfg, { err = 'transport_open_requires_bus_connection' }) + return nil, 'transport_open_requires_bus_connection' + end + + log_transport('open_start', transport_cfg) + local cap, cap_err = wait_for_transport_cap(conn, transport_cfg) + if cap == nil then + log_transport('cap_wait_failed', transport_cfg, { err = cap_err }) + return nil, cap_err + end + + local open_opts, oerr = normalise_open_opts(transport_cfg) + if open_opts == nil and oerr ~= nil then + log_transport('open_failed', transport_cfg, { err = oerr }) + return nil, oerr + end + + local reply, err = fibers.perform(cap:call_control_op( + transport_cfg.open_verb or 'open', + open_opts + )) + local session, uerr = unwrap_open_transport_reply(transport_cfg, reply, err) + if not session then + log_transport('open_failed', transport_cfg, { + err = reason_text(uerr), + detail = reason_text(err), + }) + return nil, uerr + end + + local transport, terr = M.wrap_transport(session, transport_cfg) + if not transport then + log_transport('wrap_failed', transport_cfg, { err = terr or 'transport_wrap_failed' }) + return nil, terr or 'transport_wrap_failed' + end + log_transport('open_ok', transport_cfg, { mode = transport._mode }) + return transport, nil end M.unwrap_open_transport_reply = unwrap_open_transport_reply diff --git a/src/services/fabric/io.lua b/src/services/fabric/io.lua index 68d89cc8e..437d06130 100644 --- a/src/services/fabric/io.lua +++ b/src/services/fabric/io.lua @@ -11,7 +11,9 @@ -- this module. local fibers = require 'fibers' +local fiber_scope = require 'fibers.scope' local op = require 'fibers.op' +local sleep = require 'fibers.sleep' local protocol = require 'services.fabric.protocol' local queue = require 'devicecode.support.queue' local priority_event = require 'devicecode.support.priority_event' @@ -20,6 +22,14 @@ local validate = require 'shared.validate' local M = {} +local DEFAULT_BAD_FRAME_LIMIT = 5 +local DEFAULT_BAD_FRAME_WINDOW_S = 10.0 +local DEFAULT_BAD_FRAME_QUIET_S = 2.0 +local DRAIN_SLICE_MAX_S = 0.100 +local DRAIN_SLICE_QUIET_S = 0.020 +local DRAIN_SLICE_READ_S = 0.010 +local DRAIN_SLICE_MAX_BYTES = 64 * 1024 + -------------------------------------------------------------------------------- -- Checks and small helpers -------------------------------------------------------------------------------- @@ -78,10 +88,59 @@ local function positive_int(v, fallback, name) return v end +local function positive_number(v, fallback, name) + if v == nil then return fallback end + if type(v) ~= 'number' or v <= 0 or v ~= v or v == math.huge or v == -math.huge then + error(name .. ' must be a positive finite number', 3) + end + return v +end + local function clean_read_end(frame, err) return frame == nil and (err == nil or err == 'eof' or err == 'closed') end +local BAD_LINE_FIELDS = { + 'last_decode_error', + 'last_bad_line_len', + 'last_bad_line_xxhash32', + 'last_bad_line_head', + 'last_bad_line_tail', +} + +local RESYNC_FIELDS = { + 'line_resync', + 'last_line_resync_prefix_len', + 'last_line_resync_line_len', + 'last_line_resync_xxhash32', + 'last_line_resync_type', + 'last_line_resync_peer_sid', + 'last_line_resync_xfer_id', +} + +local function wire_error_string(err) + if type(err) == 'table' then + return tostring(err.err or err.reason or err.last_decode_error or 'wire_error') + end + return tostring(err or 'wire_error') +end + +local function copy_bad_line_fields(dst, src) + if type(dst) ~= 'table' or type(src) ~= 'table' then return dst end + for _, k in ipairs(BAD_LINE_FIELDS) do + if src[k] ~= nil then dst[k] = src[k] end + end + return dst +end + +local function copy_resync_fields(dst, src) + if type(dst) ~= 'table' or type(src) ~= 'table' then return dst end + for _, k in ipairs(RESYNC_FIELDS) do + if src[k] ~= nil then dst[k] = src[k] end + end + return dst +end + local function reader_result(frames_read, wire_errors, reason) return { role = 'reader', @@ -91,19 +150,38 @@ local function reader_result(frames_read, wire_errors, reason) } end -local function frame_event(frame) - return { +local function frame_event(frame, diag) + return copy_resync_fields({ kind = 'frame_received', frame = frame, - } + }, diag) end -local function wire_error_event(err) - return { +local function wire_error_event(err, wire_errors, bad_frame_count) + return copy_bad_line_fields({ kind = 'wire_error', - err = err, + err = wire_error_string(err), at = fibers.now(), - } + wire_errors = wire_errors, + bad_frame_count = bad_frame_count, + }, err) +end + +local function wire_recovery_event(reason, wire_errors, bad_frame_count, drain_result, drain_err, quiet_until, bad_line_diag) + drain_result = drain_result or {} + return copy_bad_line_fields({ + kind = 'wire_recovery', + reason = reason or 'bad_frame_limit', + err = reason or 'bad_frame_limit', + at = fibers.now(), + wire_errors = wire_errors, + bad_frame_count = bad_frame_count, + drained_bytes = drain_result.bytes or 0, + drain_attempts = drain_result.drain_attempts or 0, + drain_reason = drain_result.reason, + drain_err = drain_err, + quiet_until = quiet_until, + }, bad_line_diag) end local function send_item_frame(item) @@ -119,6 +197,203 @@ local function send_item_frame(item) return frame end +local function same_session(a, b) + return type(a) == 'table' + and type(b) == 'table' + and a.link_id == b.link_id + and a.link_generation == b.link_generation + and a.session_generation == b.session_generation + and a.peer_sid == b.peer_sid +end + +local function send_item_matches_session_gate(gate, item) + if type(gate) ~= 'table' then return true end + if type(item) ~= 'table' or item.session == nil then return true end + return same_session(gate.current_session, item.session) +end + +local function log_io(enabled, event, fields) + if enabled ~= true then return end + local parts = { '[fabric-io]', tostring(event) } + for k, v in pairs(fields or {}) do + if v ~= nil then + parts[#parts + 1] = tostring(k) + parts[#parts + 1] = tostring(v) + end + end + print(table.concat(parts, ' ')) +end + +local function frame_fields(frame) + if type(frame) ~= 'table' then + return { type = type(frame) } + end + local out = { + type = frame.type, + id = frame.xfer_id or frame.sid, + offset = frame.offset, + next = frame.next, + size = frame.size, + err = frame.err, + } + if frame.type == 'xfer_chunk' and type(frame.data) == 'string' then + out.raw_len = #frame.data + out.chunk_digest = frame.chunk_digest + end + return out +end + +local function record_bad_frame(times, at, window_s) + local kept = {} + local cutoff = at - window_s + for _, seen_at in ipairs(times or {}) do + if seen_at >= cutoff then + kept[#kept + 1] = seen_at + end + end + kept[#kept + 1] = at + return kept, #kept +end + +local function max_time(a, b) + a = tonumber(a) or 0 + b = tonumber(b) or 0 + if a > b then return a end + return b +end + +local function set_recovery_gate(gate, drain_active, quiet_until) + if type(gate) ~= 'table' then return end + gate.drain_active = drain_active == true + if quiet_until ~= nil then + gate.hello_quiet_until = max_time(gate.hello_quiet_until, quiet_until) + end +end + +local function perform_drain(drain_input_op, opts) + if type(drain_input_op) ~= 'function' then + return { bytes = 0, reads = 0, reason = 'drain_unsupported' }, 'drain_unsupported' + end + opts = opts or {} + local ok, result, err = pcall(function () + return fibers.perform(drain_input_op({ + max_bytes = opts.max_bytes or DRAIN_SLICE_MAX_BYTES, + total_s = opts.total_s or DRAIN_SLICE_MAX_S, + quiet_s = opts.quiet_s or DRAIN_SLICE_QUIET_S, + read_s = opts.read_s or DRAIN_SLICE_READ_S, + })) + end) + if not ok then + if fiber_scope.is_cancelled(result) then + error(result, 0) + end + return { bytes = 0, reads = 0, reason = 'failed' }, tostring(result or 'drain_failed') + end + if result == nil then + return { bytes = 0, reads = 0, reason = err or 'drain_failed' }, err or 'drain_failed' + end + return result, err +end + +local function recovery_fields(ev) + return copy_bad_line_fields({ + reason = ev.reason, + wire_errors = ev.wire_errors, + bad_frame_count = ev.bad_frame_count, + drained_bytes = ev.drained_bytes, + drain_attempts = ev.drain_attempts, + drain_err = ev.drain_err, + drain_reason = ev.drain_reason, + quiet_until = ev.quiet_until, + }, ev) +end + +local function should_continue_drain(reason, drain_err) + if drain_err ~= nil then return false end + return reason == 'deadline' or reason == 'max_bytes' +end + +local function drain_slice_opts(quiet_until) + local remaining = quiet_until - fibers.now() + if remaining <= 0 then return nil end + local total_s = remaining + if total_s > DRAIN_SLICE_MAX_S then + total_s = DRAIN_SLICE_MAX_S + end + return { + max_bytes = DRAIN_SLICE_MAX_BYTES, + total_s = total_s, + quiet_s = DRAIN_SLICE_QUIET_S, + read_s = DRAIN_SLICE_READ_S, + } +end + +local function run_recovery_window(scope, downstream_tx, recovery_gate, drain_input_op, params) + require_table(scope, 'fabric.io.run_recovery_window: scope', 2) + params = params or {} + + local quiet_until = fibers.now() + params.bad_frame_quiet_s + local aggregate = { + bytes = 0, + reads = 0, + reason = 'drain_not_started', + drain_attempts = 0, + } + local drain_err + + set_recovery_gate(recovery_gate, true, quiet_until) + + local ok, send_ok, send_err = pcall(function () + while true do + local slice_opts = drain_slice_opts(quiet_until) + if slice_opts == nil then + aggregate.reason = 'recovery_window_expired' + break + end + + local drain_result, err = perform_drain(drain_input_op, slice_opts) + drain_result = drain_result or {} + aggregate.drain_attempts = aggregate.drain_attempts + 1 + aggregate.bytes = aggregate.bytes + (tonumber(drain_result.bytes) or 0) + aggregate.reads = aggregate.reads + (tonumber(drain_result.reads) or 0) + aggregate.reason = drain_result.reason or err or 'drain_failed' + drain_err = err + + if not should_continue_drain(aggregate.reason, drain_err) then + break + end + + if fibers.now() >= quiet_until then + aggregate.reason = 'recovery_window_expired' + break + end + end + + local ev = wire_recovery_event( + params.reason or 'bad_frame_limit', + params.wire_errors, + params.bad_frame_count, + aggregate, + drain_err, + quiet_until, + params.bad_line_diag + ) + log_io(params.trace_io == true, 'reader_wire_recovery', recovery_fields(ev)) + return perform_send(downstream_tx, ev) + end) + + set_recovery_gate(recovery_gate, false, quiet_until) + + if not ok then + if fiber_scope.is_cancelled(send_ok) then + error(send_ok, 0) + end + error(send_ok, 0) + end + + return send_ok, send_err +end + -------------------------------------------------------------------------------- -- Reader owner -------------------------------------------------------------------------------- @@ -129,12 +404,34 @@ function M.run_reader(scope, params) local read_frame_op = require_function(params.read_frame_op, 'run_reader: read_frame_op', 2) local downstream_tx = require_tx(params.downstream_tx, 'run_reader: downstream_tx', 2) + local drain_input_op = params.drain_input_op + if drain_input_op ~= nil then + require_function(drain_input_op, 'run_reader: drain_input_op', 2) + end + local recovery_gate = params.recovery_gate + local trace_io = params.trace_io == true + local bad_frame_limit = positive_int( + params.bad_frame_limit, + DEFAULT_BAD_FRAME_LIMIT, + 'run_reader: bad_frame_limit' + ) + local bad_frame_window_s = positive_number( + params.bad_frame_window_s, + DEFAULT_BAD_FRAME_WINDOW_S, + 'run_reader: bad_frame_window_s' + ) + local bad_frame_quiet_s = positive_number( + params.bad_frame_quiet_s, + DEFAULT_BAD_FRAME_QUIET_S, + 'run_reader: bad_frame_quiet_s' + ) local frames_read = 0 local wire_errors = 0 + local bad_frame_times = {} while true do - local frame, read_err = fibers.perform(read_frame_op()) + local frame, read_err, read_diag = fibers.perform(read_frame_op()) if clean_read_end(frame, read_err) then return reader_result(frames_read, wire_errors, read_err or 'eof') @@ -142,28 +439,62 @@ function M.run_reader(scope, params) local ev local label + local sent_by_recovery = false if frame ~= nil then - ev = frame_event(frame) + ev = frame_event(frame, read_diag) label = 'frame' elseif protocol.is_wire_protocol_error and protocol.is_wire_protocol_error(read_err) then - ev = wire_error_event(read_err) - label = 'wire error' + wire_errors = wire_errors + 1 + local at = fibers.now() + local count + bad_frame_times, count = record_bad_frame(bad_frame_times, at, bad_frame_window_s) + if count >= bad_frame_limit then + local ok, send_err = run_recovery_window( + scope, + downstream_tx, + recovery_gate, + drain_input_op, + { + reason = 'bad_frame_limit', + wire_errors = wire_errors, + bad_frame_count = count, + bad_frame_quiet_s = bad_frame_quiet_s, + trace_io = trace_io, + bad_line_diag = read_err, + } + ) + sent_by_recovery = true + bad_frame_times = {} + if ok == nil then + return reader_result(frames_read, wire_errors, send_err or 'downstream_closed') + elseif ok ~= true then + error('reader downstream rejected wire event: ' .. tostring(send_err or 'full'), 0) + end + else + ev = wire_error_event(read_err, wire_errors, count) + end + label = 'wire event' else error('reader read failed: ' .. tostring(read_err or 'unknown'), 0) end - local ok, send_err = perform_send(downstream_tx, ev) + if sent_by_recovery then + ev = nil + end + + local ok, send_err = true, nil + if ev ~= nil then + ok, send_err = perform_send(downstream_tx, ev) + end if ok == true then if frame ~= nil then frames_read = frames_read + 1 - else - wire_errors = wire_errors + 1 end elseif ok == nil then @@ -294,11 +625,48 @@ local function next_writer_item_op(state) } end +local function frame_is_hello(frame) + return type(frame) == 'table' and frame.type == 'hello' +end + +local function wait_for_recovery_gate_op(gate, frame) + if type(gate) ~= 'table' then + return op.always(true, nil) + end + + return fibers.run_scope_op(function () + while true do + local now = fibers.now() + local quiet_until = tonumber(gate.hello_quiet_until) or 0 + local wait_s + + if gate.drain_active == true then + wait_s = 0.020 + elseif frame_is_hello(frame) and quiet_until > now then + wait_s = quiet_until - now + if wait_s > 0.020 then wait_s = 0.020 end + else + return true, nil + end + + fibers.perform(sleep.sleep_op(wait_s)) + end + end):wrap(function (status, report, ok, err) + if status ~= 'ok' then + return nil, err or report or 'recovery_gate_wait_failed' + end + return ok, err + end) +end + function M.run_lane_writer(scope, params) require_table(scope, 'fabric.io.run_lane_writer: scope', 2) params = require_table(params, 'fabric.io.run_lane_writer: params table', 2) local write_frame_op = require_function(params.write_frame_op, 'run_lane_writer: write_frame_op', 2) + local trace_io = params.trace_io == true + local recovery_gate = params.recovery_gate + local session_gate = params.session_gate local flush_op = params.flush_op if flush_op ~= nil then @@ -335,29 +703,63 @@ function M.run_lane_writer(scope, params) if selected ~= nil and selected.item ~= nil then local lane = selected.lane local frame = send_item_frame(selected.item) + if not send_item_matches_session_gate(session_gate, selected.item) then + local f = frame_fields(frame) + f.lane = lane + f.reason = (type(session_gate) == 'table' and session_gate.drop_reason) or 'stale_session' + log_io(trace_io, 'writer_stale_session_drop', f) + commit_turn(state, lane) + else + fibers.perform(wait_for_recovery_gate_op(recovery_gate, frame)) + + local f = frame_fields(frame) + f.lane = lane + log_io(trace_io, 'writer_tx_begin', f) + + local ok, err = perform_write(write_frame_op, frame) + if ok ~= true then + local fail_fields = frame_fields(frame) + fail_fields.lane = lane + fail_fields.err = err + log_io(trace_io, 'writer_tx_failed', fail_fields) + error('writer write failed: ' .. tostring(err), 0) + end - local ok, err = perform_write(write_frame_op, frame) - if ok ~= true then - error('writer write failed: ' .. tostring(err), 0) - end - - written = written + 1 - by_lane[lane] = (by_lane[lane] or 0) + 1 - commit_turn(state, lane) - - if flush_each then - local flushed, flush_err = perform_flush(flush_op) - if flushed ~= true then - error('writer flush failed: ' .. tostring(flush_err), 0) + written = written + 1 + by_lane[lane] = (by_lane[lane] or 0) + 1 + local ok_fields = frame_fields(frame) + ok_fields.lane = lane + ok_fields.count = written + log_io(trace_io, 'writer_tx_done', ok_fields) + commit_turn(state, lane) + + if flush_each then + local flush_fields = frame_fields(frame) + flush_fields.lane = lane + log_io(trace_io, 'writer_flush_begin', flush_fields) + local flushed, flush_err = perform_flush(flush_op) + if flushed ~= true then + local fail_fields = frame_fields(frame) + fail_fields.lane = lane + fail_fields.err = flush_err + log_io(trace_io, 'writer_flush_failed', fail_fields) + error('writer flush failed: ' .. tostring(flush_err), 0) + end + local done_fields = frame_fields(frame) + done_fields.lane = lane + log_io(trace_io, 'writer_flush_done', done_fields) end end end end + log_io(trace_io, 'writer_final_flush_begin', lane_counts(by_lane)) local flushed, flush_err = perform_flush(flush_op) if flushed ~= true then + log_io(trace_io, 'writer_final_flush_failed', { err = flush_err }) error('writer flush failed: ' .. tostring(flush_err), 0) end + log_io(trace_io, 'writer_final_flush_done', lane_counts(by_lane)) return { role = 'writer', diff --git a/src/services/fabric/link.lua b/src/services/fabric/link.lua index 6a4f9ce12..2a6a57808 100644 --- a/src/services/fabric/link.lua +++ b/src/services/fabric/link.lua @@ -601,6 +601,7 @@ local function transfer_params_from(params, admission_rx, session_rx, outbound, t.state_tx = state_tx t.component_name = 'transfer_manager' t.receive_targets = t.receive_targets or params.receive_targets + t.trace_io = params.trace_io == true return t end @@ -642,6 +643,12 @@ function M.composed_components(scope, params, service_caps) local read_frame_op = function () return transport:read_frame_op() end + local drain_input_op + if type(transport.drain_input_op) == 'function' then + drain_input_op = function (opts) + return transport:drain_input_op(opts) + end + end local write_frame_op = function (frame) return transport:write_frame_op(frame) end @@ -682,10 +689,16 @@ function M.composed_components(scope, params, service_caps) { full = 'reject_newest' } ) + local session_gate = { + current_session = nil, + drop_reason = 'no_session', + } + local outbound_gate = session_mod.new_outbound_gate { tx_control = outbound_control_tx, tx_rpc = outbound_rpc_tx, tx_bulk = outbound_bulk_tx, + session_gate = session_gate, } local local_rx = params.local_rx @@ -698,6 +711,11 @@ function M.composed_components(scope, params, service_caps) local session_cfg = params.session or {} local reader_cfg = params.reader or {} local writer_cfg = params.writer or {} + local recovery_gate = { + drain_active = false, + hello_quiet_until = 0, + } + local trace_io = params.trace_io == true if type(session_cfg) ~= 'table' then error('fabric.link.run_composed: session must be a table', 2) end @@ -719,7 +737,13 @@ function M.composed_components(scope, params, service_caps) return io_mod.run_reader(component_scope, { read_frame_op = read_frame_op, + drain_input_op = drain_input_op, downstream_tx = inbound_frame_tx, + recovery_gate = recovery_gate, + bad_frame_limit = reader_cfg.bad_frame_limit, + bad_frame_window_s = reader_cfg.bad_frame_window_s, + bad_frame_quiet_s = reader_cfg.bad_frame_quiet_s, + trace_io = trace_io, }) end, } @@ -748,8 +772,7 @@ function M.composed_components(scope, params, service_caps) hello_interval_s = session_cfg.hello_interval_s, ping_interval_s = session_cfg.ping_interval_s, liveness_timeout_s = session_cfg.liveness_timeout_s, - bad_frame_limit = reader_cfg.bad_frame_limit, - bad_frame_window_s = reader_cfg.bad_frame_window_s, + recovery_gate = recovery_gate, state_tx = state_tx, component_name = 'session', }) @@ -765,7 +788,10 @@ function M.composed_components(scope, params, service_caps) bulk_rx = outbound_bulk_rx, write_frame_op = write_frame_op, flush_op = flush_op, - flush_each = params.flush_each, + recovery_gate = recovery_gate, + session_gate = session_gate, + trace_io = trace_io, + flush_each = writer_cfg.flush_each ~= false, rpc_quota = writer_cfg.rpc_quota, bulk_quota = writer_cfg.bulk_quota, }) diff --git a/src/services/fabric/protocol.lua b/src/services/fabric/protocol.lua index 40feeb850..145bf2277 100644 --- a/src/services/fabric/protocol.lua +++ b/src/services/fabric/protocol.lua @@ -19,6 +19,9 @@ end function M.is_wire_protocol_error(err) if err == nil then return false end + if type(err) == 'table' then + err = err.err or err.reason or err.last_decode_error + end local s = tostring(err) diff --git a/src/services/fabric/service.lua b/src/services/fabric/service.lua index afe18b554..641856a67 100644 --- a/src/services/fabric/service.lua +++ b/src/services/fabric/service.lua @@ -44,6 +44,47 @@ local DEFAULT_DONE_QUEUE = 64 local shallow_copy = tablex.shallow_copy +local function reason_summary(value) + if type(value) ~= 'table' then + return tostring(value) + end + + local fields = {} + local keys = { + 'kind', + 'err', + 'code', + 'reason', + 'detail', + 'dependency_key', + 'link_id', + 'component', + 'class', + 'id', + 'verb', + 'status', + } + + for i = 1, #keys do + local key = keys[i] + local v = value[key] + if v ~= nil and type(v) ~= 'table' then + fields[#fields + 1] = key .. '=' .. tostring(v) + end + end + + if type(value.result) == 'table' then + fields[#fields + 1] = 'result={' .. reason_summary(value.result) .. '}' + elseif value.result ~= nil then + fields[#fields + 1] = 'result=' .. tostring(value.result) + end + + if #fields == 0 then + return tostring(value) + end + return table.concat(fields, ' ') +end + local function copy_link_entry(v) local out = shallow_copy(v) @@ -145,6 +186,11 @@ local function compiled_from_params(params) return c, nil end +local function env_trace_io_enabled() + local v = os.getenv('DEVICECODE_FABRIC_TRACE_IO') + return v == '1' or v == 'true' or v == 'TRUE' or v == 'yes' or v == 'YES' +end + local function link_override(params, id, index) local overrides = params.link_overrides if type(overrides) ~= 'table' then @@ -172,6 +218,12 @@ local function normalise_link_specs(params) end local list = (compiled and compiled.links) or params.links + local service_trace_io = compiled + and compiled.service + and compiled.service.trace_io + if env_trace_io_enabled() then + service_trace_io = true + end if type(list) ~= 'table' then error('fabric.service: links array required', 3) @@ -198,6 +250,9 @@ local function normalise_link_specs(params) local copy = shallow_copy(spec) copy.link_id = id copy.link_generation = copy.link_generation or i + if copy.trace_io == nil then + copy.trace_io = service_trace_io == true + end local override = link_override(params, id, i) if override ~= nil then @@ -345,7 +400,7 @@ local function default_policy(_, ev) action = 'fail', reason = ('link %s failed: %s'):format( tostring(ev.link_id), - tostring(ev.primary or 'failed') + reason_summary(ev.primary or 'failed') ), } end @@ -1040,7 +1095,7 @@ local function handle_generation_done(state, ev) return end - state.last_error = tostring(ev.primary or ev.status or 'generation_failed') + state.last_error = reason_summary(ev.primary or ev.status or 'generation_failed') publish_service_lifecycle(state, 'degraded', { reason = 'generation_failed', last_error = state.last_error, diff --git a/src/services/fabric/session.lua b/src/services/fabric/session.lua index 8587ff04f..501008b3e 100644 --- a/src/services/fabric/session.lua +++ b/src/services/fabric/session.lua @@ -8,7 +8,6 @@ local priority_event = require 'devicecode.support.priority_event' local model_mod = require 'services.fabric.model' local protocol = require 'services.fabric.protocol' local contracts = require 'devicecode.support.contracts' -local validate = require 'shared.validate' local M = {} @@ -21,8 +20,6 @@ OutboundGate.__index = OutboundGate local DEFAULT_HELLO_INTERVAL = 1.0 local DEFAULT_PING_INTERVAL = 5.0 local DEFAULT_LIVENESS_TIMEOUT = 15.0 -local DEFAULT_BAD_FRAME_LIMIT = 5 -local DEFAULT_BAD_FRAME_WINDOW_S = 10.0 local function require_rx(v, name, level) return contracts.require_rx(v, name, (level or 1) + 1) @@ -40,11 +37,6 @@ local function positive_number(v, fallback, name) return v end -local function positive_integer(v, fallback, name) - if v == nil then return fallback end - return validate.positive_integer(v, 'fabric.session: ' .. name, 2) -end - function M.new_session_context(args) if type(args) ~= 'table' then error('fabric.session.new_session_context: args table required', 2) @@ -135,12 +127,20 @@ end function OutboundGate:bind(ctx) self._session = M.copy_context(ctx) self._drop_reason = nil + if type(self._session_gate) == 'table' then + self._session_gate.current_session = M.copy_context(ctx) + self._session_gate.drop_reason = nil + end return true, nil end function OutboundGate:drop(reason) self._session = nil self._drop_reason = reason or 'no_session' + if type(self._session_gate) == 'table' then + self._session_gate.current_session = nil + self._session_gate.drop_reason = self._drop_reason + end return true, nil end @@ -153,6 +153,10 @@ function OutboundGate:terminate(reason) self._closed = true self._session = nil self._drop_reason = reason or 'session_outbound_closed' + if type(self._session_gate) == 'table' then + self._session_gate.current_session = nil + self._session_gate.drop_reason = self._drop_reason + end close_unique_txs(self._lane_txs, self._drop_reason) return true, nil end @@ -217,6 +221,7 @@ function M.new_outbound_gate(params) }, _session = nil, _drop_reason = 'no_session', + _session_gate = params.session_gate, _closed = false, }, OutboundGate) end @@ -281,6 +286,40 @@ local function session_event(self, kind, ctx, extra, at) return ev end +local BAD_LINE_FIELDS = { + 'last_decode_error', + 'last_bad_line_len', + 'last_bad_line_xxhash32', + 'last_bad_line_head', + 'last_bad_line_tail', +} + +local RESYNC_FIELDS = { + 'line_resync', + 'last_line_resync_prefix_len', + 'last_line_resync_line_len', + 'last_line_resync_xxhash32', + 'last_line_resync_type', + 'last_line_resync_peer_sid', + 'last_line_resync_xfer_id', +} + +local function copy_bad_line_fields(dst, src) + if type(dst) ~= 'table' or type(src) ~= 'table' then return dst end + for _, k in ipairs(BAD_LINE_FIELDS) do + if src[k] ~= nil then dst[k] = src[k] end + end + return dst +end + +local function copy_resync_fields(dst, src) + if type(dst) ~= 'table' or type(src) ~= 'table' then return dst end + for _, k in ipairs(RESYNC_FIELDS) do + if src[k] ~= nil then dst[k] = src[k] end + end + return dst +end + local function peer_session_event(self, ctx, at) return session_event(self, 'peer_session', ctx, nil, at) end @@ -327,6 +366,13 @@ local function same_peer(cur, frame) and frame.sid == cur.peer_sid end +local function is_unexpected_peer(self, frame) + local expected = self._expected_peer + if expected == nil or expected == '' then return false end + if frame.type ~= 'hello' and frame.type ~= 'hello_ack' then return false end + return frame.node ~= expected +end + local function establish_from_peer(self, frame, at) at = at or fibers.now() local cur = session_snapshot(self) @@ -436,23 +482,39 @@ end local function frame_event_from_item(item) if item == nil then return { kind = 'frame_closed' } end if type(item) == 'table' and item.kind == 'frame_received' then - return { + return copy_resync_fields({ kind = 'frame', frame = item.frame, at = item.at or fibers.now(), - } + }, item) end if type(item) == 'table' and item.kind == 'wire_error' then - return { + return copy_bad_line_fields({ kind = 'wire_error', err = item.err or 'wire_error', at = item.at or fibers.now(), - } + wire_errors = item.wire_errors, + bad_frame_count = item.bad_frame_count, + }, item) + end + + if type(item) == 'table' and item.kind == 'wire_recovery' then + return copy_bad_line_fields({ + kind = 'wire_recovery', + reason = item.reason or item.err or 'bad_frame_limit', + err = item.err or item.reason or 'bad_frame_limit', + at = item.at or fibers.now(), + wire_errors = item.wire_errors, + bad_frame_count = item.bad_frame_count, + drained_bytes = item.drained_bytes, + drain_err = item.drain_err, + quiet_until = item.quiet_until, + }, item) end return { kind = 'invalid_frame_item', - err = 'fabric.session frame_rx accepts only frame_received events', + err = 'fabric.session frame_rx accepts only frame_received, wire_error, or wire_recovery events', } end @@ -517,43 +579,48 @@ local function route_downstream(self, lane, frame, at) return true, nil end -local function record_bad_frame(self, at) - at = at or fibers.now() - local window_s = self._bad_frame_window_s - local cutoff = at - window_s - local kept = {} - - for _, seen_at in ipairs(self._bad_frame_times or {}) do - if type(seen_at) == 'number' and seen_at >= cutoff then - kept[#kept + 1] = seen_at - end - end - - kept[#kept + 1] = at - self._bad_frame_times = kept +local function handle_wire_error(self, ev) + local err = (type(ev) == 'table' and ev.err) or 'wire_error' - return #kept + update_session(self, function (s) + s.wire_errors = ev.wire_errors or ((s.wire_errors or 0) + 1) + s.bad_frame_count = ev.bad_frame_count or ((s.bad_frame_count or 0) + 1) + s.last_wire_error = tostring(err) + copy_bad_line_fields(s, ev) + end) end -local function handle_wire_error(self, ev) +local function handle_wire_recovery(self, ev) local at = (type(ev) == 'table' and ev.at) or fibers.now() - local err = (type(ev) == 'table' and ev.err) or 'wire_error' - local count = record_bad_frame(self, at) + local reason = (type(ev) == 'table' and ev.reason) or 'bad_frame_limit' + local quiet_until = tonumber(type(ev) == 'table' and ev.quiet_until) or at update_session(self, function (s) - s.wire_errors = (s.wire_errors or 0) + 1 - s.bad_frame_count = count - s.last_wire_error = tostring(err) + s.wire_errors = ev.wire_errors or ((s.wire_errors or 0) + 1) + s.bad_frame_count = ev.bad_frame_count or s.bad_frame_count or 0 + s.last_wire_error = tostring(reason) + s.last_drain_bytes = ev.drained_bytes + s.last_drain_err = ev.drain_err + copy_bad_line_fields(s, ev) end) - if count >= self._bad_frame_limit then - self._bad_frame_times = {} - reset_to_hello(self, 'bad_frame_limit', at) - end + reset_to_hello(self, reason, at) + self._next_hello_at = quiet_until +end + +local function handle_line_resync(self, ev) + if type(ev) ~= 'table' or ev.line_resync ~= true then return end + update_session(self, function (s) + s.line_resyncs = (s.line_resyncs or 0) + 1 + copy_resync_fields(s, ev) + end) end local function handle_session_frame(self, checked, at) local cur = session_snapshot(self) + if is_unexpected_peer(self, checked) then + return + end if (checked.type == 'hello' or checked.type == 'hello_ack') and not protocol.proto_supported(checked.proto) then @@ -596,6 +663,7 @@ end local function handle_frame(self, ev) local checked, err = protocol.validate_wire(ev.frame) if not checked then error('session invalid frame: ' .. tostring(err), 0) end + handle_line_resync(self, ev) local lane = protocol.dispatch_lane(checked) if lane == 'session_control' then handle_session_frame(self, checked, ev.at or fibers.now()) @@ -655,6 +723,19 @@ function M.run(scope, params) wire_errors = 0, bad_frame_count = 0, last_wire_error = nil, + last_decode_error = nil, + last_bad_line_len = nil, + last_bad_line_xxhash32 = nil, + last_bad_line_head = nil, + last_bad_line_tail = nil, + line_resyncs = 0, + line_resync = nil, + last_line_resync_prefix_len = nil, + last_line_resync_line_len = nil, + last_line_resync_xxhash32 = nil, + last_line_resync_type = nil, + last_line_resync_peer_sid = nil, + last_line_resync_xfer_id = nil, } local session_model = model_mod.new(initial, { @@ -679,6 +760,7 @@ function M.run(scope, params) _transfer_tx = transfer_tx, _session_model = session_model, _local_node = local_node, + _expected_peer = params.peer_id, _identity_claim = protocol.normalise_reserved_claim(params.identity_claim), _auth_claim = protocol.normalise_reserved_claim(params.auth_claim), _auth_state = 'unauthenticated', @@ -688,9 +770,6 @@ function M.run(scope, params) _hello_interval = positive_number(params.hello_interval_s, DEFAULT_HELLO_INTERVAL, 'hello_interval_s'), _ping_interval = positive_number(params.ping_interval_s, DEFAULT_PING_INTERVAL, 'ping_interval_s'), _liveness_timeout = positive_number(params.liveness_timeout_s, DEFAULT_LIVENESS_TIMEOUT, 'liveness_timeout_s'), - _bad_frame_limit = positive_integer(params.bad_frame_limit, DEFAULT_BAD_FRAME_LIMIT, 'bad_frame_limit'), - _bad_frame_window_s = positive_number(params.bad_frame_window_s, DEFAULT_BAD_FRAME_WINDOW_S, 'bad_frame_window_s'), - _bad_frame_times = {}, _next_hello_at = fibers.now(), _next_ping_at = math.huge, _last_peer_at = nil, @@ -718,6 +797,8 @@ function M.run(scope, params) handle_frame(self, ev) elseif ev.kind == 'wire_error' then handle_wire_error(self, ev) + elseif ev.kind == 'wire_recovery' then + handle_wire_recovery(self, ev) elseif ev.kind == 'invalid_frame_item' then error(ev.err or 'fabric.session invalid frame input', 0) elseif ev.kind == 'timer' then diff --git a/src/services/fabric/state.lua b/src/services/fabric/state.lua index 2bccf3208..a292bd4fd 100644 --- a/src/services/fabric/state.lua +++ b/src/services/fabric/state.lua @@ -145,7 +145,7 @@ local function transfer_payload_from_snapshot(link_id, link_generation, snapshot status = rec.status, target = rec.target or result.target, size = result.size or rec.size, - sent_bytes = result.sent_bytes, + sent_bytes = result.sent_bytes or rec.sent, received_bytes = result.received_bytes, digest_alg = result.digest_alg or rec.digest_alg, digest = result.digest or rec.digest, diff --git a/src/services/fabric/transfer.lua b/src/services/fabric/transfer.lua index baf0cf2a3..c3d5ccde9 100644 --- a/src/services/fabric/transfer.lua +++ b/src/services/fabric/transfer.lua @@ -77,6 +77,11 @@ local function copy_active(a) target = a.target, meta = copy(a.meta), size = a.size, + sent = a.sent, + chunk_size = a.chunk_size, + pending_offset = a.pending_offset, + pending_next = a.pending_next, + last_transfer_event = a.last_transfer_event, digest_alg = a.digest_alg, digest = a.digest, } @@ -110,6 +115,7 @@ local function snapshot_equal(a, b) if aa.status ~= ba.status then return false end if aa.direction ~= ba.direction then return false end if aa.xfer_id ~= ba.xfer_id then return false end + if aa.sent ~= ba.sent then return false end if not same_ctx(aa.session, ba.session) then return false end end @@ -162,6 +168,7 @@ local function active_matches(state, ev) return a ~= nil and a.request_id == ev.request_id and a.request_generation == ev.request_generation + and a.xfer_id == ev.xfer_id and same_ctx(a.session, ev_ctx(ev)) end @@ -192,6 +199,11 @@ function M.claim_slot(state, rec) target = rec.target, meta = copy(rec.meta), size = rec.size, + sent = rec.sent or 0, + chunk_size = rec.chunk_size, + pending_offset = rec.pending_offset, + pending_next = rec.pending_next, + last_transfer_event = rec.last_transfer_event, digest_alg = rec.digest_alg, digest = rec.digest, frame_tx = rec.frame_tx, @@ -252,6 +264,31 @@ function M.apply_attempt_done(state, ev) return true, nil, active, state.last end +function M.apply_progress(state, ev) + if not active_matches(state, ev) then + state.stats.stale = state.stats.stale + 1 + return false, 'stale_transfer_progress' + end + + local active = state.active + local sent = tonumber(ev.sent) + if sent == nil then return false, 'invalid_transfer_progress' end + if sent < 0 then sent = 0 end + if type(active.sent) == 'number' and sent < active.sent then + return false, 'regressing_transfer_progress' + end + + active.sent = sent + if ev.status ~= nil then active.status = ev.status end + if ev.size ~= nil then active.size = ev.size end + active.chunk_size = ev.chunk_size + active.pending_offset = ev.pending_offset + active.pending_next = ev.pending_next + active.last_transfer_event = ev.last_transfer_event + + return true, nil, active +end + local function manager_snapshot(self) return M.snapshot(self._state) end @@ -293,6 +330,7 @@ local function attempt_identity(req) request_id = req_id(req), request_generation = req_gen(req), session = ctx(req.session), + xfer_id = req.xfer_id, } end @@ -319,6 +357,11 @@ local function attempt_caps(self, frame_rx, session) chunk_size = self._chunk_size, timeout_s = self._timeout_s, retry_limit = self._retry_limit, + trace_io = self._trace_io == true, + + report_progress_now = function (ev) + return report(self, ev, 'transfer_progress_report_failed') + end, send_control_frame_now = function (frame, label) return outbound:send_transfer_control_frame_now(c, frame, label) @@ -351,6 +394,24 @@ local function run_attempt(scope, req, caps) local worker_req = copy(req) worker_req.source = source worker_req.source_owner = nil + worker_req.on_progress = function (progress) + if type(caps.report_progress_now) ~= 'function' then return true, nil end + progress = type(progress) == 'table' and progress or {} + return caps.report_progress_now({ + kind = 'transfer_progress', + request_id = req_id(req), + request_generation = req_gen(req), + session = ctx(req.session), + xfer_id = req.xfer_id, + sent = progress.sent, + size = progress.size, + status = progress.status or 'sending', + chunk_size = progress.chunk_size, + pending_offset = progress.pending_offset, + pending_next = progress.pending_next, + last_transfer_event = progress.last_transfer_event, + }) + end local result = transfer_sender.run(scope, worker_req, caps) if type(result) ~= 'table' then error('transfer attempt must return a result table', 0) end @@ -377,6 +438,7 @@ local function receive_attempt_identity(req) request_id = req_id(req), request_generation = req_gen(req), session = ctx(req.session), + xfer_id = req.xfer_id, } end @@ -568,20 +630,21 @@ function SlotLease:start_attempt(request_scope, req) outcome = function () return raw:outcome() end, identity = function () return raw:identity() end, - outcome_op = function () - return local_rx:recv_op():wrap(function (ev, recv_err) - if ev ~= nil then return ev end - - return { - kind = 'transfer_attempt_done', - request_id = attempt_req.request_id, - request_generation = attempt_req.request_generation, - session = ctx(attempt_req.session), - status = 'failed', - primary = recv_err or 'transfer attempt observer closed', - } - end) - end, + outcome_op = function () + return local_rx:recv_op():wrap(function (ev, recv_err) + if ev ~= nil then return ev end + + return { + kind = 'transfer_attempt_done', + request_id = attempt_req.request_id, + request_generation = attempt_req.request_generation, + session = ctx(attempt_req.session), + xfer_id = attempt_req.xfer_id, + status = 'failed', + primary = recv_err or 'transfer attempt observer closed', + } + end) + end, }, nil end @@ -701,6 +764,7 @@ local function active_done(self, reason, session) request_id = active.request_id, request_generation = active.request_generation, session = ctx(active.session), + xfer_id = active.xfer_id, status = 'cancelled', primary = reason or 'session_dropped', } @@ -724,6 +788,11 @@ local function handle_slot_released(self, ev) emit_model(self) end +local function handle_progress(self, ev) + local accepted = M.apply_progress(self._state, ev) + if accepted then emit_model(self) end +end + local function handle_frame(self, ev) self._state.stats.frames_received = self._state.stats.frames_received + 1 @@ -864,6 +933,9 @@ local function dispatch(self, ev) elseif ev.kind == 'transfer_attempt_done' then handle_attempt_done(self, ev) + elseif ev.kind == 'transfer_progress' then + handle_progress(self, ev) + elseif ev.kind == 'transfer_slot_released' then handle_slot_released(self, ev) @@ -962,6 +1034,7 @@ function M.run(scope, params) _chunk_size = params.chunk_size, _timeout_s = params.timeout_s, _retry_limit = params.retry_limit, + _trace_io = params.trace_io == true, _session = nil, _event_pending = {}, }, Manager) diff --git a/src/services/fabric/transfer_receive.lua b/src/services/fabric/transfer_receive.lua index 196ae3bd5..1427119fe 100644 --- a/src/services/fabric/transfer_receive.lua +++ b/src/services/fabric/transfer_receive.lua @@ -200,37 +200,50 @@ function M.run(scope, req, caps) if type(frame) ~= 'table' or frame.xfer_id ~= xfer_id then -- Manager normally filters these. + deadline = deadline elseif frame.type == 'xfer_abort' then fail(caps, xfer_id, sink, frame.err or 'remote_abort', false) elseif frame.type == 'xfer_chunk' then - if frame.offset ~= received then fail(caps, xfer_id, sink, 'unexpected_offset', true) end - local chunk = frame.data - if type(chunk) ~= 'string' then fail(caps, xfer_id, sink, 'invalid_chunk_data', true) end - if received + #chunk > size then fail(caps, xfer_id, sink, 'size_overrun', true) end - if not protocol.verify_chunk_digest(chunk, frame.chunk_digest) then - if retries_at_offset < retry_limit then - retries_at_offset = retries_at_offset + 1 - chunk_retries = chunk_retries + 1 - deadline = fibers.now() + timeout_s - send_control(caps, construct('xfer_need', protocol.xfer_need, xfer_id, received), 'transfer_receive_retry_need_send_failed') + if type(frame.offset) == 'number' and frame.offset < received then + local need = construct('xfer_need', protocol.xfer_need, xfer_id, received) + send_control(caps, need, 'transfer_receive_stale_need_send_failed') + deadline = fibers.now() + timeout_s + elseif type(frame.offset) == 'number' and frame.offset > received then + local need = construct('xfer_need', protocol.xfer_need, xfer_id, received) + send_control(caps, need, 'transfer_receive_future_need_send_failed') + elseif frame.offset ~= received then + fail(caps, xfer_id, sink, 'unexpected_offset', true) + else + local chunk = frame.data + if type(chunk) ~= 'string' then fail(caps, xfer_id, sink, 'invalid_chunk_data', true) end + if received + #chunk > size then fail(caps, xfer_id, sink, 'size_overrun', true) end + if not protocol.verify_chunk_digest(chunk, frame.chunk_digest) then + if retries_at_offset < retry_limit then + retries_at_offset = retries_at_offset + 1 + chunk_retries = chunk_retries + 1 + deadline = fibers.now() + timeout_s + local need = construct('xfer_need', protocol.xfer_need, xfer_id, received) + send_control(caps, need, 'transfer_receive_retry_need_send_failed') + else + fail(caps, xfer_id, sink, 'chunk_digest_mismatch', true) + end else - fail(caps, xfer_id, sink, 'chunk_digest_mismatch', true) + local ok, werr = append_chunk(sink, chunk) + if ok ~= true then fail(caps, xfer_id, sink, werr or 'write_failed', true) end + xxhash32.update(digest_state, chunk) + received = received + #chunk + retries_at_offset = 0 + deadline = fibers.now() + timeout_s + -- Acknowledge every accepted chunk, including the final one. The + -- sender waits for xfer_need next == size before sending xfer_commit. + -- This keeps commit ordered after the receiver has actually processed + -- the last bulk frame, even when the writer uses separate control and + -- bulk lanes. + local need = construct('xfer_need', protocol.xfer_need, xfer_id, received) + send_control(caps, need, 'transfer_receive_need_send_failed') end - else - local ok, werr = append_chunk(sink, chunk) - if ok ~= true then fail(caps, xfer_id, sink, werr or 'write_failed', true) end - xxhash32.update(digest_state, chunk) - received = received + #chunk - retries_at_offset = 0 - deadline = fibers.now() + timeout_s - -- Acknowledge every accepted chunk, including the final one. The - -- sender waits for xfer_need next == size before sending xfer_commit. - -- This keeps commit ordered after the receiver has actually processed - -- the last bulk frame, even when the writer uses separate control and - -- bulk lanes. - send_control(caps, construct('xfer_need', protocol.xfer_need, xfer_id, received), 'transfer_receive_need_send_failed') end elseif frame.type == 'xfer_commit' then diff --git a/src/services/fabric/transfer_sender.lua b/src/services/fabric/transfer_sender.lua index 449da5d2f..b927075d6 100644 --- a/src/services/fabric/transfer_sender.lua +++ b/src/services/fabric/transfer_sender.lua @@ -6,11 +6,49 @@ local fibers = require 'fibers' local sleep = require 'fibers.sleep' local protocol = require 'services.fabric.protocol' +local xxhash32 = require 'shared.hash.xxhash32' local M = {} local DEFAULT_TIMEOUT = 1.0 local DEFAULT_CHUNK_SIZE = protocol.DEFAULT_CHUNK_SIZE or 2048 +local DEFAULT_REPORT_BYTES = 8192 +local BACKPRESSURE_RETRY_S = 0.005 +local RESEND_MIN_INTERVAL_S = 0.25 +local BEGIN_RETRY_INTERVAL_S = 1.0 +local BEGIN_MAX_ATTEMPTS = 3 +local BEGIN_STARTUP_TIMEOUT_S = 5.0 +local COMMIT_RESEND_MIN_INTERVAL_S = RESEND_MIN_INTERVAL_S + +local REPORT_BYTES = DEFAULT_REPORT_BYTES + +local function log_xfer(trace_io, event, fields) + if trace_io ~= true then return end + local parts = { '[fabric-xfer-tx]', tostring(event) } + for k, v in pairs(fields or {}) do + if v ~= nil then + parts[#parts + 1] = tostring(k) + parts[#parts + 1] = tostring(v) + end + end + print(table.concat(parts, ' ')) +end + +local function encoded_chunk_len(frame) + if type(frame) ~= 'table' or frame.type ~= 'xfer_chunk' or type(frame.data) ~= 'string' then + return nil + end + local encoded = protocol.encode_chunk(frame.data) + return #encoded +end + +local function line_diag(frame) + local line, err = protocol.encode_line(frame) + if type(line) ~= 'string' then + return nil, nil, err + end + return #line, xxhash32.digest_hex(line), nil +end local function nonempty(v) return type(v) == 'string' and v ~= '' @@ -64,15 +102,40 @@ local function construct(label, fn, ...) return frame end -local function send(caps, lane, frame, label) +local function is_backpressure(err) + local s = tostring(err or '') + return s == 'full' + or s == 'would_block' + or s:match(': full$') ~= nil + or s:match(': would_block$') ~= nil +end + +local function fail_send(label, err) + local prefix = label or 'transfer_send_failed' + local s = tostring(err or 'unknown') + if s == prefix or s:sub(1, #prefix + 2) == prefix .. ': ' then + error(s, 0) + end + error(prefix .. ': ' .. s, 0) +end + +local function send(caps, lane, frame, label, deadline) local fn = lane == 'bulk' and caps.send_bulk_frame_now or caps.send_control_frame_now if type(fn) ~= 'function' then error('transfer_sender: missing session-bound sender for ' .. tostring(lane), 0) end - local ok, err = fn(frame, label) - if ok ~= true then error((label or 'transfer_send_failed') .. ': ' .. tostring(err), 0) end - return true + while true do + local ok, err = fn(frame, label) + if ok == true then return true end + if not is_backpressure(err) then + fail_send(label, err) + end + if deadline ~= nil and fibers.now() >= deadline then + fail_send(label, err) + end + fibers.perform(sleep.sleep_op(BACKPRESSURE_RETRY_S)) + end end local function try_abort(caps, xfer_id, reason) @@ -109,19 +172,30 @@ local function read_chunk(source, n) return chunk, nil end -local function send_commit(caps, xfer_id, size, alg, digest, timeout_s) - local frame = construct('xfer_commit', protocol.xfer_commit, xfer_id, size, alg, digest) - send(caps, 'control', frame, 'transfer_commit_send_failed') - return 'committing', fibers.now() + timeout_s +local function make_commit_frame(xfer_id, size, alg, digest) + return construct('xfer_commit', protocol.xfer_commit, xfer_id, size, alg, digest) +end + +local function send_commit(caps, frame, deadline, trace_io, event, fields) + fields = fields or {} + fields.id = frame.xfer_id + fields.size = frame.size + fields.digest = frame.digest + log_xfer(trace_io, event or 'commit_tx', fields) + send(caps, 'control', frame, 'transfer_commit_send_failed', deadline) + return fibers.now() end -local function make_next_chunk(caps, source, xfer_id, offset, size, chunk_size) +local function make_next_chunk(caps, source, xfer_id, offset, size, chunk_size, trace_io) local want = math.min(chunk_size, size - offset) + local read_start = fibers.now() local chunk, err = read_chunk(source, want) + local read_ms = math.floor((fibers.now() - read_start) * 1000 + 0.5) if err ~= nil then fail(caps, xfer_id, err, true) end if type(chunk) ~= 'string' or #chunk == 0 then fail(caps, xfer_id, 'short_source', true) end if offset + #chunk > size then fail(caps, xfer_id, 'source_overrun', true) end + local chunk_digest = protocol.chunk_digest(chunk) local frame = construct( 'xfer_chunk', @@ -129,18 +203,55 @@ local function make_next_chunk(caps, source, xfer_id, offset, size, chunk_size) xfer_id, offset, chunk, - protocol.chunk_digest(chunk) + chunk_digest ) + local encoded_len + local line_len + local line_hash + local line_err + if trace_io == true then + encoded_len = encoded_chunk_len(frame) + line_len, line_hash, line_err = line_diag(frame) + end + + log_xfer(trace_io, 'chunk_make', { + id = xfer_id, + offset = offset, + next = offset + #chunk, + raw_len = #chunk, + encoded_len = encoded_len, + chunk_digest = chunk_digest, + line_len = line_len, + line_xxhash32 = line_hash, + line_err = line_err, + read_ms = read_ms, + }) return { offset = offset, next = offset + #chunk, frame = frame, + raw_len = #chunk, + encoded_len = encoded_len, + chunk_digest = chunk_digest, + line_len = line_len, + line_xxhash32 = line_hash, } end -local function send_chunk(caps, pending) - send(caps, 'bulk', pending.frame, 'transfer_chunk_send_failed') +local function send_chunk(caps, pending, deadline, trace_io) + log_xfer(trace_io, 'chunk_send', { + id = pending.frame.xfer_id, + offset = pending.offset, + next = pending.next, + raw_len = pending.raw_len, + encoded_len = pending.encoded_len, + chunk_digest = pending.chunk_digest, + line_len = pending.line_len, + line_xxhash32 = pending.line_xxhash32, + }) + send(caps, 'bulk', pending.frame, 'transfer_chunk_send_failed', deadline) + pending.last_tx_at = fibers.now() return true end @@ -158,12 +269,30 @@ function M.run(scope, req, caps) local xfer_id, target, size, alg, digest = require_request(req) local timeout_s = positive(req.timeout_s or caps.timeout_s, DEFAULT_TIMEOUT, 'timeout_s') local chunk_size = positive(req.chunk_size or caps.chunk_size, DEFAULT_CHUNK_SIZE, 'chunk_size', true) + local begin_retry_interval_s = positive(req.begin_retry_interval_s or caps.begin_retry_interval_s, + BEGIN_RETRY_INTERVAL_S, 'begin_retry_interval_s') + local begin_max_attempts = positive(req.begin_max_attempts or caps.begin_max_attempts, + BEGIN_MAX_ATTEMPTS, 'begin_max_attempts', true) + local begin_startup_timeout_s = positive(req.begin_startup_timeout_s or caps.begin_startup_timeout_s, + BEGIN_STARTUP_TIMEOUT_S, 'begin_startup_timeout_s') + local trace_io = req.trace_io == true or caps.trace_io == true + + log_xfer(trace_io, 'start', { + id = xfer_id, + target = target, + size = size, + digest_alg = alg, + digest = digest, + chunk_size = chunk_size, + timeout_s = timeout_s, + begin_retry_interval_s = begin_retry_interval_s, + begin_max_attempts = begin_max_attempts, + begin_startup_timeout_s = begin_startup_timeout_s, + }) local begin = construct('xfer_begin', protocol.xfer_begin, xfer_id, target, size, alg, digest, req.meta) - send(caps, 'control', begin, 'transfer_begin_send_failed') - -- `sent` is the receiver-acknowledged offset. `pending` is the one -- outstanding chunk that may be resent if the receiver asks again for the -- same offset. This deliberately stays stop-and-wait; there is no seek, @@ -172,79 +301,394 @@ function M.run(scope, req, caps) local pending = nil local retransmits = 0 local state = 'waiting_ready' - local deadline = fibers.now() + timeout_s + local started_at = fibers.now() + local overall_deadline = started_at + timeout_s + local startup_deadline = math.min(overall_deadline, started_at + begin_startup_timeout_s) + local deadline = overall_deadline + local begin_attempts = 0 + local next_begin_retry_at = startup_deadline + local next_report_at = chunk_size + local last_need_next = nil + local last_transfer_event = nil + local commit_frame = nil + local last_commit_tx_at = 0 + local commit_resends = 0 + + local function report_progress(status, event) + if type(req.on_progress) ~= 'function' then return end + if event ~= nil then last_transfer_event = event end + local ok, err = req.on_progress({ + xfer_id = xfer_id, + sent = sent, + size = size, + status = status or state, + chunk_size = chunk_size, + pending_offset = pending and pending.offset or nil, + pending_next = pending and pending.next or nil, + last_transfer_event = last_transfer_event, + }) + if ok == false then error(err or 'transfer_progress_report_failed', 0) end + end - while true do - local which, item = fibers.perform(wait_frame_op(rx, deadline)) - if which == 'timeout' then fail(caps, xfer_id, 'timeout', true) end - if item == nil then error('transfer_sender_frame_feed_closed', 0) end + local function waiting_ready_timeout_reason() + return 'waiting_ready_timeout state=waiting_ready sent=' + .. tostring(sent) + .. ' size=' + .. tostring(size) + .. ' pending_offset=' + .. tostring(pending and pending.offset or nil) + .. ' pending_next=' + .. tostring(pending and pending.next or nil) + .. ' last_need_next=' + .. tostring(last_need_next) + .. ' begin_attempts=' + .. tostring(begin_attempts) + .. ' commit_resends=' + .. tostring(commit_resends) + end - local frame = item.frame or item + local function timeout_reason() + if state == 'waiting_ready' then return waiting_ready_timeout_reason() end + return 'timeout: state=' + .. tostring(state) + .. ' sent=' + .. tostring(sent) + .. ' size=' + .. tostring(size) + .. ' pending_offset=' + .. tostring(pending and pending.offset or nil) + .. ' pending_next=' + .. tostring(pending and pending.next or nil) + .. ' last_need_next=' + .. tostring(last_need_next) + .. ' begin_attempts=' + .. tostring(begin_attempts) + .. ' commit_resends=' + .. tostring(commit_resends) + end - if type(frame) ~= 'table' or frame.xfer_id ~= xfer_id then - -- Manager normally filters these. + local function send_begin(event) + begin_attempts = begin_attempts + 1 + local now = fibers.now() + local retry = begin_attempts > 1 + log_xfer(trace_io, event or (retry and 'begin_retry_tx' or 'begin_tx'), { + id = xfer_id, + target = target, + size = size, + digest = digest, + attempt = begin_attempts, + age_ms = retry and math.floor((now - started_at) * 1000 + 0.5) or nil, + }) + send(caps, 'control', begin, 'transfer_begin_send_failed', startup_deadline) + next_begin_retry_at = math.min(startup_deadline, fibers.now() + begin_retry_interval_s) + report_progress('waiting_ready', event or 'begin_tx') + end - elseif frame.type == 'xfer_abort' then - fail(caps, xfer_id, frame.err or 'remote_abort', false) + local function note_report_progress() + if REPORT_BYTES <= 0 then return end + if sent < size and sent < next_report_at then return end + report_progress('sending', 'chunk_ack') + while next_report_at <= sent do + next_report_at = next_report_at + REPORT_BYTES + end + end - elseif frame.type == 'xfer_ready' then + local function resend_pending(event, suppressed_event, requested_next) + local last_tx_at = pending.last_tx_at or 0 + if fibers.now() - last_tx_at >= RESEND_MIN_INTERVAL_S then + send_chunk(caps, pending, deadline, trace_io) + retransmits = retransmits + 1 + log_xfer(trace_io, event or 'chunk_resend', { + id = xfer_id, + offset = pending.offset, + next = pending.next, + requested_next = requested_next, + retransmits = retransmits, + }) + report_progress(state, event or 'chunk_resend') + else + log_xfer(trace_io, suppressed_event or 'chunk_resend_suppressed', { + id = xfer_id, + offset = pending.offset, + next = pending.next, + requested_next = requested_next, + age_ms = math.floor((fibers.now() - last_tx_at) * 1000 + 0.5), + }) + end + end + + local function enter_committing() + commit_frame = make_commit_frame(xfer_id, size, alg, digest) + last_commit_tx_at = send_commit(caps, commit_frame, deadline, trace_io, 'commit_tx') + state = 'committing' + deadline = fibers.now() + timeout_s + end + + local function resend_commit_if_due(requested_next) + if commit_frame == nil then + commit_frame = make_commit_frame(xfer_id, size, alg, digest) + end + + local now = fibers.now() + local age_s = now - (last_commit_tx_at or 0) + if age_s >= COMMIT_RESEND_MIN_INTERVAL_S then + last_commit_tx_at = send_commit(caps, commit_frame, deadline, trace_io, 'commit_resend_tx', { + requested_next = requested_next, + commit_resends = commit_resends + 1, + age_ms = math.floor(age_s * 1000 + 0.5), + }) + commit_resends = commit_resends + 1 + else + log_xfer(trace_io, 'commit_resend_suppressed', { + id = xfer_id, + requested_next = requested_next, + commit_resends = commit_resends, + age_ms = math.floor(age_s * 1000 + 0.5), + }) + end + end + + local function handle_committing_need(requested_next) + if sent ~= size then + log_xfer(trace_io, 'commit_need_invariant_failed', { + id = xfer_id, + next = requested_next, + sent = sent, + size = size, + }) + fail(caps, xfer_id, 'commit_need_invariant_failed', true) + end + + if requested_next == sent then + resend_commit_if_due(requested_next) + elseif type(requested_next) == 'number' and requested_next < sent then + log_xfer(trace_io, 'stale_need_while_committing', { + id = xfer_id, + next = requested_next, + sent = sent, + size = size, + }) + else + log_xfer(trace_io, 'future_need_while_committing', { + id = xfer_id, + next = requested_next, + sent = sent, + size = size, + }) + end + end + + send_begin('begin_tx') + + while true do + repeat + local wait_deadline = deadline if state == 'waiting_ready' then - state = 'sending' - deadline = fibers.now() + timeout_s + wait_deadline = math.min(startup_deadline, next_begin_retry_at) end - elseif frame.type == 'xfer_need' then - if state ~= 'sending' then fail(caps, xfer_id, 'unexpected_need', true) end - - if pending ~= nil then - if frame.next == pending.offset then - -- Receiver rejected or lost the last chunk before advancing. - -- Resend the cached frame without reading from the source again. - send_chunk(caps, pending) - retransmits = retransmits + 1 - deadline = fibers.now() + timeout_s - - elseif frame.next == pending.next then - sent = pending.next - pending = nil - if sent >= size then - state, deadline = send_commit(caps, xfer_id, size, alg, digest, timeout_s) - else - pending = make_next_chunk(caps, source, xfer_id, sent, size, chunk_size) - send_chunk(caps, pending) + local which, item = fibers.perform(wait_frame_op(rx, wait_deadline)) + if which == 'timeout' then + if state == 'waiting_ready' + and fibers.now() < startup_deadline + and begin_attempts < begin_max_attempts + then + send_begin('begin_retry_tx') + break + end + local reason = timeout_reason() + log_xfer(trace_io, 'timeout', { + id = xfer_id, + state = state, + sent = sent, + size = size, + begin_attempts = begin_attempts, + pending_offset = pending and pending.offset, + pending_next = pending and pending.next, + last_need_next = last_need_next, + retransmits = retransmits, + commit_resends = commit_resends, + }) + fail(caps, xfer_id, reason, true) + end + if item == nil then + local reason = type(rx.why) == 'function' and rx:why() or nil + log_xfer(trace_io, 'frame_feed_closed', { + id = xfer_id, + state = state, + sent = sent, + begin_attempts = begin_attempts, + reason = reason or 'closed', + }) + local err = 'transfer_sender_frame_feed_closed: ' .. tostring(reason or 'closed') + if state == 'waiting_ready' then + err = err .. ' state=waiting_ready sent=' + .. tostring(sent) + .. ' begin_attempts=' + .. tostring(begin_attempts) + end + error(err, 0) + end + + local frame = item.frame or item + + if type(frame) == 'table' and frame.xfer_id == xfer_id then + if frame.type == 'xfer_abort' then + log_xfer(trace_io, 'abort_rx', { + id = xfer_id, + err = frame.err or 'remote_abort', + state = state, + sent = sent, + }) + fail(caps, xfer_id, frame.err or 'remote_abort', false) + + elseif frame.type == 'xfer_ready' then + log_xfer(trace_io, 'ready_rx', { + id = xfer_id, + state = state, + sent = sent, + after_begin_attempts = begin_attempts, + }) + if state == 'waiting_ready' then + state = 'sending' deadline = fibers.now() + timeout_s end - else - fail(caps, xfer_id, 'unexpected_offset', true) - end + elseif frame.type == 'xfer_need' then + last_need_next = frame.next + log_xfer(trace_io, 'need_rx', { + id = xfer_id, + next = frame.next, + state = state, + sent = sent, + pending_offset = pending and pending.offset, + pending_next = pending and pending.next, + }) + + if state == 'waiting_ready' then + if frame.next ~= 0 then + fail(caps, xfer_id, 'unexpected_need', true) + end + log_xfer(trace_io, 'implicit_ready_from_need', { + id = xfer_id, + next = frame.next, + sent = sent, + after_begin_attempts = begin_attempts, + }) + state = 'sending' + deadline = fibers.now() + timeout_s + elseif state == 'committing' then + handle_committing_need(frame.next) + break + elseif state ~= 'sending' then + fail(caps, xfer_id, 'unexpected_need', true) + end - else - if frame.next ~= sent then fail(caps, xfer_id, 'unexpected_offset', true) end - if sent >= size then - state, deadline = send_commit(caps, xfer_id, size, alg, digest, timeout_s) - else - pending = make_next_chunk(caps, source, xfer_id, sent, size, chunk_size) - send_chunk(caps, pending) - deadline = fibers.now() + timeout_s + if pending ~= nil then + if frame.next == pending.offset then + -- Receiver rejected or lost the last chunk before advancing. + -- Resend the cached frame without reading from the source again, + -- but coalesce duplicate needs while the previous copy is still + -- likely queued or on a slow UART. + resend_pending('chunk_resend', 'chunk_resend_suppressed', frame.next) + deadline = fibers.now() + timeout_s + + elseif frame.next == pending.next then + sent = pending.next + log_xfer(trace_io, 'chunk_ack', { + id = xfer_id, + next = sent, + raw_len = pending.raw_len, + chunk_digest = pending.chunk_digest, + }) + note_report_progress() + pending = nil + if sent >= size then + enter_committing() + else + pending = make_next_chunk(caps, source, xfer_id, sent, size, chunk_size, trace_io) + send_chunk(caps, pending, deadline, trace_io) + report_progress('sending', 'chunk_tx') + deadline = fibers.now() + timeout_s + end + + elseif type(frame.next) == 'number' and frame.next < pending.offset then + -- UART links can deliver an older xfer_need after the sender has + -- already advanced. That is stale control traffic, not a transfer + -- contract violation. + log_xfer(trace_io, 'stale_need_ignored', { + id = xfer_id, + next = frame.next, + pending_offset = pending.offset, + pending_next = pending.next, + }) + deadline = fibers.now() + timeout_s + + else + log_xfer(trace_io, 'future_need', { + id = xfer_id, + next = frame.next, + pending_offset = pending.offset, + pending_next = pending.next, + }) + resend_pending('future_need_resend', 'future_need_resend_suppressed', frame.next) + end + + else + if type(frame.next) == 'number' and frame.next < sent then + -- Stale request for already-acknowledged data; ignore it. + log_xfer(trace_io, 'stale_need_ignored', { + id = xfer_id, + next = frame.next, + sent = sent, + }) + deadline = fibers.now() + timeout_s + else + if frame.next ~= sent then + log_xfer(trace_io, 'future_need', { + id = xfer_id, + next = frame.next, + sent = sent, + }) + end + if sent >= size then + enter_committing() + else + pending = make_next_chunk(caps, source, xfer_id, sent, size, chunk_size, trace_io) + send_chunk(caps, pending, deadline, trace_io) + report_progress('sending', 'chunk_tx') + if frame.next == sent then + deadline = fibers.now() + timeout_s + end + end + end + end + + elseif frame.type == 'xfer_done' and state == 'committing' then + log_xfer(trace_io, 'done_rx', { + id = xfer_id, + sent = sent, + size = size, + retransmits = retransmits, + commit_resends = commit_resends, + }) + return { + request_id = req.request_id, + job_id = type(req.meta) == 'table' and req.meta.job_id or nil, + component = type(req.meta) == 'table' and req.meta.component or nil, + image_id = type(req.meta) == 'table' and req.meta.image_id or nil, + target = target, + xfer_id = xfer_id, + digest_alg = alg, + digest = digest, + sent_bytes = sent, + size = size, + retransmits = retransmits, + commit_resends = commit_resends, + } end end - - elseif frame.type == 'xfer_done' and state == 'committing' then - return { - request_id = req.request_id, - job_id = type(req.meta) == 'table' and req.meta.job_id or nil, - component = type(req.meta) == 'table' and req.meta.component or nil, - image_id = type(req.meta) == 'table' and req.meta.image_id or nil, - target = target, - xfer_id = xfer_id, - digest_alg = alg, - digest = digest, - sent_bytes = sent, - size = size, - retransmits = retransmits, - } - end + until true end end diff --git a/src/services/hal/drivers/artifact_store.lua b/src/services/hal/drivers/artifact_store.lua index 0424186cc..6a5dafc51 100644 --- a/src/services/hal/drivers/artifact_store.lua +++ b/src/services/hal/drivers/artifact_store.lua @@ -253,7 +253,11 @@ end local function open_stream_now(path, mode) local stream, err = file.open(path, mode) if not stream then - return nil, tostring(err) + return nil, ('open_failed path=%s mode=%s err=%s'):format( + tostring(path), + tostring(mode), + tostring(err) + ) end return stream, nil end diff --git a/src/services/hal/drivers/control_store_provider.lua b/src/services/hal/drivers/control_store_provider.lua index ef53e3cf7..f2d53bb33 100644 --- a/src/services/hal/drivers/control_store_provider.lua +++ b/src/services/hal/drivers/control_store_provider.lua @@ -75,7 +75,11 @@ local function with_open_file_op(path, mode, body_fn) return fibers.run_scope_op(function (scope) local f, err = file.open(path, mode) if not f then - return false, tostring(err) + return false, ('open_failed path=%s mode=%s err=%s'):format( + tostring(path), + tostring(mode), + tostring(err) + ) end scope:finally(function (_, status, primary) @@ -167,7 +171,11 @@ function Provider:get_op(opts) return false, 'not found' end - return fibers.perform(read_file_required_op(path_for(self.root, opts.key))) + local ok_body, body_or_err = fibers.perform(read_file_required_op(path_for(self.root, opts.key))) + if not ok_body and is_not_found_err(body_or_err) then + return false, 'not found' + end + return ok_body, body_or_err end):wrap(function (st, rep, ok, value_or_err) if st ~= 'ok' then return false, tostring(value_or_err or rep) diff --git a/src/services/hal/drivers/uart.lua b/src/services/hal/drivers/uart.lua index 0b78c5bbb..b79bdfd9b 100644 --- a/src/services/hal/drivers/uart.lua +++ b/src/services/hal/drivers/uart.lua @@ -11,11 +11,9 @@ local hal_types = require 'services.hal.types.core' local cap_types = require 'services.hal.types.capabilities' local cap_args = require 'services.hal.types.capability_args' local resource = require 'devicecode.support.resource' +local xxhash32 = require 'shared.hash.xxhash32' local unpack = rawget(table, 'unpack') or _G.unpack -local pack = rawget(table, 'pack') or function (...) - return { n = select('#', ...), ... } -end local M = {} @@ -24,6 +22,7 @@ local DEFAULT_STOP_TIMEOUT = 5.0 ---@class UARTSession ---@field lease_id string +---@field path string ---@field stream Stream ---@field release_lease_now function|nil ---@field release_lease_op function|nil @@ -54,6 +53,22 @@ local function dlog(self, level, payload) end end +local function log_uart(self, event, fields, force) + if force ~= true and (not self or self.trace_io ~= true) then return end + local parts = { '[uart-session]', tostring(event) } + for k, v in pairs(fields or {}) do + if v ~= nil then + parts[#parts + 1] = tostring(k) + parts[#parts + 1] = tostring(v) + end + end + print(table.concat(parts, ' ')) +end + +function UARTSession:set_trace_io(trace_io) + self.trace_io = trace_io == true +end + local function finalise_shell_scope(self, shell_scope, status, primary) if self.scope ~= shell_scope then return @@ -153,9 +168,10 @@ local function session_release_lease_now(session, reason) return session.release_lease_now(session.lease_id, reason) end -local function new_session(lease_id, stream, release_lease_now, release_lease_op) +local function new_session(lease_id, path, stream, release_lease_now, release_lease_op) return setmetatable({ lease_id = lease_id, + path = path, stream = stream, release_lease_now = release_lease_now, release_lease_op = release_lease_op, @@ -206,7 +222,39 @@ function UARTSession:write_op(...) if self.closed then return op.always(nil, 'uart session closed') end - return self.stream:write_op(unpack(parts)) + local data = {} + local len = 0 + for i = 1, #parts do + local s = tostring(parts[i] or '') + data[#data + 1] = s + len = len + #s + end + local joined = table.concat(data) + log_uart(self, 'write_begin', { + lease = self.lease_id, + path = self.path, + len = len, + xxhash32 = xxhash32.digest_hex(joined), + }) + return self.stream:write_op(unpack(parts)):wrap(function (n, err) + if n == nil then + log_uart(self, 'write_failed', { + lease = self.lease_id, + path = self.path, + len = len, + err = err, + }, true) + else + log_uart(self, 'write_done', { + lease = self.lease_id, + path = self.path, + len = len, + n = n, + xxhash32 = xxhash32.digest_hex(joined), + }) + end + return n, err + end) end) end @@ -215,7 +263,25 @@ function UARTSession:flush_op() if self.closed then return op.always(nil, 'uart session closed') end - return self.stream:flush_op() + log_uart(self, 'flush_begin', { + lease = self.lease_id, + path = self.path, + }) + return self.stream:flush_op():wrap(function (ok, err) + if ok == nil or ok == false then + log_uart(self, 'flush_failed', { + lease = self.lease_id, + path = self.path, + err = err, + }, true) + else + log_uart(self, 'flush_done', { + lease = self.lease_id, + path = self.path, + }) + end + return ok, err + end) end) end @@ -257,7 +323,7 @@ end function UARTSession:terminate(reason) local why = reason or 'uart session terminated' - local first_err + local first_err = false self.closed = true @@ -312,13 +378,18 @@ local function open_session_op(self) local handed_off = false scope:finally(function (_, status, primary) if not handed_off then - resource.terminate_checked(stream, primary or status or 'uart open failed', 'uart open stream cleanup failed') + resource.terminate_checked( + stream, + primary or status or 'uart open failed', + 'uart open stream cleanup failed' + ) end end) local lease_id = uuid.new() local session = new_session( lease_id, + self.path, stream, function (active_lease_id, reason) return release_session_now(self, active_lease_id, reason) @@ -416,7 +487,11 @@ local function methods_for(self) scope:finally(function () if not handed_off and self.active_session == reply.session then - resource.terminate_checked(reply.session, 'uart open abandoned', 'UART open session cleanup failed') + resource.terminate_checked( + reply.session, + 'uart open abandoned', + 'UART open session cleanup failed' + ) self.active_session = nil self.active_lease_id = nil end diff --git a/src/services/hal/managers/uart.lua b/src/services/hal/managers/uart.lua index 82b9e4b5e..25a07e245 100644 --- a/src/services/hal/managers/uart.lua +++ b/src/services/hal/managers/uart.lua @@ -63,11 +63,22 @@ local function valid_mode(mode) or mode == '8O1' end -local function validate_config(entries) - if type(entries) ~= 'table' then +local function normalise_config(config) + if type(config) ~= 'table' then return false, 'config must be a list' end + if config.serial_ports ~= nil then + if type(config.serial_ports) ~= 'table' then + return false, 'config.serial_ports must be a list' + end + return config.serial_ports, nil + end + + return config, nil +end + +local function validate_config(entries) for _, entry in ipairs(entries) do if type(entry) ~= 'table' then return false, 'each uart entry must be a table' @@ -89,12 +100,17 @@ local function validate_config(entries) return true, nil end +local function raw_source_id_for_driver(driver) + return ('uart_%s'):format(tostring(driver.id)) +end + local function emit_device_added_op(driver, caps) return device_events.added_op(S.dev_ev_ch, 'uart', driver.id, { path = driver.path, baud = driver.default_baud, mode = driver.default_mode, source = 'uart_manager', + source_id = raw_source_id_for_driver(driver), }, caps) end @@ -282,7 +298,11 @@ end function M.apply_config_op(entries) return fibers.run_scope_op(function () - local ok, err = validate_config(entries) + local normalised, nerr = normalise_config(entries) + if not normalised then + return false, nerr + end + local ok, err = validate_config(normalised) if not ok then return false, err end @@ -299,7 +319,7 @@ function M.apply_config_op(entries) local reply_ch = channel.new(1) local admitted, admit_err = fibers.perform(cfg_ch:put_op({ generation = generation, - config = entries, + config = normalised, reply_ch = reply_ch, }):wrap(function () return true, nil diff --git a/src/services/monitor.lua b/src/services/monitor.lua index e0752137b..d5600ce21 100644 --- a/src/services/monitor.lua +++ b/src/services/monitor.lua @@ -158,6 +158,14 @@ local function format_canonical_line(msg) fmt_time(), tostring(svc), topic_to_string(msg.topic), payload_s) end +local function should_print_canonical(msg) + local kind, svc = classify_canonical(msg) + if kind == 'metric' and (svc == 'ui' or svc == 'http') then + return false + end + return true +end + -- Formats a warning line for traffic on the legacy obs plane that indicates -- a service is not publishing on the canonical plane. -- reason: 'legacy-only' — topic is a known dual-publish target but no canonical seen @@ -174,6 +182,14 @@ local function format_legacy_warn(msg, reason) fmt_time(), svc, tostring(reason), topic_to_string(t), payload_s) end +local function should_print_legacy_warn(msg, reason) + local t = msg.topic or {} + if t[2] == 'event' and t[3] == 'main' and t[4] == 'tick' then + return false + end + return true +end + function M.start(conn, ctx) ctx = ctx or {} local svc = base.new(conn, { name = ctx.name or 'monitor', env = ctx.env }) @@ -253,7 +269,9 @@ function M.start(conn, ctx) legacy_count[svc_name][can_kind] = nil end end - write_line(format_canonical_line(msg)) + if should_print_canonical(msg) then + write_line(format_canonical_line(msg)) + end elseif which == 'legacy' then if msg == nil then @@ -278,12 +296,14 @@ function M.start(conn, ctx) legacy_count[svc_name] = svc_counts local count = (svc_counts[kind] or 0) + 1 svc_counts[kind] = count - if count >= LEGACY_WARN_THRESHOLD then + if count >= LEGACY_WARN_THRESHOLD and should_print_legacy_warn(msg, 'legacy-only') then write_line(format_legacy_warn(msg, 'legacy-only')) end end else - write_line(format_legacy_warn(msg, 'unknown-endpoint')) + if should_print_legacy_warn(msg, 'unknown-endpoint') then + write_line(format_legacy_warn(msg, 'unknown-endpoint')) + end end end end diff --git a/src/services/ui/http/request.lua b/src/services/ui/http/request.lua index 024fbd65d..34cb5b946 100644 --- a/src/services/ui/http/request.lua +++ b/src/services/ui/http/request.lua @@ -67,6 +67,12 @@ local function perform_response(ev) return true end +local function encode_json(v) + local encoded, err = cjson.encode(v) + if encoded == nil then error(err or 'json_encode_failed', 0) end + return encoded +end + local function content_type_is_json(v) v = tostring(v or ''):lower() if v == '' then return false end @@ -135,6 +141,24 @@ local function principal_from(ctx, deps) return nil, nil end +local function shallow_copy(t) + local out = {} + for k, v in pairs(t or {}) do out[k] = v end + return out +end + +local function set_if_present(t, key, value) + if value ~= nil and value ~= '' then t[key] = value end +end + +local function positive_int_header(ctx, name) + local raw = ctx_header(ctx, name) + if raw == nil or raw == '' then return nil end + local n = tonumber(raw) + if n and n > 0 and n == math.floor(n) then return n end + return nil +end + local function handle_read(owner, route, deps) local model = assert(deps.model, 'HTTP read requires model') local snap = model:snapshot() @@ -145,6 +169,8 @@ local function handle_read(owner, route, deps) result = queries.services_snapshot(snap) elseif route.query == 'fabric' then result = queries.fabric_status(snap) + elseif route.query == 'fabric_link' then + result = queries.fabric_link_status(snap, route.link_id) elseif route.query == 'topic' then result = queries.topic(snap, route.topic) else @@ -168,7 +194,7 @@ local function handle_login(owner, ctx, deps) local sess = assert(deps.sessions, 'login requires sessions'):create(principal, { data = { user_agent = ctx_header(ctx, 'user-agent') }, }) - perform_response(owner:reply_json_op(200, { session = sess })) + perform_response(owner:reply_json_op(200, { session = sess, session_id = sess.id })) return { status = 'ok', session_id = sess.id } end @@ -227,9 +253,43 @@ local function handle_command(scope, owner, ctx, route, deps) return { status = 'ok' } end +local function handle_upload(scope, owner, ctx, deps) + local principal = principal_from(ctx, deps) + if principal == nil then + perform_response(owner:reply_error_op(401, 'unauthenticated')) + return { status = 'unauthenticated' } + end + + local upload_opts = shallow_copy(deps.update or deps) + upload_opts.principal = principal + + local component = ctx_header(ctx, 'x-artifact-component') + if component ~= nil and component ~= '' then + upload_opts.component = upload_opts.component or component + if upload_opts.create_job == nil then upload_opts.create_job = true end + if upload_opts.start_job == nil then upload_opts.start_job = true end + end + + local metadata = shallow_copy(upload_opts.metadata) + set_if_present(metadata, 'name', ctx_header(ctx, 'x-artifact-name')) + set_if_present(metadata, 'version', ctx_header(ctx, 'x-artifact-version')) + set_if_present(metadata, 'build', ctx_header(ctx, 'x-artifact-build')) + local image_id = ctx_header(ctx, 'x-artifact-image-id') + set_if_present(metadata, 'image_id', image_id) + set_if_present(metadata, 'expected_image_id', image_id) + set_if_present(metadata, 'compat_commit_image_id', ctx_header(ctx, 'x-artifact-compat-commit-image-id')) + set_if_present(metadata, 'transfer_chunk_raw', positive_int_header(ctx, 'x-transfer-chunk-raw')) + if metadata.format == nil and metadata.name and tostring(metadata.name):match('%.dcmcu$') then + metadata.format = 'dcmcu-v1' + end + if next(metadata) ~= nil then upload_opts.metadata = metadata end + + return upload.run(scope, owner, ctx, upload_opts) +end + function M.run(scope, ctx, deps) deps = deps or {} - local owner = response_mod.new(ctx, { encode = deps.encode_json }) + local owner = response_mod.new(ctx, { encode = deps.encode_json or encode_json }) scope:finally(function (_, status, primary) resource.terminate_checked(owner, primary or status or 'request_closed', 'HTTP response termination') @@ -249,7 +309,7 @@ function M.run(scope, ctx, deps) elseif route.kind == 'command' then return handle_command(scope, owner, ctx, route, deps) elseif route.kind == 'upload' then - return upload.run(scope, owner, ctx, deps.update or deps) + return handle_upload(scope, owner, ctx, deps) elseif route.kind == 'sse' then return sse.run(scope, owner, route, deps) elseif route.kind == 'static' then diff --git a/src/services/ui/http/routes.lua b/src/services/ui/http/routes.lua index 81d6eac6b..a159d5e8a 100644 --- a/src/services/ui/http/routes.lua +++ b/src/services/ui/http/routes.lua @@ -67,6 +67,10 @@ function M.decode(ctx) return { kind = 'read', query = 'services' } end + if parts[2] == 'fabric' and parts[3] == 'link' and parts[4] ~= nil and method == 'GET' then + return { kind = 'read', query = 'fabric_link', link_id = parts[4] } + end + if parts[2] == 'fabric' and method == 'GET' then return { kind = 'read', query = 'fabric' } end diff --git a/src/services/ui/queries.lua b/src/services/ui/queries.lua index 602218166..289834913 100644 --- a/src/services/ui/queries.lua +++ b/src/services/ui/queries.lua @@ -66,6 +66,51 @@ function M.fabric_status(snapshot) } end +local function payload_at(snapshot, topic) + local item = M.topic(snapshot, topic) + return item and item.payload or nil +end + +local function component_view(payload) + if type(payload) ~= 'table' then return nil end + local status = copy_value(payload.snapshot or payload.status or payload) + if payload.component == 'session' and type(status) == 'table' then + if status.ready == nil then status.ready = status.established == true end + if status.state == nil then + if status.ready then + status.state = 'ready' + else + status.state = status.phase + end + end + end + return { + link_id = payload.link_id, + link_generation = payload.link_generation, + component = payload.component, + state = payload.state, + status = status, + } +end + +function M.fabric_link_status(snapshot, link_id) + local root = { 'state', 'fabric', 'link', link_id } + local function component(name) + return component_view(payload_at(snapshot, { + 'state', 'fabric', 'link', link_id, 'component', name, + })) + end + + return { + version = snapshot and snapshot.version or 0, + link = component_view(payload_at(snapshot, root)), + session = component('session'), + bridge = component('rpc_bridge') or component('bridge'), + transfer = component('transfer'), + transfer_manager = component('transfer_manager') or component('transfer'), + } +end + function M.update_jobs_snapshot(snapshot) return { version = snapshot and snapshot.version or 0, diff --git a/src/services/ui/update/client.lua b/src/services/ui/update/client.lua index a21ba3fd9..5f54ee6d0 100644 --- a/src/services/ui/update/client.lua +++ b/src/services/ui/update/client.lua @@ -38,4 +38,13 @@ function M.start_job_op(conn, job_id, opts) return conn:call_op(method, { job_id = job_id }, call_opts(opts, 10.0)) end +function M.discard_job_op(conn, job_id, opts) + opts = opts or {} + local method = update_manager_rpc('discard-job') + return conn:call_op(method, { + job_id = job_id, + reason = opts.reason, + }, call_opts(opts, 10.0)) +end + return M diff --git a/src/services/ui/update/upload.lua b/src/services/ui/update/upload.lua index 222ad2531..fb5837e9f 100644 --- a/src/services/ui/update/upload.lua +++ b/src/services/ui/update/upload.lua @@ -90,6 +90,38 @@ local function perform_with_deadline(scope, ev, deadline, on_timeout) cancel_for_timeout(scope, on_timeout) end +local function cleanup_opts_for_discard(opts, reason) + local out = {} + for k, v in pairs(opts or {}) do out[k] = v end + out.deadline = nil + out.timeout = out.discard_timeout or out.cleanup_timeout or 5.0 + out.reason = reason + return out +end + +local function discard_created_job(conn, job_id, reason, opts) + if type(job_id) ~= 'string' or job_id == '' then + return nil, 'job_id_missing' + end + local ok, reply, err = pcall(function () + return fibers.perform(client.discard_job_op(conn, job_id, cleanup_opts_for_discard(opts, reason))) + end) + if not ok then + return nil, reply or 'discard_job_failed' + end + if reply == nil or reply == false then + return nil, err or 'discard_job_failed' + end + if type(reply) == 'table' and reply.ok == false then + return nil, reply.reason or reply.error or err or 'discard_job_failed' + end + return true, nil +end + +local function start_failure_is_definitive_non_start(err) + return err == 'slot_busy' +end + local function upload_body_op(ctx, opts, deadline) return fibers.run_scope_op(function (scope) local timed_out = false @@ -152,13 +184,25 @@ local function upload_body_op(ctx, opts, deadline) local job, jerr = perform_with_deadline(scope, client.create_job_op(conn, artifact_id, call_opts), deadline, mark_timeout) if not job then error(jerr or 'update job create failed', 0) end out.job = job - if opts.start_job then - call_opts.timeout = false - local started, serr = perform_with_deadline(scope, client.start_job_op(conn, job.job_id, call_opts), deadline, mark_timeout) - if not started then error(serr or 'update job start failed', 0) end - out.started = started + if opts.start_job then + call_opts.timeout = false + local started, serr = perform_with_deadline(scope, client.start_job_op(conn, job.job_id, call_opts), deadline, mark_timeout) + if not started then + local start_err = serr or 'update job start failed' + if start_failure_is_definitive_non_start(start_err) then + local cleanup_reason = 'upload_start_failed:' .. tostring(start_err) + -- The artifact was already committed and handed off. This route + -- only owns cleanup for a job that definitively did not start. + local discarded, derr = discard_created_job(conn, job.job_id, cleanup_reason, call_opts) + if not discarded then + error(start_err .. '; discard_job_failed:' .. tostring(derr or 'discard_job_failed'), 0) + end + end + error(start_err, 0) + end + out.started = started + end end - end return out end) end diff --git a/src/services/update/active_job.lua b/src/services/update/active_job.lua index c2f169a41..a2e8e12b9 100644 --- a/src/services/update/active_job.lua +++ b/src/services/update/active_job.lua @@ -13,6 +13,15 @@ local sleep = require 'fibers.sleep' local M = {} +local function copy(v) + if type(v) ~= 'table' then return v end + local out = {} + for k, value in pairs(v) do + out[k] = copy(value) + end + return out +end + local function backend_method(backend, name, required) if type(backend) ~= 'table' then error('active_job: backend required', 0) @@ -222,12 +231,26 @@ local function deadline_reached(deadline) return deadline ~= nil and fibers.now() >= deadline end -local function timeout_result(job, deadline) - return { +local function timeout_result(job, deadline, last_result) + local out = { tag = 'reconcile_timeout', job_id = job.job_id, deadline = deadline, } + if type(last_result) == 'table' then + out.last_reason = last_result.reason + out.reason = last_result.reason or 'timeout' + out.missing_facts = copy(last_result.missing_facts) + out.state = copy(last_result.state) + out.last_observed = copy(last_result.last_observed) + out.last_reconcile = copy(last_result) + if last_result.reason == 'waiting_for_mcu_critical_state' then + out.reason = 'mcu_critical_state_timeout' + end + else + out.reason = 'timeout' + end + return out end local function observer_closed_result(job, reason) @@ -282,6 +305,7 @@ function M.reconcile(_scope, params) local deadline = params.deadline local seen = observer and observer.version and observer:version() or 0 local ctx = base_ctx(params, 'reconcile') + local last_result while true do local snapshot = observer and observer.snapshot and observer:snapshot() or nil @@ -290,14 +314,18 @@ function M.reconcile(_scope, params) if result.done then return normalise_reconcile_done(job, result) end + last_result = copy(result) + if ctx.last_observed ~= nil and type(last_result) == 'table' then + last_result.last_observed = copy(ctx.last_observed) + end if deadline_reached(deadline) then - return timeout_result(job, deadline) + return timeout_result(job, deadline, last_result) end local status, a, b = wait_for_reconcile_progress(observer, seen, deadline, params.poll_s) if status == 'timeout' then - return timeout_result(job, deadline) + return timeout_result(job, deadline, last_result) end if status == 'observer_closed' then return observer_closed_result(job, a) diff --git a/src/services/update/active_policy.lua b/src/services/update/active_policy.lua index b0c7a69cb..a128de00d 100644 --- a/src/services/update/active_policy.lua +++ b/src/services/update/active_policy.lua @@ -62,7 +62,7 @@ function M.apply_completion(job, ev, seq) elseif result.tag == 'reconciled_success' then repo_mod.mark_terminal(job, 'succeeded', nil, result, { seq = seq, reason = 'reconcile_success' }) elseif result.tag == 'reconcile_timeout' then - repo_mod.mark_terminal(job, 'timed_out', 'timeout', result, { seq = seq, reason = 'reconcile_timeout' }) + repo_mod.mark_terminal(job, 'timed_out', result.reason or 'timeout', result, { seq = seq, reason = 'reconcile_timeout' }) elseif result.tag == 'reconcile_observer_closed' then repo_mod.mark_terminal(job, 'failed', result.reason or 'observer_closed', result, { seq = seq, reason = 'reconcile_observer_closed' }) elseif result.tag == 'reconciled_failure' then diff --git a/src/services/update/active_runtime.lua b/src/services/update/active_runtime.lua index 68642c451..1ebdb00cc 100644 --- a/src/services/update/active_runtime.lua +++ b/src/services/update/active_runtime.lua @@ -14,6 +14,7 @@ local scoped_work = require 'devicecode.support.scoped_work' local queue = require 'devicecode.support.queue' local model = require 'services.update.model' local active_job = require 'services.update.active_job' +local repo_mod = require 'services.update.job_repository' local M = {} @@ -409,6 +410,28 @@ local function active_intent_for(job) return job and (job.active_intent or job.active) or nil end +local function terminal_job(job) + return type(job) == 'table' and repo_mod.is_terminal(job.state) +end + +local function job_lookup(jobs, job_id) + if type(jobs) ~= 'table' or type(job_id) ~= 'string' or job_id == '' then + return nil + end + if type(jobs.get) == 'function' then + return jobs:get(job_id) + end + return jobs[job_id] +end + +local function cancel_handle(handle, reason) + if handle and type(handle.cancel) == 'function' then + handle:cancel(reason) + return true + end + return false +end + local function reconcile_token_for(job, generation) return table.concat({ tostring(generation or 0), @@ -430,6 +453,55 @@ function Component:claim(rec) return M.claim(self._state, rec) end +function M.cleanup_stale_active(state, jobs) + local active = state and state.active or nil + if active == nil then return false, 'idle' end + + local phase = active.phase or 'stage' + local job = job_lookup(jobs, active.job_id) + local missing = job == nil + local terminal = terminal_job(job) + + if phase == 'reconcile' then + if missing or terminal then + local reason + if missing then + reason = 'stale_reconcile_missing_job' + else + reason = 'stale_reconcile_terminal_job:' .. tostring(job.state) + end + cancel_handle(active.handle, reason) + state.active = nil + state.stats.released = state.stats.released + 1 + return true, reason, { + action = 'released', + job_id = active.job_id, + phase = phase, + token = active.token, + } + end + return false, 'active_reconcile_current' + end + + if (phase == 'stage' or phase == 'commit') and terminal then + local reason = 'stale_' .. tostring(phase) .. '_terminal_job:' .. tostring(job.state) + if active.cancel_requested ~= reason then + active.cancel_requested = reason + active.status = 'cancelling' + cancel_handle(active.handle, reason) + return true, reason, { + action = 'cancel_requested', + job_id = active.job_id, + phase = phase, + token = active.token, + } + end + return false, 'active_' .. tostring(phase) .. '_cancel_already_requested' + end + + return false, 'active_current' +end + function Component:_report_to_service(ev, label) local ok, err = queue.try_admit_required( self._service_done_tx, @@ -453,6 +525,18 @@ function Component:_report_changed(reason, extra) return self:_report_to_service(ev, 'update_active_runtime_changed_report_failed') end +function Component:_cleanup_stale_active() + local changed, reason, extra = M.cleanup_stale_active(self._state, self._jobs) + if changed ~= true then + return changed, reason + end + extra = extra or {} + extra.cleanup_reason = reason + local ok, err = self:_report_changed('stale_active_cleanup', extra) + if ok ~= true then return nil, err end + return true, reason +end + function Component:_start_apply(ev) if not (self._jobs and type(self._jobs.admit_transition) == 'function') then return nil, 'job_runtime_unavailable' @@ -520,6 +604,10 @@ end function Component:_start_reconcile(job) if not (job and job.job_id) then return nil, 'not_ready' end + if self._state.active ~= nil then + local cleaned, cerr = self:_cleanup_stale_active() + if cleaned == nil then return nil, cerr end + end if self._state.active ~= nil then return nil, 'slot_busy' end self._adoption_reconcile_started = self._adoption_reconcile_started or {} @@ -573,6 +661,10 @@ function Component:_launch_active_intent(job) if type(intent) ~= 'table' or intent.token == nil or intent.phase == nil then return false, 'no_active_intent' end + if self._state.active ~= nil then + local cleaned, cerr = self:_cleanup_stale_active() + if cleaned == nil then return nil, cerr end + end if self._state.active ~= nil then return nil, 'slot_busy' end self._active_launched = self._active_launched or {} if self._active_launched[intent.token] then @@ -612,6 +704,10 @@ end function Component:consider_jobs() if not self._jobs then return false, 'not_ready' end + if self._state.active ~= nil then + local cleaned, cerr = self:_cleanup_stale_active() + if cleaned == nil then return nil, cerr end + end if self._state.active ~= nil then return false, 'slot_busy' end for _, job in ipairs(self._jobs:list()) do @@ -635,6 +731,10 @@ end function Component:start_intent(intent, job, spec) intent = intent or {} spec = spec or {} + if self._state.active ~= nil then + local cleaned, cerr = self:_cleanup_stale_active() + if cleaned == nil then return nil, cerr end + end if self._state.active ~= nil then return nil, 'slot_busy' end diff --git a/src/services/update/backends/component.lua b/src/services/update/backends/component.lua index 99b2bd48f..17d6d3f60 100644 --- a/src/services/update/backends/component.lua +++ b/src/services/update/backends/component.lua @@ -23,6 +23,31 @@ local function metadata_of(job) return type(job) == 'table' and type(job.metadata) == 'table' and job.metadata or {} end +local function positive_int(v) + local n = tonumber(v) + if n and n > 0 and n == math.floor(n) then return n end + return nil +end + +local function transfer_chunk_size_after_prepare(self, job, prepared) + local meta = metadata_of(job) + local max_chunk_size = positive_int(type(prepared) == 'table' and prepared.max_chunk_size) + local forced = positive_int(meta.transfer_chunk_raw) + if forced then + if max_chunk_size and forced > max_chunk_size then + print('[update-mcu]', 'transfer_chunk_raw_override', + 'chunk_size=' .. tostring(forced), + 'max_chunk_size=' .. tostring(max_chunk_size)) + end + return forced + end + local selected = positive_int(meta.chunk_size) or positive_int(self._chunk_size) + if selected and max_chunk_size and selected > max_chunk_size then + return max_chunk_size + end + return selected or max_chunk_size +end + local function artifact_record(job) if type(job) ~= 'table' then return nil end return job.artifact or job.artifact_snapshot or job.artifact_meta @@ -64,7 +89,8 @@ local function call_component_op(self, component, method, payload, opts) if type(self._conn) ~= 'table' or type(self._conn.call_op) ~= 'function' then return op.always(nil, 'component_backend_connection_required') end - return self._conn:call_op(topics.component_rpc(component, method), payload, opts or self._call_opts):wrap(function (reply, err) + local target = topics.component_rpc(component, method) + return self._conn:call_op(target, payload, opts or self._call_opts):wrap(function (reply, err) if reply == false then return nil, err or 'component_call_failed' end return reply, err end) @@ -94,6 +120,55 @@ local function validate_stage_reply(reply) return true, nil end +local function commit_acceptance_payload(reply) + if type(reply) ~= 'table' then return nil end + if reply.accepted ~= nil then return reply end + if type(reply.value) == 'table' and reply.value.accepted ~= nil then return reply.value end + if type(reply.reply_payload) == 'table' and reply.reply_payload.accepted ~= nil then + return reply.reply_payload + end + return nil +end + +local function validate_commit_reply(reply) + if type(reply) ~= 'table' then + return nil, 'invalid_commit_reply' + end + if reply.ok == false then + return nil, reply.err or reply.error or reply.reason or 'component_commit_update_failed' + end + if reply.public_status ~= nil and reply.public_status ~= 'succeeded' then + return nil, reply.err or reply.error or reply.reason or reply.public_status + end + local accepted = commit_acceptance_payload(reply) + if type(accepted) ~= 'table' then + return nil, 'component_commit_acceptance_missing' + end + if accepted.accepted ~= true then + return nil, accepted.err or accepted.error or accepted.reason or 'component_commit_rejected' + end + return accepted, nil +end + +local function phase_error(prefix, err) + if err == nil or err == '' then return prefix end + return prefix .. ':' .. tostring(err) +end + +local function print_prepare_diag(event, fields) + local parts = { '[update-prepare]', 'ev', event } + for _, key in ipairs({ + 'component', 'job_id', 'expected_image_id', 'duration_ms', 'ok', 'err', + }) do + local v = fields and fields[key] + if v ~= nil and v ~= '' then + parts[#parts + 1] = key + parts[#parts + 1] = tostring(v) + end + end + print(table.concat(parts, ' ')) +end + local function describe_artifact(artifact) if type(artifact) == 'table' and type(artifact.describe) == 'function' then local ok, rec = pcall(function () return artifact:describe() end) @@ -102,17 +177,187 @@ local function describe_artifact(artifact) return type(artifact) == 'table' and artifact or nil end -function Backend:stage_op(job, ctx) +local function component_snapshot(snapshot, component) + if type(snapshot) ~= 'table' then return nil end + local by_id = snapshot.by_id or snapshot.components + local rec = type(by_id) == 'table' and by_id[component] or nil + if type(rec) == 'table' and type(rec.state) == 'table' then return rec.state end + if type(rec) == 'table' then return rec end + if snapshot.component == component then return snapshot.state or snapshot end + return nil +end + +local function component_record(snapshot, component) + if type(snapshot) ~= 'table' then return nil end + local by_id = snapshot.by_id or snapshot.components + local rec = type(by_id) == 'table' and by_id[component] or nil + if type(rec) == 'table' then return rec end + if snapshot.component == component then return snapshot end + return nil +end + +local function state_from_record(rec) + if type(rec) ~= 'table' then return nil end + if type(rec.state) == 'table' then return rec.state end + return rec +end + +local function latest_component_record(self, component) + local obs = self._observer + if obs and type(obs.snapshot) == 'function' then + return component_record(obs:snapshot(), component) + end + return nil +end + +local function latest_component_state(self, component) + return state_from_record(latest_component_record(self, component)) +end + +local function updater_state(state) + if type(state) ~= 'table' then return nil end + return state.update or state.updater +end + +local MCU_CRITICAL_FACTS = { 'software', 'updater', 'health' } + +local function fact_present(v) + if type(v) == 'table' then return next(v) ~= nil end + return v ~= nil +end + +local function missing_mcu_critical_facts(state) + local missing = {} + state = type(state) == 'table' and state or {} + if not fact_present(state.software) then + missing[#missing + 1] = 'software' + end + if not fact_present(updater_state(state)) then + missing[#missing + 1] = 'updater' + end + if not fact_present(state.health) then + missing[#missing + 1] = 'health' + end + return missing +end + +local function comma_join(items) + local out = {} + for i = 1, #(items or {}) do out[i] = tostring(items[i]) end + return table.concat(out, ',') +end + +local function require_component_boot_id(self, component) + local state = latest_component_state(self, component) + local sw = state and state.software or nil + if type(sw) ~= 'table' or sw.boot_id == nil or sw.boot_id == '' then + return nil, 'component_software_boot_id_unavailable' + end + return true, nil +end + +local function critical_fact_fabric(state, fact) + local cp = type(state) == 'table' and state.control_plane or nil + local facts = type(cp) == 'table' and cp.facts or nil + local rec = type(facts) == 'table' and facts[fact] or nil + if type(rec) ~= 'table' then return nil end + return type(rec.fabric) == 'table' and rec.fabric or nil +end + +local function has_session_identity(fabric) + if type(fabric) ~= 'table' then return false end + if type(fabric.peer_sid) ~= 'string' or fabric.peer_sid == '' then return false end + if fabric.session_generation == nil or fabric.session_generation == '' then return false end + return true +end + +local function require_matching_mcu_fact_sessions(state) + local missing = {} + local metas = {} + for _, fact in ipairs(MCU_CRITICAL_FACTS) do + local fabric = critical_fact_fabric(state, fact) + if not has_session_identity(fabric) then + missing[#missing + 1] = fact + else + metas[fact] = fabric + end + end + if #missing > 0 then + return nil, 'mcu_control_plane_not_ready:fact_origin_missing:' .. comma_join(missing) + end + + local peer_sid, session_generation + for _, fact in ipairs(MCU_CRITICAL_FACTS) do + local fabric = metas[fact] + if peer_sid == nil then + peer_sid = fabric.peer_sid + session_generation = fabric.session_generation + elseif peer_sid ~= fabric.peer_sid or session_generation ~= fabric.session_generation then + return nil, 'mcu_control_plane_not_ready:mixed_fact_sessions' + end + end + + local link_id, link_generation + for _, fact in ipairs(MCU_CRITICAL_FACTS) do + local fabric = metas[fact] + if fabric.link_id ~= nil and fabric.link_id ~= '' then + if link_id == nil then + link_id = fabric.link_id + elseif link_id ~= fabric.link_id then + return nil, 'mcu_control_plane_not_ready:mixed_fact_links' + end + end + if fabric.link_generation ~= nil and fabric.link_generation ~= '' then + if link_generation == nil then + link_generation = fabric.link_generation + elseif link_generation ~= fabric.link_generation then + return nil, 'mcu_control_plane_not_ready:mixed_fact_links' + end + end + end + + return true, nil +end + +local function require_update_admission(self, component) + if component ~= 'mcu' then + return require_component_boot_id(self, component) + end + local rec = latest_component_record(self, component) + local state = state_from_record(rec) + local missing = missing_mcu_critical_facts(state) + if #missing > 0 then + return nil, 'mcu_control_plane_not_ready:missing_critical_facts:' .. comma_join(missing) + end + local sw = state and state.software or nil + if type(sw) ~= 'table' or sw.boot_id == nil or sw.boot_id == '' then + return nil, 'mcu_control_plane_not_ready:software_boot_id_unavailable' + end + local actions = state and state.actions or nil + if type(actions) ~= 'table' or actions['prepare-update'] ~= true then + return nil, 'mcu_control_plane_not_ready:prepare_route_missing' + end + local cp = state and state.control_plane or nil + local source = state and (state.source or (type(cp) == 'table' and cp.source)) or nil + if type(source) == 'table' and source.reason ~= nil and source.reason ~= '' then + return nil, 'mcu_control_plane_not_ready:' .. tostring(source.reason) + end + local session_ok, session_err = require_matching_mcu_fact_sessions(state) + if session_ok ~= true then return nil, session_err end + return true, nil +end + +function Backend:stage_op(job, _ctx) return unwrap_scope_value(fibers.run_scope_op(function () local component = component_of(self, job) + local ready, ready_err = require_update_admission(self, component) + if ready ~= true then return nil, ready_err end + local ref = artifact_ref(job) if not ref then return nil, 'artifact_ref_required' end if not self._artifact_store or type(self._artifact_store.open_op) ~= 'function' then return nil, 'artifact_store_unavailable' end - if type(self._artifact_store.open_source_op) ~= 'function' then - return nil, 'artifact_source_unavailable' - end local artifact, aerr = fibers.perform(self._artifact_store:open_op(ref)) if artifact == nil then return nil, aerr or 'artifact_open_failed' end @@ -127,24 +372,37 @@ function Backend:stage_op(job, ctx) expected_image_id = image_id, metadata = metadata_of(job), } + local prepare_started = fibers.now() + print_prepare_diag('prepare_call_start', { + component = component, + job_id = job.job_id, + expected_image_id = image_id, + }) local prepared, perr = fibers.perform(call_component_op(self, component, 'prepare-update', prepare_payload)) - if prepared == nil then return nil, perr or 'component_prepare_update_failed' end + local prepare_duration = math.floor(((fibers.now() - prepare_started) * 1000) + 0.5) + print_prepare_diag('prepare_call_done', { + component = component, + job_id = job.job_id, + expected_image_id = image_id, + duration_ms = prepare_duration, + ok = prepared ~= nil, + err = perr, + }) + if prepared == nil then return nil, phase_error('component_prepare_update_failed', perr) end - local source, serr = fibers.perform(self._artifact_store:open_source_op(ref)) - if source == nil then return nil, serr or 'artifact_source_open_failed' end local payload = { job_id = job.job_id, expected_image_id = image_id, - source = source, + artifact_ref = ref, size = desc.size, digest_alg = desc.digest_alg or 'xxhash32', digest = desc.digest or desc.checksum, - chunk_size = self._chunk_size or 2048, + chunk_size = transfer_chunk_size_after_prepare(self, job, prepared), format = meta.format or desc.format or 'dcmcu-v1', metadata = metadata_of(job), } - local reply, err = fibers.perform(call_component_op(self, component, 'stage-update', payload)) - if reply == nil then return nil, err or 'component_stage_update_failed' end + local reply, err = fibers.perform(call_component_op(self, component, 'stage-update', payload, { timeout = false })) + if reply == nil then return nil, phase_error('component_stage_update_failed', err) end local ok_reply, rerr = validate_stage_reply(reply) if ok_reply ~= true then return nil, rerr end return { @@ -174,29 +432,6 @@ function Backend:stage_op(job, ctx) end), 'component_stage') end -local function component_snapshot(snapshot, component) - if type(snapshot) ~= 'table' then return nil end - local by_id = snapshot.by_id or snapshot.components - local rec = type(by_id) == 'table' and by_id[component] or nil - if type(rec) == 'table' and type(rec.state) == 'table' then return rec.state end - if type(rec) == 'table' then return rec end - if snapshot.component == component then return snapshot.state or snapshot end - return nil -end - -local function latest_component_state(self, component) - local obs = self._observer - if obs and type(obs.snapshot) == 'function' then - return component_snapshot(obs:snapshot(), component) - end - return nil -end - -local function updater_state(state) - if type(state) ~= 'table' then return nil end - return state.update or state.updater -end - function Backend:pre_commit_record_op(job, ctx) local component = component_of(self, job) local state = latest_component_state(self, component) @@ -221,8 +456,12 @@ function Backend:commit_op(job, ctx) metadata = metadata_of(job), } return call_component_op(self, component, 'commit-update', payload):wrap(function (reply, err) - if reply == nil then return nil, err or 'component_commit_update_failed' end - return { accepted = true, reply = reply } + if reply == nil then + return nil, phase_error('component_commit_update_failed', err) + end + local accepted, rerr = validate_commit_reply(reply) + if accepted == nil then return nil, rerr end + return { accepted = true, reply = reply, component_reply = accepted } end) end @@ -236,15 +475,34 @@ function Backend:evaluate_reconcile(job, snapshot, ctx) local expected = expected_image_id(job, ctx) or (pre and pre.expected_image_id) local pre_boot = pre and pre.pre_commit_boot_id - if type(upd) == 'table' and (upd.state == 'failed' or upd.state == 'rollback_detected') then + if type(upd) == 'table' and upd.state == 'failed' then + return { done = true, ok = false, reason = upd.last_error or upd.state, state = copy(state) } + end + if type(upd) == 'table' and upd.state == 'rollback_detected' then return { done = true, ok = false, reason = upd.state, state = copy(state) } end + if component == 'mcu' then + local missing = missing_mcu_critical_facts(state) + if #missing > 0 then + return { + done = false, + reason = 'waiting_for_mcu_critical_state', + missing_facts = missing, + required_facts = copy(MCU_CRITICAL_FACTS), + state = copy(state), + } + end + end + if type(sw) == 'table' and expected and sw.image_id == expected and pre_boot and sw.boot_id ~= pre_boot then return { done = true, ok = true, state = copy(state) } end - if type(sw) == 'table' and expected and pre_boot and sw.boot_id ~= nil and sw.boot_id ~= pre_boot and sw.image_id ~= expected then + if type(sw) == 'table' + and expected and pre_boot and sw.boot_id ~= nil + and sw.boot_id ~= pre_boot and sw.image_id ~= expected + then return { done = true, ok = false, reason = 'wrong_image_after_reboot', state = copy(state) } end @@ -262,7 +520,7 @@ function M.new(opts) _artifact_store = opts.artifact_store, _observer = opts.observer, _component = opts.component or 'mcu', - _chunk_size = opts.chunk_size or 2048, + _chunk_size = opts.chunk_size, _commit_policy = opts.commit_policy, _call_opts = opts.call_opts, }, Backend) diff --git a/src/services/update/job_runtime.lua b/src/services/update/job_runtime.lua index 23d4f5370..4b3f35e1b 100644 --- a/src/services/update/job_runtime.lua +++ b/src/services/update/job_runtime.lua @@ -12,6 +12,7 @@ local fibers = require 'fibers' local op = require 'fibers.op' local cond = require 'fibers.cond' local mailbox = require 'fibers.mailbox' +local sleep = require 'fibers.sleep' local scoped_work = require 'devicecode.support.scoped_work' local queue = require 'devicecode.support.queue' @@ -22,6 +23,7 @@ local active_policy = require 'services.update.active_policy' local M = {} local DEFAULT_QUEUE = 32 +local DEFAULT_PERSISTENCE_RETRY_BACKOFF = { 0.25, 0.5, 1.0, 2.0, 5.0 } local Runtime = {} Runtime.__index = Runtime @@ -78,6 +80,15 @@ local function transition_record(req, state, details) base.finished = details.finished base.error = details.error base.plan_kind = details.plan_kind + base.store_op = details.store_op + base.store_err = details.store_err + base.retry_attempts = details.retry_attempts + base.retry_delay = details.retry_delay + base.retry_mode = details.retry_mode + base.first_failed_at = details.first_failed_at + base.last_attempt_at = details.last_attempt_at + base.dependency = details.dependency + base.persistence_pending = details.persistence_pending == true or nil if state == 'admitted' or state == 'persisting' or state == 'persisted' @@ -152,6 +163,16 @@ local function transition_outcome_from_record(rec, value) out.token = out.token or rec.token out.transition_state = rec.state out.lifecycle = rec.state + out.plan_kind = out.plan_kind or rec.plan_kind + out.store_op = out.store_op or rec.store_op + out.store_err = out.store_err or rec.store_err + out.retry_attempts = out.retry_attempts or rec.retry_attempts + out.retry_delay = out.retry_delay or rec.retry_delay + out.retry_mode = out.retry_mode or rec.retry_mode + out.first_failed_at = out.first_failed_at or rec.first_failed_at + out.last_attempt_at = out.last_attempt_at or rec.last_attempt_at + out.dependency = out.dependency or rec.dependency + out.persistence_pending = out.persistence_pending or rec.persistence_pending return out end @@ -191,15 +212,18 @@ end local function runtime_snapshot(self_or_jobs, ready, adoption) local jobs = self_or_jobs local transitions + local persistence if type(self_or_jobs) == 'table' and self_or_jobs._jobs ~= nil then jobs = self_or_jobs._jobs transitions = transition_snapshot(self_or_jobs) + persistence = copy(self_or_jobs._persistence) end return { ready = not not ready, jobs = repo_mod.snapshot(jobs), adoption = copy(adoption or {}), transitions = transitions, + persistence = persistence, count = count_keys(jobs and jobs.jobs or {}), } end @@ -281,6 +305,61 @@ local function store_delete_op(store, job_id) return store:delete_job_op(job_id) end +local RETRYABLE_STORE_ERRORS = { + control_store_put_timeout = true, + control_store_get_timeout = true, + control_store_list_timeout = true, + control_store_delete_timeout = true, + liveness_timeout = true, +} + +local function strip_job_store_prefix(reason) + reason = tostring(reason or '') + return reason:match('^job_store_[^:]+_failed:(.+)$') or reason +end + +local function is_retryable_store_error(reason) + local inner = strip_job_store_prefix(reason) + if RETRYABLE_STORE_ERRORS[inner] then return true end + if inner:match('^control_store_[%w_]+_timeout$') then return true end + if inner:find('liveness_timeout', 1, true) then return true end + return false +end + +local function store_op_for_plan(plan) + if plan.kind == 'save_job' then return 'save_job_op' end + if plan.kind == 'delete_job' then return 'delete_job_op' end + return nil +end + +local function retry_delay_for(self, attempt) + local configured = self._params and self._params.persistence_retry_backoff + if type(configured) == 'function' then + local delay = configured(attempt) + if type(delay) == 'number' and delay >= 0 then return delay end + elseif type(configured) == 'table' then + local delay = configured[attempt] or configured[#configured] + if type(delay) == 'number' and delay >= 0 then return delay end + elseif type(configured) == 'number' and configured >= 0 then + return configured + end + return DEFAULT_PERSISTENCE_RETRY_BACKOFF[attempt] + or DEFAULT_PERSISTENCE_RETRY_BACKOFF[#DEFAULT_PERSISTENCE_RETRY_BACKOFF] +end + +local function persist_plan_once(self, plan) + if plan.kind == 'save_job' then + local ok, serr = fibers.perform(store_save_op(self._store, public_job(plan.job))) + if ok ~= true then return nil, 'job_store_save_failed:' .. tostring(serr or 'job_save_failed') end + elseif plan.kind == 'delete_job' then + local ok, derr = fibers.perform(store_delete_op(self._store, plan.job_id)) + if ok ~= true then return nil, 'job_store_delete_failed:' .. tostring(derr or 'job_delete_failed') end + else + return nil, 'unsupported_plan_kind' + end + return true, nil +end + local function sorted_job_ids(jobs) local ids = {} for id in pairs((jobs and jobs.jobs) or {}) do ids[#ids + 1] = id end @@ -504,13 +583,28 @@ end local function durable_active_owner(jobs) for id, job in pairs((jobs and jobs.jobs) or {}) do - if job.active_token ~= nil or type(job.active_intent) == 'table' then + if not repo_mod.is_terminal(job.state) + and (job.active_token ~= nil or type(job.active_intent) == 'table') + then return id, job end end return nil, nil end +local function job_has_active_ownership(job) + return job ~= nil + and (job.active_token ~= nil + or type(job.active_intent) == 'table' + or type(job.active) == 'table') +end + +local function job_is_default_discardable(job) + return job ~= nil + and job.state == 'created' + and not job_has_active_ownership(job) +end + local function compute_create(self, cmd) local payload = copy(cmd.payload or cmd.job or {}) payload.job_id = payload.job_id or cmd.job_id or new_id('job') @@ -877,6 +971,9 @@ end local function compute_discard(self, cmd) local current = cmd.job_id and self._jobs.jobs[cmd.job_id] or nil if not current then return nil, 'not_found' end + if cmd.force ~= true and not job_is_default_discardable(current) then + return nil, 'job_not_discardable' + end return { kind = 'delete_job', transition = cmd.kind, @@ -936,7 +1033,10 @@ local function apply_plan(self, plan) return true, nil end +local persistence_record_from_event + local function start_transition_worker(self, req, plan) + local store_op = store_op_for_plan(plan) local identity = { kind = 'job_transition_done', service_id = self._service_id, @@ -946,6 +1046,8 @@ local function start_transition_worker(self, req, plan) generation = req.cmd and req.cmd.generation, phase = req.cmd and req.cmd.phase, token = req.cmd and req.cmd.token, + plan_kind = plan.kind, + store_op = store_op, } local handle, err = scoped_work.start { @@ -955,14 +1057,51 @@ local function start_transition_worker(self, req, plan) identity = identity, run = function () - if plan.kind == 'save_job' then - local ok, serr = fibers.perform(store_save_op(self._store, public_job(plan.job))) - if ok ~= true then error(serr or 'job_save_failed', 0) end - elseif plan.kind == 'delete_job' then - local ok, derr = fibers.perform(store_delete_op(self._store, plan.job_id)) - if ok ~= true then error(derr or 'job_delete_failed', 0) end - else - error('unsupported_plan_kind', 0) + local attempts = 0 + local first_failed_at + while true do + local persisted, perr = persist_plan_once(self, plan) + if persisted == true then + break + end + + if not is_retryable_store_error(perr) then + error(perr or 'job_transition_persist_failed', 0) + end + + attempts = attempts + 1 + local attempt_at = fibers.now() + first_failed_at = first_failed_at or attempt_at + local retry_delay = retry_delay_for(self, attempts) + local retry_ev = { + kind = 'job_transition_retrying', + service_id = self._service_id, + transition_id = req.id, + transition = plan.transition, + job_id = plan.job_id, + generation = req.cmd and req.cmd.generation, + phase = plan.phase or (req.cmd and req.cmd.phase), + token = plan.token or (req.cmd and req.cmd.token), + plan_kind = plan.kind, + store_op = store_op, + store_err = perr, + retry_attempts = attempts, + retry_delay = retry_delay, + retry_mode = 'indefinite', + first_failed_at = first_failed_at, + last_attempt_at = attempt_at, + dependency = 'job_store', + } + self._persistence = persistence_record_from_event(retry_ev) + local ok_retry, retry_err = queue.try_admit_required( + self._done_tx, + retry_ev, + 'job_transition_retry_report_failed' + ) + if ok_retry ~= true then + error(retry_err or 'job_transition_retry_report_failed', 0) + end + fibers.perform(sleep.sleep_op(retry_delay)) end return { @@ -989,6 +1128,53 @@ end local record_transition_outcome +function persistence_record_from_event(ev) + return { + pending = true, + source = 'job_runtime', + reason = ev.store_err, + transition_id = ev.transition_id, + transition = ev.transition, + job_id = ev.job_id, + generation = ev.generation, + phase = ev.phase, + token = ev.token, + plan_kind = ev.plan_kind, + store_op = ev.store_op, + store_err = ev.store_err, + retry_attempts = ev.retry_attempts, + retry_delay = ev.retry_delay, + retry_mode = ev.retry_mode, + first_failed_at = ev.first_failed_at, + last_attempt_at = ev.last_attempt_at, + dependency = ev.dependency, + } +end + +local function handle_transition_retrying(self, ev) + local inflight = self._inflight + if not inflight or ev.transition_id ~= inflight.request.id then + return + end + + transition_set(self, inflight.request, 'persisting', { + sequence = inflight.request.sequence, + plan_kind = inflight.plan.kind, + store_op = ev.store_op, + store_err = ev.store_err, + error = ev.store_err, + retry_attempts = ev.retry_attempts, + retry_delay = ev.retry_delay, + retry_mode = ev.retry_mode, + first_failed_at = ev.first_failed_at, + last_attempt_at = ev.last_attempt_at, + dependency = ev.dependency, + persistence_pending = true, + }) + self._persistence = persistence_record_from_event(ev) + refresh_model(self) +end + local function start_next(self) if self._inflight ~= nil then return true end @@ -1012,21 +1198,23 @@ local function start_next(self) refresh_model(self) resolve_cell(req.cell, outcome, nil) else - local rec = transition_set(self, req, 'admitted', { + transition_set(self, req, 'admitted', { sequence = req.sequence, plan_kind = plan.kind, + store_op = store_op_for_plan(plan), }) refresh_model(self) - rec = transition_set(self, req, 'persisting', { + transition_set(self, req, 'persisting', { sequence = req.sequence, plan_kind = plan.kind, + store_op = store_op_for_plan(plan), }) refresh_model(self) local handle, herr = start_transition_worker(self, req, plan) if not handle then - rec = transition_set(self, req, 'failed', { + local rec = transition_set(self, req, 'failed', { sequence = req.sequence, plan_kind = plan.kind, error = herr or 'job_transition_start_failed', @@ -1081,6 +1269,12 @@ local function transition_outcome(req, plan, status, reason) out.token = plan.token or (req.cmd and req.cmd.token) out.commit_token = plan.commit_token or (req.cmd and req.cmd.commit_token) or out.commit_token out.commit_policy = plan.commit_policy or (req.cmd and (req.cmd.commit_policy or req.cmd.policy)) or out.commit_policy + out.plan_kind = plan.kind + if plan.kind == 'save_job' then + out.store_op = 'save_job_op' + elseif plan.kind == 'delete_job' then + out.store_op = 'delete_job_op' + end if status ~= 'persisted' then out.reason = reason end return out end @@ -1093,12 +1287,18 @@ function record_transition_outcome(self, outcome) end local function handle_transition_done(self, ev) + if ev.kind == 'job_transition_retrying' then + handle_transition_retrying(self, ev) + return + end + local inflight = self._inflight if not inflight or ev.transition_id ~= inflight.request.id then return end self._inflight = nil + self._persistence = nil local outcome local rec @@ -1108,6 +1308,7 @@ local function handle_transition_done(self, ev) rec = transition_set(self, inflight.request, 'persisted', { sequence = inflight.request.sequence, plan_kind = inflight.plan.kind, + store_op = ev.store_op, }) outcome = transition_outcome_from_record(rec, transition_outcome(inflight.request, inflight.plan, 'persisted')) else @@ -1115,8 +1316,17 @@ local function handle_transition_done(self, ev) sequence = inflight.request.sequence, plan_kind = inflight.plan.kind, error = err or 'job_transition_apply_failed', + store_op = ev.store_op, }) - outcome = transition_outcome_from_record(rec, transition_outcome(inflight.request, inflight.plan, 'failed', err or 'job_transition_apply_failed')) + outcome = transition_outcome_from_record( + rec, + transition_outcome( + inflight.request, + inflight.plan, + 'failed', + err or 'job_transition_apply_failed' + ) + ) end else local reason = ev.primary or ev.status or 'job_transition_failed' @@ -1124,8 +1334,11 @@ local function handle_transition_done(self, ev) sequence = inflight.request.sequence, plan_kind = inflight.plan.kind, error = reason, + store_op = ev.store_op, + store_err = reason, }) outcome = transition_outcome_from_record(rec, transition_outcome(inflight.request, inflight.plan, 'failed', reason)) + outcome.store_err = reason end record_transition_outcome(self, outcome) @@ -1247,6 +1460,14 @@ function Runtime:transition_snapshot() return transition_snapshot(self) end +function Runtime:persistence_pending() + return self._persistence ~= nil +end + +function Runtime:persistence_snapshot() + return copy(self._persistence) +end + function Runtime:transition_outcome(transition_id) local out = self._transition_outcomes and self._transition_outcomes[transition_id] or nil return out and copy(out) or nil @@ -1277,6 +1498,10 @@ function Runtime:admit_transition(cmd) return nil, self._ready_err or 'job_runtime_not_ready' end + if self._persistence ~= nil then + return nil, 'job_persistence_pending' + end + local ok, err = queue.try_admit_now(self._request_tx, req) if ok ~= true then return nil, err or 'job_runtime_busy' diff --git a/src/services/update/job_store_control_store.lua b/src/services/update/job_store_control_store.lua index 5b5efb89e..7ee94c838 100644 --- a/src/services/update/job_store_control_store.lua +++ b/src/services/update/job_store_control_store.lua @@ -35,10 +35,27 @@ local VOID_SUCCESS = { delete = true, } +local function method_timeout_reason(method) + if method == 'put' then return 'control_store_put_timeout' end + if method == 'get' then return 'control_store_get_timeout' end + if method == 'list' then return 'control_store_list_timeout' end + if method == 'delete' then return 'control_store_delete_timeout' end + return 'control_store_call_timeout' +end + +local function normalise_call_error(method, err) + if err == nil then return nil end + local s = tostring(err) + if s == 'timeout' or s:lower():find('timeout', 1, true) ~= nil then + return method_timeout_reason(method) + end + return s +end + local function unwrap_reply_for(method) return function (reply, err) if reply == nil then - return nil, err + return nil, normalise_call_error(method, err) end if type(reply) ~= 'table' or type(reply.ok) ~= 'boolean' then return nil, 'invalid_control_store_reply' @@ -101,6 +118,14 @@ local function decode_job(key, body) return job, nil end +local function is_missing_store_entry(err) + if err == nil then return false end + local s = tostring(err):lower() + return s == 'not found' + or s:find('no such file', 1, true) ~= nil + or s:find('enoent', 1, true) ~= nil +end + function Store:load_all_op() return fibers.run_scope_op(function () local list_opts, lerr = cap_args.new.ControlStoreListOpts(self._prefix) @@ -116,10 +141,15 @@ function Store:load_all_op() local get_opts, gerr = cap_args.new.ControlStoreGetOpts(key) if not get_opts then return nil, gerr or 'invalid_control_store_get_opts' end local body, berr = fibers.perform(call_op(self, 'get', get_opts)) - if body == nil then return nil, berr or ('control_store_get_failed:' .. key) end - local job, derr = decode_job(key, body) - if not job then return nil, derr end - jobs[job.job_id] = copy(job) + if body == nil then + if not is_missing_store_entry(berr) then + return nil, berr or ('control_store_get_failed:' .. key) + end + else + local job, derr = decode_job(key, body) + if not job then return nil, derr end + jobs[job.job_id] = copy(job) + end end end diff --git a/src/services/update/manager.lua b/src/services/update/manager.lua index 1b8e62dbc..b9595ee23 100644 --- a/src/services/update/manager.lua +++ b/src/services/update/manager.lua @@ -148,6 +148,21 @@ local function reject_if_jobs_not_ready(ctx, req) return true end +local function reject_if_persistence_pending(ctx, req) + if ctx.jobs + and type(ctx.jobs.persistence_pending) == 'function' + and ctx.jobs:persistence_pending() == true + then + fail_request(req, 'job_persistence_pending') + return true + end + return false +end + +local function reject_if_jobs_blocked(ctx, req) + return reject_if_jobs_not_ready(ctx, req) or reject_if_persistence_pending(ctx, req) +end + local function is_ingest_method(method) return method == 'ingest_create' or method == 'ingest_append' @@ -156,7 +171,7 @@ local function is_ingest_method(method) end local function handle_create(ctx, req) - if reject_if_jobs_not_ready(ctx, req) then return end + if reject_if_jobs_blocked(ctx, req) then return end return start_scoped_request(ctx, req, 'create_job', function (work_scope, owner) return manager_requests.create_job(work_scope, { request_owner = owner, @@ -169,7 +184,7 @@ local function handle_create(ctx, req) end local function handle_start(ctx, req, payload) - if reject_if_jobs_not_ready(ctx, req) then return end + if reject_if_jobs_blocked(ctx, req) then return end payload = type(payload) == 'table' and payload or {} local job_id = payload.job_id local job = ctx.jobs:get(job_id) @@ -208,7 +223,7 @@ local function handle_start(ctx, req, payload) end local function handle_patch(ctx, req, payload, method, patch) - if reject_if_jobs_not_ready(ctx, req) then return end + if reject_if_jobs_blocked(ctx, req) then return end payload = type(payload) == 'table' and payload or {} local job_id = payload.job_id if type(job_id) ~= 'string' or job_id == '' then @@ -234,6 +249,9 @@ local function handle_cancel(ctx, req, payload) state = 'cancelled', next_step = nil, error = p.reason or 'cancelled', + active = nil, + active_token = nil, + active_intent = nil, } end) end @@ -252,7 +270,7 @@ local function handle_retry(ctx, req, payload) end local function handle_discard(ctx, req, payload) - if reject_if_jobs_not_ready(ctx, req) then return end + if reject_if_jobs_blocked(ctx, req) then return end payload = type(payload) == 'table' and payload or {} local job_id = payload.job_id if type(job_id) ~= 'string' or job_id == '' then @@ -266,6 +284,7 @@ local function handle_discard(ctx, req, payload) jobs = ctx.jobs, job_id = job_id, generation = ctx.generation, + reason = payload.reason, }) end) end diff --git a/src/services/update/manager_requests.lua b/src/services/update/manager_requests.lua index 547a43d94..9452c8b0d 100644 --- a/src/services/update/manager_requests.lua +++ b/src/services/update/manager_requests.lua @@ -210,7 +210,8 @@ function M.discard_job(scope, params) kind = 'discard_job', generation = params.generation, job_id = params.job_id or (params.job and params.job.job_id), - reason = params.method or 'discard_job', + reason = params.reason or params.method or 'discard_job', + force = params.force == true, }) if not result or result.status ~= 'persisted' then local reason = err or (result and result.reason) or 'discard_job_failed' diff --git a/src/services/update/model.lua b/src/services/update/model.lua index c999ff4dd..93e4be6c4 100644 --- a/src/services/update/model.lua +++ b/src/services/update/model.lua @@ -30,6 +30,9 @@ function M.service_initial(service_id, generation) jobs = { count = 0, by_id = {} }, ingest = { count = 0, by_id = {} }, pending = {}, + last_failure = nil, + last_warning = nil, + job_runtime = nil, publisher = { state = 'starting' }, dependencies = {}, } diff --git a/src/services/update/projection.lua b/src/services/update/projection.lua index f42cf557c..9ff7e4195 100644 --- a/src/services/update/projection.lua +++ b/src/services/update/projection.lua @@ -78,6 +78,9 @@ function M.service_state(snapshot) jobs = copy(snapshot.jobs or { count = 0, by_id = {} }), ingest = copy(snapshot.ingest or { count = 0, by_id = {} }), pending = copy(snapshot.pending), + last_failure = copy(snapshot.last_failure), + last_warning = copy(snapshot.last_warning), + job_runtime = copy(snapshot.job_runtime), dependencies = copy(snapshot.dependencies), publisher = copy(snapshot.publisher), } diff --git a/src/services/update/service.lua b/src/services/update/service.lua index 80a836d69..488b4f47c 100644 --- a/src/services/update/service.lua +++ b/src/services/update/service.lua @@ -12,7 +12,7 @@ local mailbox = require 'fibers.mailbox' local scoped_work = require 'devicecode.support.scoped_work' local queue = require 'devicecode.support.queue' local bus_cleanup = require 'devicecode.support.bus_cleanup' -local config_watch = require 'devicecode.support.config_watch' +local config_watch = require 'devicecode.support.config_watch' local service_events = require 'devicecode.support.service_events' local service_base = require 'devicecode.service_base' local cap_deps_mod = require 'devicecode.support.capability_dependencies' @@ -22,7 +22,6 @@ local config_mod = require 'services.update.config' local events = require 'services.update.events' local generation = require 'services.update.generation' local publisher = require 'services.update.publisher' -local projection = require 'services.update.projection' local topics = require 'services.update.topics' local job_store_memory = require 'services.update.job_store_memory' local control_store_jobs = require 'services.update.job_store_control_store' @@ -50,6 +49,8 @@ local ensure_runtime_dependents local reconcile_runtime_components local classify_dependency_route_missing local handle_artifact_route_missing +local fail_update_service +local artifact_store_available_for_work local function copy(v) @@ -108,6 +109,9 @@ local function update_model_state(self, state, reason) local extra = { ready = ready } if reason ~= nil then extra.reason = reason end if snapshot and snapshot.pending ~= nil then extra.pending = snapshot.pending end + if snapshot and snapshot.last_failure ~= nil then extra.last_failure = snapshot.last_failure end + if snapshot and snapshot.last_warning ~= nil then extra.last_warning = snapshot.last_warning end + if snapshot and snapshot.job_runtime ~= nil then extra.job_runtime = snapshot.job_runtime end extra.dependencies = self._deps:snapshot() self._svc:status(state, extra) end @@ -185,6 +189,137 @@ local function active_snapshot(self) return snap and snap.active or nil end +local TERMINAL_JOB_STATE = { + succeeded = true, + failed = true, + cancelled = true, + timed_out = true, + superseded = true, + discarded = true, +} + +local function current_job_snapshot(self) + if not self._jobs or type(self._jobs.snapshot) ~= 'function' then + return nil + end + local snap = self._jobs:snapshot() + local by_id = snap and snap.by_id or nil + if type(by_id) ~= 'table' then return nil end + + local selected + for _, id in ipairs(snap.order or {}) do + local job = by_id[id] + if type(job) == 'table' then + if job.active_intent or job.active or not TERMINAL_JOB_STATE[job.state] then + selected = job + if job.active_intent or job.active then break end + end + if job.state == 'staging' + or job.state == 'awaiting_commit' + or job.state == 'committing' + or job.state == 'awaiting_return' + then + selected = job + break + end + end + end + if selected == nil then + for _, job in pairs(by_id) do + if type(job) == 'table' then + selected = job + break + end + end + end + if selected == nil then return nil end + return { + job_id = selected.job_id, + state = selected.state, + component = selected.component, + next_step = selected.next_step, + phase = selected.phase or (selected.active and selected.active.phase) + or (selected.active_intent and selected.active_intent.phase), + } +end + +local function event_failure_fields(ev) + ev = type(ev) == 'table' and ev or {} + return { + event_kind = ev.kind, + event_status = ev.status, + event_primary = ev.primary, + component = ev.component, + transition_id = ev.transition_id, + transition = ev.transition, + phase = ev.phase, + token = ev.token, + plan_kind = ev.plan_kind, + store_op = ev.store_op, + store_err = ev.store_err, + retry_attempts = ev.retry_attempts, + retry_delay = ev.retry_delay, + retry_mode = ev.retry_mode, + first_failed_at = ev.first_failed_at, + last_attempt_at = ev.last_attempt_at, + dependency = ev.dependency, + } +end + +local function make_last_failure(self, source, reason, ev) + local rec = event_failure_fields(ev) + rec.source = source or 'unknown' + rec.reason = tostring(reason or 'failed') + rec.job_runtime_ready = self._job_runtime_ready == true + rec.active_job = copy(active_snapshot(self)) + rec.current_job = current_job_snapshot(self) + if rec.current_job then + rec.job_id = rec.current_job.job_id + rec.job_state = rec.current_job.state + end + return rec +end + +fail_update_service = function(self, source, reason, ev) + local failure = make_last_failure(self, source, reason, ev) + self._last_failure = failure + self._model:update(function (s) + s.last_failure = copy(failure) + return s + end) + update_model_state(self, 'failed', reason) +end + +local function make_last_warning(_, source, reason, ev) + local rec = event_failure_fields(ev) + rec.source = source or 'unknown' + rec.reason = tostring(reason or 'warning') + return rec +end + +local function persistence_warning(persistence) + persistence = type(persistence) == 'table' and persistence or {} + local reason = persistence.reason or persistence.store_err or 'update_persistence_pending' + return make_last_warning(nil, 'job_runtime', reason, { + kind = 'job_transition_retrying', + status = 'retrying', + primary = persistence.reason or persistence.store_err, + transition_id = persistence.transition_id, + transition = persistence.transition, + phase = persistence.phase, + token = persistence.token, + plan_kind = persistence.plan_kind, + store_op = persistence.store_op, + store_err = persistence.store_err, + retry_attempts = persistence.retry_attempts, + retry_delay = persistence.retry_delay, + retry_mode = persistence.retry_mode, + first_failed_at = persistence.first_failed_at, + last_attempt_at = persistence.last_attempt_at, + dependency = persistence.dependency, + }) +end + local function route_generation_event(self, ev, label) local active = self._current_generation if not active or active.state ~= 'running' or not active.route_port then @@ -323,7 +458,7 @@ local function apply_config(self, payload, reason) local ok, start_err = replace_generation(self, cfg, reason or 'config_changed') if not ok then - update_model_state(self, 'failed', start_err or 'generation_start_failed') + fail_update_service(self, 'generation', start_err or 'generation_start_failed') error(start_err or 'generation_start_failed', 0) end @@ -376,7 +511,7 @@ local function handle_generation_done(self, ev) handle_artifact_route_missing(self, reason) return end - update_model_state(self, 'failed', reason) + fail_update_service(self, 'generation', reason, ev) error('update generation failed: ' .. tostring(reason), 0) end @@ -420,6 +555,16 @@ local function method_dependency_reason(self, method) return nil end +local function is_read_only_manager_method(method) + return method == 'status' or method == 'list-jobs' or method == 'get-job' +end + +local function update_persistence_pending(self) + return self._jobs ~= nil + and type(self._jobs.persistence_pending) == 'function' + and self._jobs:persistence_pending() == true +end + local function reply_shell_manager_request(self, req, method) if method == 'status' then reply_manager_request(req, { ok = true, snapshot = self._model:snapshot() }, 'status_reply_failed') @@ -463,6 +608,11 @@ local function route_manager_request(self, req, method) return end + if not is_read_only_manager_method(method) and update_persistence_pending(self) then + fail_manager_request(req, 'update_persistence_pending') + return + end + -- Read-only job inspection is safe for the service shell to answer directly -- while no generation has been admitted, provided the durable job runtime is -- ready. Mutating work belongs to the active generation. @@ -501,10 +651,46 @@ end local function update_service_jobs_projection(self) if not self._jobs then return end + local runtime = self._jobs:model_snapshot() + local persistence = runtime and runtime.persistence or nil self._model:update(function (s) - s.jobs = self._jobs:snapshot() + s.jobs = (runtime and runtime.jobs) or self._jobs:snapshot() + s.job_runtime = runtime and { + ready = runtime.ready == true, + transitions = copy(runtime.transitions), + persistence = copy(runtime.persistence), + } or nil return s end) + return persistence +end + +local function apply_job_persistence_status(self, persistence) + if persistence ~= nil then + local warning = persistence_warning(persistence) + self._last_warning = warning + self._model:update(function (s) + s.pending = s.pending or {} + s.pending.persistence = copy(persistence) + s.last_warning = copy(warning) + return s + end) + update_model_state(self, 'degraded', 'update_persistence_pending') + return true + end + + local snapshot = self._model:snapshot() + local was_pending = snapshot and snapshot.reason == 'update_persistence_pending' + self._last_warning = nil + self._model:update(function (s) + if s.pending then s.pending.persistence = nil end + s.last_warning = nil + return s + end) + if was_pending then + update_model_state(self, 'running') + end + return false end local function consider_active_jobs(self) @@ -516,7 +702,7 @@ local function consider_active_jobs(self) update_active_projection(self) if ok == nil and err ~= 'slot_busy' and err ~= 'no_active_intent' and err ~= 'not_ready' then - update_model_state(self, 'failed', err) + fail_update_service(self, 'active_runtime', err) error('update active runtime launch failed: ' .. tostring(err), 0) end @@ -527,14 +713,16 @@ local function handle_job_runtime_changed(self, ev) self._jobs_seen = ev.version if self._jobs and self._jobs:ready() and not self._job_runtime_ready then self._job_runtime_ready = true - update_service_jobs_projection(self) + local persistence = update_service_jobs_projection(self) + if apply_job_persistence_status(self, persistence) then return end local ok, err = reconcile_runtime_components(self, 'job_runtime_ready') if ok ~= true then - update_model_state(self, 'failed', err or 'runtime_dependents_start_failed') + fail_update_service(self, 'runtime_reconcile', err or 'runtime_dependents_start_failed', ev) error(err or 'runtime_dependents_start_failed', 0) end else - update_service_jobs_projection(self) + local persistence = update_service_jobs_projection(self) + if apply_job_persistence_status(self, persistence) then return end end if self._current_generation then apply_generation_snapshot(self, self._current_generation.last_snapshot or { @@ -564,7 +752,7 @@ local function handle_active_runtime_changed(self, ev) reason = ev and ev.reason or 'active_runtime_changed', }, 'update_generation_active_snapshot_admission_failed') if ok ~= true then - update_model_state(self, 'failed', err or 'active_snapshot_route_failed') + fail_update_service(self, 'manager_route', err or 'active_snapshot_route_failed', ev) error(err or 'active_snapshot_route_failed', 0) end end @@ -622,7 +810,7 @@ local function reduce_event(self, ev) return end - update_model_state(self, 'failed', reason) + fail_update_service(self, 'job_runtime', reason, ev) error(reason, 0) end @@ -671,10 +859,14 @@ local function reduce_event(self, ev) set_waiting_for_job_store(self, 'job_store_unavailable') return end - update_model_state(self, 'failed', reason) + fail_update_service(self, 'job_runtime', reason, ev) error('update job runtime failed: ' .. tostring(reason), 0) end - if ev.status == 'cancelled' and (self._suppress_dependents_reason or ev.primary == 'job_store_unavailable' or ev.primary == 'artifact_store_unavailable') then + if ev.status == 'cancelled' + and (self._suppress_dependents_reason + or ev.primary == 'job_store_unavailable' + or ev.primary == 'artifact_store_unavailable') + then return end if not self._complete then @@ -684,17 +876,31 @@ local function reduce_event(self, ev) end if ev.kind == 'component_done' and ev.component == 'active_runtime' then - if ev.status == 'cancelled' and (self._suppress_dependents_reason or ev.primary == 'job_store_unavailable' or ev.primary == 'artifact_store_unavailable') then + if ev.status == 'cancelled' + and (self._suppress_dependents_reason + or ev.primary == 'job_store_unavailable' + or ev.primary == 'artifact_store_unavailable') + then return end + self._active_component = nil if ev.status == 'failed' then local reason = ev.primary or 'active_runtime_failed' - if self._artifact_store_dependency_required and classify_dependency_route_missing(self, 'artifact_store', ev, reason) then + if self._deps:classify_call_failure('job_store', ev, reason) == 'route_missing' then + stop_runtime_dependents(self, 'job_store_unavailable') + stop_job_runtime(self, 'job_store_unavailable') + update_dependencies_projection(self) + set_waiting_for_job_store(self, 'job_store_unavailable') + return + end + if self._artifact_store_dependency_required + and classify_dependency_route_missing(self, 'artifact_store', ev, reason) + then handle_artifact_route_missing(self, reason) return end - update_model_state(self, 'failed', reason) + fail_update_service(self, 'active_runtime', reason, ev) error('update active runtime failed: ' .. tostring(reason), 0) end if not self._complete then @@ -704,13 +910,17 @@ local function reduce_event(self, ev) end if ev.kind == 'component_done' and ev.component == 'component_watch' then - if ev.status == 'cancelled' and (self._suppress_dependents_reason or ev.primary == 'job_store_unavailable' or ev.primary == 'artifact_store_unavailable') then + if ev.status == 'cancelled' + and (self._suppress_dependents_reason + or ev.primary == 'job_store_unavailable' + or ev.primary == 'artifact_store_unavailable') + then return end self._component_watch = nil if ev.status == 'failed' then local reason = ev.primary or 'component_watch_failed' - update_model_state(self, 'failed', reason) + fail_update_service(self, 'component_watch', reason, ev) error('update component watch failed: ' .. tostring(reason), 0) end if not self._complete then @@ -724,7 +934,7 @@ local function reduce_event(self, ev) self._publisher = nil if ev.status == 'failed' then local reason = ev.primary or 'publisher_failed' - update_model_state(self, 'failed', reason) + fail_update_service(self, 'publisher', reason, ev) error('update publisher failed: ' .. tostring(reason), 0) end if not self._complete then @@ -1035,7 +1245,7 @@ stop_runtime_dependents = function(self, reason) self._component_watch = nil end -local function artifact_store_available_for_work(self) +artifact_store_available_for_work = function(self) return not self._artifact_store_dependency_required or dependency_effectively_available(self, 'artifact_store') end @@ -1066,6 +1276,7 @@ local function ensure_job_runtime(self, reason) initial_jobs = params.initial_jobs, done_tx = self._done_tx, queue_len = params.job_runtime_queue_len, + persistence_retry_backoff = params.persistence_retry_backoff, }) if not jobs then return nil, jobs_err or 'update_job_repository_start_failed' @@ -1207,7 +1418,7 @@ handle_dependency_changed = function(self, ev) if dependency_effectively_available(self, 'job_store') then local ok, err = reconcile_runtime_components(self, 'job_store_available') if ok ~= true then - update_model_state(self, 'failed', err or 'job_runtime_start_failed') + fail_update_service(self, 'runtime_reconcile', err or 'job_runtime_start_failed', ev) error(err or 'job_runtime_start_failed', 0) end elseif self._job_store_dependency_required then @@ -1224,7 +1435,7 @@ handle_dependency_changed = function(self, ev) if dependency_effectively_available(self, 'artifact_store') then local ok, err = reconcile_runtime_components(self, 'artifact_store_available') if ok ~= true then - update_model_state(self, 'failed', err or 'artifact_store_reconcile_failed') + fail_update_service(self, 'runtime_reconcile', err or 'artifact_store_reconcile_failed', ev) error(err or 'artifact_store_reconcile_failed', 0) end elseif self._artifact_store_dependency_required then @@ -1267,7 +1478,7 @@ function M.run(scope, params) local manager_ep, merr = bind_manager(scope, params.conn, params) if merr then error(merr, 2) end - local config_watch, werr = open_config_watch(scope, params.conn, params) + local cfg_watch, werr = open_config_watch(scope, params.conn, params) if werr then error(werr, 2) end local job_dep_required = job_store_dependency_required(params) @@ -1346,8 +1557,8 @@ function M.run(scope, params) _active_runtime = nil, _manager_ep = manager_ep, manager_rx = manager_ep, - config_watch = config_watch, - config_rx = params.config_rx or config_watch, + config_watch = cfg_watch, + config_rx = params.config_rx or cfg_watch, publisher = nil, _publisher = nil, pending = {}, @@ -1390,7 +1601,15 @@ function M.run(scope, params) end local ok, err = reconcile_runtime_components(self, 'initial') - if ok ~= true then error(err or 'update runtime start failed', 2) end + if ok ~= true then + local reason = err or 'update_runtime_start_failed' + fail_update_service(self, 'runtime_reconcile', reason, { + kind = 'initial_runtime_reconcile', + status = 'failed', + primary = reason, + }) + error(reason, 2) + end return coordinator_loop(self) end diff --git a/src/shared/hash/xxhash32.lua b/src/shared/hash/xxhash32.lua index 6d28cd91d..59138b16e 100644 --- a/src/shared/hash/xxhash32.lua +++ b/src/shared/hash/xxhash32.lua @@ -6,10 +6,13 @@ -- transfer protocol use and tests without pulling in an external dependency. -- It is not a security primitive. -local ok_bit32, bit32_mod = pcall(require, 'bit32') local ok_bit, bit_mod = pcall(require, 'bit') +local ok_bit32, bit32_mod = pcall(require, 'bit32') -local bitops = ok_bit32 and bit32_mod or bit_mod +-- Prefer LuaJIT's native bit library when available. Some deployments also +-- provide a bit32 compatibility module; keeping bit first avoids backend +-- drift in the transfer checksum path. +local bitops = ok_bit and bit_mod or bit32_mod assert(bitops, 'shared.hash.xxhash32 requires bit32 or bit') local band = assert(bitops.band, 'bit library missing band') diff --git a/tests/integration/devhost/mcu_update_full_path_spec.lua b/tests/integration/devhost/mcu_update_full_path_spec.lua index 52bbe2b64..721ceccb8 100644 --- a/tests/integration/devhost/mcu_update_full_path_spec.lua +++ b/tests/integration/devhost/mcu_update_full_path_spec.lua @@ -530,7 +530,7 @@ local function start_ui(scope, bus, port, roots) service_id = 'ui', auth_opts = { users = { - tester = { password = 'test-password', principal = { kind = 'user', id = 'tester' } }, + tester = { password = 'test-password', principal = { kind = 'user', id = 'tester', roles = { 'admin' } } }, }, }, bus = bus, @@ -652,7 +652,8 @@ local function stop_instance(inst, reason) end end -local function run_http_upload(scope, port, body) +local function run_http_upload(scope, port, body, headers) + headers = headers or {} local driver = assert(http_driver_mod.new({ label = 'mcu-full-path-http-client' })) assert_true(driver:start(scope), 'HTTP upload client driver should start') local status, resp_body = fibers.perform(driver:run_op('mcu-full-path-upload', function () @@ -660,6 +661,9 @@ local function run_http_upload(scope, port, body) req.headers:upsert(':method', 'POST') req.headers:upsert('content-type', 'application/octet-stream') req.headers:upsert('content-length', tostring(#body)) + for k, v in pairs(headers) do + req.headers:upsert(k, tostring(v)) + end req:set_body(body) local headers, stream = assert(req:go(8)) local status = headers:get(':status') @@ -717,8 +721,18 @@ function T.ui_http_mcu_update_survives_fake_reboot_and_reconciles() log('waiting for initial canonical MCU software state') wait_component_software(cm5.conn, 'mcu-image-old', 'mcu-boot-1') + log('logging in through real HTTP JSON') + local login_status, login_body, login_decoded = run_http_json(root_scope, port, '/api/login', { + username = 'tester', + password = 'test-password', + }) + assert_eq(login_status, '200', login_body) + assert_not_nil(login_decoded and login_decoded.session, login_body) + local sid = login_decoded.session.id + assert_not_nil(sid, 'login should return a session id') + log('sending real HTTP upload') - local status, body = run_http_upload(root_scope, port, blob) + local status, body = run_http_upload(root_scope, port, blob, { ['x-session-id'] = sid }) assert_eq(status, '200', 'upload HTTP status ' .. tostring(status) .. ': ' .. tostring(body)) local decoded = assert(cjson.decode(body), body) assert_eq(decoded.status, 'ok') @@ -731,16 +745,6 @@ function T.ui_http_mcu_update_survives_fake_reboot_and_reconciles() return fake.staged and fake.staged.bytes == blob end, { timeout = 4.0 }), 'fake MCU should stage transferred artifact') - log('logging in through real HTTP JSON') - local login_status, login_body, login_decoded = run_http_json(root_scope, port, '/api/login', { - username = 'tester', - password = 'test-password', - }) - assert_eq(login_status, '200', login_body) - assert_not_nil(login_decoded and login_decoded.session, login_body) - local sid = login_decoded.session.id - assert_not_nil(sid, 'login should return a session id') - log('committing job through real HTTP JSON command route') local commit_status, commit_body, commit_decoded = run_http_json( root_scope, diff --git a/tests/run.lua b/tests/run.lua index 230bf3a70..0e1fc79fd 100644 --- a/tests/run.lua +++ b/tests/run.lua @@ -73,6 +73,7 @@ local files = { 'integration.devhost.device_public_seams_spec', 'unit.update.test_active_runtime', 'unit.update.test_architecture', + 'unit.update.test_artifact_store_update_adapters', 'unit.update.test_bundled_probe', 'unit.update.test_config', 'unit.update.test_events', diff --git a/tests/unit/device/test_catalogue.lua b/tests/unit/device/test_catalogue.lua index e198ad46b..733a6f1a2 100644 --- a/tests/unit/device/test_catalogue.lua +++ b/tests/unit/device/test_catalogue.lua @@ -102,7 +102,18 @@ function tests.test_fabric_stage_requires_target_and_rejects_receiver() cat, err = config.to_catalogue(cfg) assert_nil(err) assert_not_nil(cat.components.mcu.actions['stage-update']) + assert_eq(cat.components.mcu.actions['stage-update'].timeout, 15 * 60) end + +function tests.test_default_mcu_stage_update_has_long_timeout() + local cat = assert(config.to_catalogue(nil)) + local action = cat.components.mcu.actions['stage-update'] + assert_not_nil(action) + assert_eq(action.kind, 'fabric_stage') + assert_eq(action.chunk_size, 1024) + assert_eq(action.timeout, 15 * 60) +end + function tests.test_catalogue_material_comparison_is_stable_for_copies() local a = assert(config.to_catalogue(sample_config())) local b = catalogue.copy(a) diff --git a/tests/unit/device/test_phase2.lua b/tests/unit/device/test_phase2.lua index 397df89f4..cba578ed1 100644 --- a/tests/unit/device/test_phase2.lua +++ b/tests/unit/device/test_phase2.lua @@ -5,6 +5,7 @@ local op = require 'fibers.op' local cond = require 'fibers.cond' local mailbox = require 'fibers.mailbox' local sleep = require 'fibers.sleep' +local busmod = require 'bus' local config = require 'services.device.config' local catalogue = require 'services.device.catalogue' @@ -15,6 +16,10 @@ local observer = require 'services.device.observer' local topics = require 'services.device.topics' local projection = require 'services.device.projection' local component_mcu = require 'services.device.component_mcu' +local fabric_topics = require 'services.fabric.topics' +local fabric_bus_adapter = require 'services.fabric.bus_adapter' +local fabric_protocol = require 'services.fabric.protocol' +local fabric_session = require 'services.fabric.session' local tests = {} @@ -25,6 +30,12 @@ local function assert_false(v, msg) if v ~= false then fail(msg or ('expected fa local function assert_nil(v, msg) if v ~= nil then fail(msg or ('expected nil, got ' .. tostring(v))) end end local function assert_not_nil(v, msg) if v == nil then fail(msg or 'expected non-nil') end end +local function join_strings(items) + local out = {} + for i = 1, #(items or {}) do out[i] = tostring(items[i]) end + return table.concat(out, ',') +end + local function wait_for_signal(c, msg) local which = fibers.perform(fibers.named_choice{ ready = c:wait_op(), @@ -50,6 +61,39 @@ local function take_stage_source(params, expected) return source end +function tests.test_default_fabric_client_uses_public_transfer_manager_rpc() + local called + local conn = {} + function conn:call_op(topic, payload, opts) + called = { topic = topic, payload = payload, opts = opts } + return op.always({ ok = true, result = { ok = true, staged = true } }) + end + + fibers.run(function () + local client = service.default_fabric_client(conn) + assert_not_nil(client) + local result, err = fibers.perform(client:send_blob_op({ + job_id = 'job-1', + target = 'updater/main', + source_owner = { id = 'owner' }, + size = 123, + digest_alg = 'sha256', + digest = 'abc', + chunk_size = 1024, + meta = { kind = 'firmware' }, + }, { timeout = 3.0 })) + assert_true(result.ok, tostring(err)) + end) + + assert_not_nil(called) + assert_eq(table.concat(called.topic, '/'), table.concat(fabric_topics.transfer_manager_rpc('send-blob'), '/')) + assert_eq(called.payload.request_id, 'job-1') + assert_eq(called.payload.target, 'updater/main') + assert_eq(called.payload.chunk_size, 1024) + assert_eq(called.payload.timeout_s, 3.0) + assert_eq(called.opts.timeout, 3.0) +end + local function sample_config(display_name) return { schema = config.SCHEMA, @@ -638,6 +682,134 @@ function tests.test_fabric_stage_open_source_may_be_an_explicit_op() end) end +function tests.test_fabric_stage_opens_artifact_ref_through_store_bus() + fibers.run(function (scope) + local source = { id = 'src-artifact' } + local artifact = {} + function artifact:open_source_op() + return op.always(true, source) + end + local conn = { + call_op = function (_, topic, payload) + assert_eq(table.concat(topic, '/'), 'cap/artifact-store/main/rpc/open') + assert_eq(payload.artifact_ref, 'artifact-1') + return op.always({ ok = true, reason = artifact }, nil) + end, + } + local r = req({ artifact_ref = 'artifact-1', size = 512 }) + local client = { + send_blob_op = function (_, params) + take_stage_source(params, source) + assert_eq(params.request.artifact_ref, 'artifact-1') + return op.always({ ok = true, staged = true }) + end, + } + + local result = action_worker.run(scope, { + conn = conn, + request = r, + component_id = 'mcu', + action = 'stage-update', + request_id = 'r-artifact-ref', + action_spec = { kind = 'fabric_stage', target = 'updater/main', artifact_store = 'main' }, + fabric_client = client, + terminate_source = function () + fail('source should have been handed off, not terminated') + end, + }) + + assert_true(result.ok) + assert_not_nil(r.replied) + end) +end + +function tests.test_fabric_stage_artifact_ref_failure_terminates_opened_source_once() + local terminate_count = 0 + fibers.run(function (scope) + local source = { id = 'src-artifact-failed' } + local artifact = {} + function artifact:open_source_op() + return op.always(true, source) + end + local conn = { + call_op = function () + return op.always({ ok = true, reason = artifact }, nil) + end, + } + local r = req({ artifact_ref = 'artifact-1' }) + local client = { + send_blob_op = function () + return op.always(nil, 'stage_rejected') + end, + } + + local result = action_worker.run(scope, { + conn = conn, + request = r, + component_id = 'mcu', + action = 'stage-update', + request_id = 'r-artifact-ref-failed', + action_spec = { kind = 'fabric_stage', target = 'updater/main', artifact_store = 'main' }, + fabric_client = client, + terminate_source = function (v) + assert_eq(v, source) + terminate_count = terminate_count + 1 + return true + end, + }) + + assert_eq(result.ok, false) + assert_eq(r.failed, 'stage_rejected') + end) + assert_eq(terminate_count, 1) +end + +function tests.test_rpc_stage_opens_artifact_ref_and_terminates_after_reply() + local terminate_count = 0 + fibers.run(function (scope) + local source = { id = 'src-rpc-stage' } + local artifact = {} + function artifact:open_source_op() + return op.always(true, source) + end + local conn = { + call_op = function (_, topic, payload) + if table.concat(topic, '/') == 'cap/artifact-store/main/rpc/open' then + assert_eq(payload.artifact_ref, 'artifact-1') + return op.always({ ok = true, reason = artifact }, nil) + end + assert_eq(table.concat(topic, '/'), 'raw/host/updater/cap/updater/cm5/rpc/stage') + assert_eq(payload.artifact_ref, 'artifact-1') + assert_eq(payload.source, source) + return op.always({ accepted = true, staged = true }, nil) + end, + } + local r = req({ artifact_ref = 'artifact-1' }) + + local result = action_worker.run(scope, { + conn = conn, + request = r, + component_id = 'cm5', + action = 'stage-update', + request_id = 'r-rpc-artifact-ref', + action_spec = { + kind = 'rpc', + call_topic = { 'raw', 'host', 'updater', 'cap', 'updater', 'cm5', 'rpc', 'stage' }, + artifact_store = 'main', + }, + terminate_source = function (v) + assert_eq(v, source) + terminate_count = terminate_count + 1 + return true + end, + }) + + assert_true(result.ok) + assert_not_nil(r.replied) + end) + assert_eq(terminate_count, 1) +end + function tests.test_fabric_stage_timeout_cancels_child_stage_and_terminates_unhanded_source() local terminate_count = 0 @@ -877,6 +1049,235 @@ local function fake_live_conn() return c end +local function recv_observation(rx, timeout_s) + local which, ev, err = fibers.perform(fibers.named_choice { + event = rx:recv_op(), + timeout = sleep.sleep_op(timeout_s or 1), + }) + if which == 'timeout' then return nil, 'timeout' end + return ev, err +end + +local function assert_no_observation(rx, timeout_s) + local ev, err = recv_observation(rx, timeout_s or 0.02) + assert_nil(ev, 'unexpected observation: ' .. tostring(ev and ev.tag or err)) +end + +local function wait_for_watches(conn, count) + local deadline = fibers.now() + 1 + while #conn.watches < count and fibers.now() < deadline do + fibers.perform(sleep.sleep_op(0.001)) + end + assert_eq(#conn.watches, count, 'observer did not open expected retained watches') +end + +local function watches_by_fact(conn) + local out = {} + for _, watch in ipairs(conn.watches) do + out[watch.topic[#watch.topic]] = watch + end + return out +end + +local function send_watch_event(watch, ev) + assert_not_nil(watch, 'missing retained watch') + assert_true(fibers.perform(watch.tx:send_op(ev))) +end + +local function fabric_ctx(gen, sid) + return fabric_session.new_session_context { + proto = fabric_protocol.PROTO, + link_id = 'mcu-uart0', + link_generation = 4, + session_generation = gen or 9, + peer_sid = sid or 'mcu-sid-1', + peer_node = 'mcu', + } +end + +local function start_observer_worker(scope, conn, tx, component) + local ok, err = scope:spawn(function () + observer.run(scope, { + conn = conn, + tx = tx, + generation = 1, + component_id = 'mcu', + component = component, + }) + end) + assert_true(ok, err) +end + +function tests.test_observer_retained_replay_done_does_not_overwrite_retained_fact() + fibers.run(function () + local st, _rep, primary = fibers.run_scope(function (scope) + local conn = fake_live_conn() + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + start_observer_worker(scope, conn, tx, { + module = component_mcu, + facts = { + software = { watch_topic = { 'raw', 'member', 'mcu', 'state', 'software' } }, + }, + }) + wait_for_watches(conn, 1) + + send_watch_event(conn.watches[1], { + op = 'retain', + payload = { image_id = 'mcu-dev-15.0', boot_id = 'boot-new' }, + }) + local ev = assert(recv_observation(rx)) + assert_eq(ev.tag, 'fact_retained') + assert_eq(ev.fact, 'software') + assert_eq(ev.payload.image_id, 'mcu-dev-15.0') + assert_eq(ev.payload.boot_id, 'boot-new') + + send_watch_event(conn.watches[1], { op = 'replay_done' }) + assert_no_observation(rx, 0.02) + scope:cancel('test_done') + end) + assert_eq(st, 'cancelled', tostring(primary)) + assert_eq(primary, 'test_done') + end) +end + +function tests.test_observer_replay_done_without_retained_fact_does_not_refresh_required_freshness() + fibers.run(function () + local st, _rep, primary = fibers.run_scope(function (scope) + local conn = fake_live_conn() + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + start_observer_worker(scope, conn, tx, { + module = component_mcu, + required_facts = { 'software' }, + observe_opts = { stale_after_s = 0.05 }, + facts = { + software = { watch_topic = { 'raw', 'member', 'mcu', 'state', 'software' } }, + }, + }) + wait_for_watches(conn, 1) + fibers.perform(sleep.sleep_op(0.04)) + + send_watch_event(conn.watches[1], { op = 'replay_done' }) + local ev, err = recv_observation(rx, 0.04) + assert_not_nil(ev, err) + assert_eq(ev.tag, 'source_down') + assert_eq(ev.reason, 'stale') + scope:cancel('test_done') + end) + assert_eq(st, 'cancelled', tostring(primary)) + assert_eq(primary, 'test_done') + end) +end + +function tests.test_observer_mcu_critical_replay_metadata_does_not_overwrite_facts() + fibers.run(function () + local st, _rep, primary = fibers.run_scope(function (scope) + local cat = assert(config.to_catalogue({ + schema = config.SCHEMA, + components = { + mcu = { + subtype = 'mcu', + required_facts = { 'software', 'updater', 'health' }, + facts = { + software = topics.raw_member_state('mcu', 'software'), + updater = topics.raw_member_state('mcu', 'updater'), + health = topics.raw_member_state('mcu', 'health'), + }, + }, + }, + })) + local conn = fake_live_conn() + local tx, rx = mailbox.new(16, { full = 'reject_newest' }) + local m = model_mod.new() + m:apply_catalogue(1, cat) + start_observer_worker(scope, conn, tx, cat.components.mcu) + wait_for_watches(conn, 3) + + local watches = watches_by_fact(conn) + local origin = { + extra = { + fabric = { + kind = 'remote_retain', + link_id = 'mcu-uart0', + link_generation = 5, + session = { + peer_sid = 'mcu-sid-1', + session_generation = 11, + }, + }, + }, + } + send_watch_event(watches.software, { + op = 'retain', + payload = { image_id = 'mcu-dev-15.0', boot_id = 'boot-new' }, + origin = origin, + }) + send_watch_event(watches.updater, { + op = 'retain', + payload = { state = 'idle', job_id = 'job-1' }, + origin = origin, + }) + send_watch_event(watches.health, { + op = 'retain', + payload = { state = 'ok' }, + origin = origin, + }) + + for _ = 1, 3 do + local ev = assert(recv_observation(rx)) + assert_eq(ev.tag, 'fact_retained') + assert_true(m:apply_observation(1, ev)) + end + + send_watch_event(watches.software, { op = 'replay_done' }) + send_watch_event(watches.updater, { op = 'replay_done' }) + send_watch_event(watches.health, { op = 'replay_done' }) + assert_no_observation(rx, 0.03) + + local rec = m:snapshot().components.mcu + assert_eq(rec.raw_facts.software.image_id, 'mcu-dev-15.0') + assert_eq(rec.raw_facts.software.boot_id, 'boot-new') + assert_eq(rec.raw_facts.updater.state, 'idle') + assert_eq(rec.raw_facts.health, 'ok') + local payloads = projection.component_payloads('mcu', rec, 12) + assert_eq(payloads.component.control_plane.facts.software.fabric.peer_sid, 'mcu-sid-1') + assert_eq(payloads.component.control_plane.facts.updater.fabric.session_generation, 11) + assert_eq(payloads.component.control_plane.facts.health.fabric.link_id, 'mcu-uart0') + scope:cancel('test_done') + end) + assert_eq(st, 'cancelled', tostring(primary)) + assert_eq(primary, 'test_done') + end) +end + +function tests.test_observer_unretain_and_unknown_retained_watch_events_are_strict() + fibers.run(function () + local st, _rep, primary = fibers.run_scope(function (scope) + local conn = fake_live_conn() + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + start_observer_worker(scope, conn, tx, { + facts = { + software = { watch_topic = { 'raw', 'member', 'mcu', 'state', 'software' } }, + }, + }) + wait_for_watches(conn, 1) + + send_watch_event(conn.watches[1], { op = 'unretain' }) + local ev = assert(recv_observation(rx)) + assert_eq(ev.tag, 'fact_unretained') + assert_eq(ev.fact, 'software') + + send_watch_event(conn.watches[1], { op = 'replace', payload = { image_id = 'bad' } }) + ev = assert(recv_observation(rx)) + assert_eq(ev.tag, 'source_down') + assert_eq(ev.reason, 'unknown_fact_event:software:replace') + assert_no_observation(rx, 0.02) + scope:cancel('test_done') + end) + assert_eq(st, 'cancelled', tostring(primary)) + assert_eq(primary, 'test_done') + end) +end + function tests.test_config_replacement_with_live_observer_terminates_raw_watch() fibers.run(function (scope) local conn = fake_live_conn() @@ -1095,6 +1496,164 @@ function tests.test_mcu_fault_availability_is_stored_and_projected() end) end +function tests.test_mcu_control_plane_projection_preserves_fabric_fact_origin() + fibers.run(function () + local cat = assert(config.to_catalogue({ + schema = config.SCHEMA, + components = { + mcu = { + subtype = 'mcu', + required_facts = { 'software', 'updater', 'health' }, + facts = { + software = topics.raw_member_state('mcu', 'software'), + updater = topics.raw_member_state('mcu', 'updater'), + health = topics.raw_member_state('mcu', 'health'), + }, + }, + }, + })) + local origin = { + extra = { + fabric = { + kind = 'remote_retain', + link_id = 'mcu-uart0', + link_generation = 4, + session = { + peer_sid = 'mcu-sid-1', + session_generation = 9, + }, + }, + }, + } + local m = model_mod.new() + m:apply_catalogue(1, cat) + m:apply_observation(1, { + component = 'mcu', + tag = 'fact_retained', + fact = 'software', + payload = { image_id = 'img-old', boot_id = 'boot-old' }, + origin = origin, + }) + m:apply_observation(1, { + component = 'mcu', + tag = 'fact_retained', + fact = 'updater', + payload = { state = 'ready' }, + origin = origin, + }) + m:apply_observation(1, { + component = 'mcu', + tag = 'fact_retained', + fact = 'health', + payload = { state = 'ok' }, + origin = origin, + }) + + local payloads = projection.component_payloads('mcu', m:snapshot().components.mcu, 10) + local cp = payloads.component.control_plane + assert_not_nil(cp) + assert_eq(cp.facts.software.fabric.peer_sid, 'mcu-sid-1') + assert_eq(cp.facts.updater.fabric.session_generation, 9) + assert_eq(cp.facts.health.fabric.link_id, 'mcu-uart0') + assert_eq(payloads.component.software.boot_id, 'boot-old') + assert_eq(payloads.component.updater.state, 'ready') + assert_eq(payloads.component.health, 'ok') + + m:apply_observation(1, { + component = 'mcu', + tag = 'fact_unretained', + fact = 'health', + origin = origin, + }) + payloads = projection.component_payloads('mcu', m:snapshot().components.mcu, 11) + assert_eq(payloads.component.control_plane.facts.health.seen, false) + assert_nil(payloads.component.control_plane.facts.health.fabric) + end) +end + +function tests.test_mcu_control_plane_real_fabric_retained_import_replay_has_origin() + fibers.run(function () + local st, _rep, primary = fibers.run_scope(function (scope) + local bus = busmod.new() + local import_conn = bus:connect() + local observe_conn = bus:connect() + local check_conn = bus:connect() + local rt = fabric_bus_adapter.local_runtime(scope, import_conn, { + link_id = 'mcu-uart0', + link_generation = 4, + }) + local session = fabric_ctx(12, 'mcu-sid-real') + local ready_watch = assert(check_conn:watch_retained( + topics.raw_member_state('mcu', 'health'), + { replay = false, queue_len = 4, full = 'reject_newest' } + )) + local imports = { + { + topic = topics.raw_member_state('mcu', 'software'), + payload = { image_id = 'mcu-dev-15.7', boot_id = 'boot-real' }, + }, + { + topic = topics.raw_member_state('mcu', 'updater'), + payload = { state = 'ready' }, + }, + { + topic = topics.raw_member_state('mcu', 'health'), + payload = { state = 'ok' }, + }, + } + for _, item in ipairs(imports) do + assert_true(rt.command_tx:send({ + kind = 'retain', + topic = item.topic, + payload = item.payload, + session = session, + origin_kind = 'remote_retain', + })) + end + + local ready_ev = fibers.perform(ready_watch:recv_op()) + assert_eq(ready_ev.op, 'retain') + assert_eq(ready_ev.origin.extra.fabric.session.peer_sid, 'mcu-sid-real') + assert_eq(ready_ev.origin.extra.fabric.session.session_generation, 12) + check_conn:unwatch_retained(ready_watch) + + local cat = catalogue.build(nil) + local m = model_mod.new() + m:apply_catalogue(1, cat) + local tx, rx = mailbox.new(16, { full = 'reject_newest' }) + start_observer_worker(scope, observe_conn, tx, cat.components.mcu) + + local seen = {} + while not (seen.software and seen.updater and seen.health) do + local ev = assert(recv_observation(rx, 0.25)) + if ev.tag == 'fact_retained' then + assert_eq(ev.origin.extra.fabric.session.peer_sid, 'mcu-sid-real') + m:apply_observation(1, ev) + seen[ev.fact] = true + end + end + + local payloads = projection.component_payloads('mcu', m:snapshot().components.mcu, 20) + local cp = payloads.component.control_plane + assert_true(cp.ready, 'control plane should be ready from real Fabric retained replay; missing_origin=' + .. join_strings(cp.status.missing_origin_facts) + .. ' missing_facts=' .. join_strings(cp.status.missing_facts)) + assert_true(cp.status.ready, 'control plane status should be ready') + assert_eq(#cp.status.missing_origin_facts, 0) + assert_eq(cp.status.peer_sid, 'mcu-sid-real') + assert_eq(cp.status.session_generation, 12) + assert_eq(cp.status.link_id, 'mcu-uart0') + assert_eq(cp.status.link_generation, 4) + assert_eq(cp.facts.software.fabric.peer_sid, 'mcu-sid-real') + assert_eq(cp.facts.updater.fabric.session_generation, 12) + assert_eq(cp.facts.health.fabric.link_id, 'mcu-uart0') + scope:cancel('test_done') + end) + assert_eq(st, 'cancelled', tostring(primary)) + assert_eq(primary, 'test_done') + end) +end + function tests.test_host_missing_software_degrades_rather_than_disappears() fibers.run(function () local cat = assert(config.to_catalogue({ @@ -1137,6 +1696,17 @@ function tests.test_component_module_ignores_non_contract_mcu_fact_aliases() assert_nil(updater.last_error) end +function tests.test_component_module_preserves_mcu_updater_boot_buy_rc() + local updater = assert(component_mcu.normalise_fact('updater', { + state = 'failed', + last_error = 'abupdate_buy_failed', + boot_buy_rc = -42, + })) + assert_eq(updater.state, 'failed') + assert_eq(updater.last_error, 'abupdate_buy_failed') + assert_eq(updater.boot_buy_rc, -42) +end + function tests.test_stale_action_completion_is_archived_and_clears_pending_without_mutating_model() fibers.run(function (scope) diff --git a/tests/unit/device/test_publisher.lua b/tests/unit/device/test_publisher.lua index fd21f3dab..72cb621bb 100644 --- a/tests/unit/device/test_publisher.lua +++ b/tests/unit/device/test_publisher.lua @@ -8,6 +8,7 @@ local topics = require 'services.device.topics' local tests = {} local function fail(msg) error(msg or 'assertion failed', 2) end local function assert_eq(a, b, msg) if a ~= b then fail(msg or ('expected ' .. tostring(b) .. ', got ' .. tostring(a))) end end +local function assert_nil(v, msg) if v ~= nil then fail(msg or ('expected nil, got ' .. tostring(v))) end end local function assert_true(v, msg) if v ~= true then fail(msg or ('expected true, got ' .. tostring(v))) end end local function key(topic) return table.concat(topic, '/') end @@ -44,4 +45,28 @@ function tests.test_publish_component_and_summary_are_immediate_effects() assert_eq(conn.retained['state/device/components'].counts.total, 1) end +function tests.test_component_software_projection_recovers_after_raw_unretain_and_retain() + local cat = assert(config.to_catalogue({ + schema = config.SCHEMA, + components = { + mcu = { subtype = 'mcu', facts = { software = topics.raw_member_state('mcu', 'software') } }, + }, + })) + local m = model_mod.new() + m:apply_catalogue(1, cat) + m:apply_observation(1, { component = 'mcu', tag = 'fact_retained', fact = 'software', payload = { image_id = 'old', boot_id = 'boot-old' } }) + m:apply_observation(1, { component = 'mcu', tag = 'fact_unretained', fact = 'software' }) + + local conn = fake_conn() + local ok, err = publisher.publish_component_now(conn, m:snapshot(), 'mcu', { now = function () return 123 end }) + assert_true(ok, err) + assert_nil(conn.retained['state/device/component/mcu/software'].boot_id) + + m:apply_observation(1, { component = 'mcu', tag = 'fact_retained', fact = 'software', payload = { image_id = 'new', version = '15.5', boot_id = 'boot-new' } }) + ok, err = publisher.publish_component_now(conn, m:snapshot(), 'mcu', { now = function () return 124 end }) + assert_true(ok, err) + assert_eq(conn.retained['state/device/component/mcu/software'].image_id, 'new') + assert_eq(conn.retained['state/device/component/mcu/software'].boot_id, 'boot-new') +end + return tests diff --git a/tests/unit/fabric/test_bridge.lua b/tests/unit/fabric/test_bridge.lua index b1d19c298..afe9edbd8 100644 --- a/tests/unit/fabric/test_bridge.lua +++ b/tests/unit/fabric/test_bridge.lua @@ -34,6 +34,33 @@ local function try_recv(rx) return queue.try_recv_now(rx) end +local function topic_key(topic) + return table.concat(topic or {}, '/') +end + +local function recv_topic_set(rx, n, label) + local out = {} + for i = 1, n do + local cmd = recv_with_timeout(rx, (label or 'topic command') .. ' ' .. tostring(i)) + out[topic_key(cmd.topic)] = cmd + end + return out +end + +local function recv_state_with(rx, pred, label, timeout) + timeout = timeout or 0.25 + local deadline = fibers.now() + timeout + while fibers.now() < deadline do + local item = try_recv(rx) + if item ~= nil then + if pred(item) then return item end + else + fibers.perform(sleep.sleep_op(0.001)) + end + end + fail('timed out waiting for ' .. tostring(label or 'state')) +end + local function ctx(gen, sid) return session.new_session_context { link_id = 'link-a', @@ -178,6 +205,16 @@ function tests.test_remote_retained_publish_emits_bus_command_and_updates_import assert_eq(cmd.topic[4], 'state') assert_eq(cmd.topic[5], 'software') assert_eq(cmd.session.peer_sid, 'sid-1') + local state = recv_state_with(h.state_rx, function (ev) + return ev.kind == 'component_snapshot' + and ev.snapshot + and ev.snapshot.imported_retained_count == 1 + and ev.snapshot.last_imported_topic == 'raw/member/mcu/state/software' + end, 'bridge import status') + assert_eq(state.snapshot.peer_sid, 'sid-1') + assert_eq(state.snapshot.session_generation, 1) + assert_not_nil(state.snapshot.last_imported_at) + assert_eq(state.snapshot.imported_retained_topics[1], 'raw/member/mcu/state/software') close_bridge(h) end) end @@ -233,6 +270,119 @@ function tests.test_peer_session_drop_clears_imported_retained_state_by_bus_comm assert_eq(cmd.kind, 'unretain') assert_eq(cmd.topic[5], 'software') assert_eq(cmd.session.peer_sid, 'sid-1') + local state = recv_state_with(h.state_rx, function (ev) + return ev.kind == 'component_snapshot' + and ev.snapshot + and ev.snapshot.last_clear_reason == 'bad_frame_limit' + end, 'bridge clear status') + assert_eq(state.snapshot.last_clear_count, 1) + assert_eq(state.snapshot.imported_retained_count, 0) + assert_eq(#state.snapshot.imported_retained_topics, 0) + close_bridge(h) + end) +end + +function tests.test_peer_session_drop_then_reimport_restores_software_import_status() + fibers.run(function (scope) + local h = start_bridge(scope) + assert_true(h.session_tx:send(peer_session_event(1, 'sid-1'))) + assert_true(h.session_tx:send(rpc_event(assert(protocol.pub({ 'state', 'self', 'software' }, { image_id = 'old' }, true)), 1, 'sid-1'))) + recv_with_timeout(h.bus_rx, 'retain command') + assert_true(h.session_tx:send(peer_drop_event(1, 'sid-1', 'bad_frame_limit'))) + recv_with_timeout(h.bus_rx, 'clear command') + + assert_true(h.session_tx:send(peer_session_event(2, 'sid-2'))) + assert_true(h.session_tx:send(rpc_event(assert(protocol.pub({ 'state', 'self', 'software' }, { + image_id = 'new', + version = '15.2', + boot_id = 'boot-new', + }, true)), 2, 'sid-2'))) + local cmd = recv_with_timeout(h.bus_rx, 'reimport command') + assert_eq(cmd.kind, 'retain') + assert_eq(cmd.topic[5], 'software') + assert_eq(cmd.payload.image_id, 'new') + assert_eq(cmd.session.peer_sid, 'sid-2') + + local state = recv_state_with(h.state_rx, function (ev) + return ev.kind == 'component_snapshot' + and ev.snapshot + and ev.snapshot.peer_sid == 'sid-2' + and ev.snapshot.imported_retained_count == 1 + and ev.snapshot.imported_retained_topics[1] == 'raw/member/mcu/state/software' + end, 'bridge reimport status') + assert_eq(state.snapshot.session_generation, 2) + close_bridge(h) + end) +end + +function tests.test_peer_session_drop_then_reimport_restores_critical_import_status() + fibers.run(function (scope) + local h = start_bridge(scope) + assert_true(h.session_tx:send(peer_session_event(1, 'sid-1'))) + assert_true(h.session_tx:send(rpc_event(assert(protocol.pub({ 'state', 'self', 'software' }, { image_id = 'old' }, true)), 1, 'sid-1'))) + assert_true(h.session_tx:send(rpc_event(assert(protocol.pub({ 'state', 'self', 'updater' }, { state = 'rebooting' }, true)), 1, 'sid-1'))) + assert_true(h.session_tx:send(rpc_event(assert(protocol.pub({ 'state', 'self', 'health' }, { state = 'ok' }, true)), 1, 'sid-1'))) + recv_topic_set(h.bus_rx, 3, 'initial critical retain') + + assert_true(h.session_tx:send(peer_drop_event(1, 'sid-1', 'bad_frame_limit'))) + local cleared = recv_topic_set(h.bus_rx, 3, 'critical clear') + assert_not_nil(cleared['raw/member/mcu/state/software']) + assert_not_nil(cleared['raw/member/mcu/state/updater']) + assert_not_nil(cleared['raw/member/mcu/state/health']) + + local cleared_state = recv_state_with(h.state_rx, function (ev) + return ev.kind == 'component_snapshot' + and ev.snapshot + and ev.snapshot.last_clear_reason == 'bad_frame_limit' + and ev.snapshot.last_clear_count == 3 + and ev.snapshot.imported_retained_count == 0 + end, 'critical clear status') + assert_eq(#cleared_state.snapshot.imported_retained_topics, 0) + + assert_true(h.session_tx:send(peer_session_event(2, 'sid-2'))) + assert_true(h.session_tx:send(rpc_event(assert(protocol.pub({ 'state', 'self', 'runtime', 'memory' }, { + free = 1234, + }, true)), 2, 'sid-2'))) + local telemetry = recv_with_timeout(h.bus_rx, 'non-critical reimport') + assert_eq(telemetry.kind, 'retain') + assert_eq(table.concat(telemetry.topic, '/'), 'raw/member/mcu/state/runtime/memory') + local telemetry_state = recv_state_with(h.state_rx, function (ev) + return ev.kind == 'component_snapshot' + and ev.snapshot + and ev.snapshot.peer_sid == 'sid-2' + and ev.snapshot.imported_retained_count == 1 + and ev.snapshot.imported_retained_topics[1] == 'raw/member/mcu/state/runtime/memory' + end, 'non-critical reimport status') + assert_eq(telemetry_state.snapshot.session_generation, 2) + + assert_true(h.session_tx:send(rpc_event(assert(protocol.pub({ 'state', 'self', 'software' }, { + image_id = 'new', + version = '15.2', + boot_id = 'boot-new', + }, true)), 2, 'sid-2'))) + assert_true(h.session_tx:send(rpc_event(assert(protocol.pub({ 'state', 'self', 'updater' }, { state = 'running' }, true)), 2, 'sid-2'))) + assert_true(h.session_tx:send(rpc_event(assert(protocol.pub({ 'state', 'self', 'health' }, { state = 'ok' }, true)), 2, 'sid-2'))) + local imported = recv_topic_set(h.bus_rx, 3, 'critical reimport') + assert_eq(imported['raw/member/mcu/state/software'].payload.image_id, 'new') + assert_eq(imported['raw/member/mcu/state/software'].session.peer_sid, 'sid-2') + assert_eq(imported['raw/member/mcu/state/updater'].payload.state, 'running') + assert_eq(imported['raw/member/mcu/state/health'].payload.state, 'ok') + + local state = recv_state_with(h.state_rx, function (ev) + return ev.kind == 'component_snapshot' + and ev.snapshot + and ev.snapshot.peer_sid == 'sid-2' + and ev.snapshot.session_generation == 2 + and ev.snapshot.imported_retained_count == 4 + end, 'critical reimport status') + local topics = {} + for _, topic in ipairs(state.snapshot.imported_retained_topics or {}) do + topics[topic] = true + end + assert_true(topics['raw/member/mcu/state/runtime/memory'], 'runtime retained import missing') + assert_true(topics['raw/member/mcu/state/software'], 'software retained import missing') + assert_true(topics['raw/member/mcu/state/updater'], 'updater retained import missing') + assert_true(topics['raw/member/mcu/state/health'], 'health retained import missing') close_bridge(h) end) end @@ -295,6 +445,62 @@ function tests.test_outbound_call_sends_call_frame_and_routes_remote_reply_to_re end) end +function tests.test_outbound_prepare_call_updates_bridge_diagnostics() + fibers.run(function (scope) + local local_topic = { 'raw', 'member', 'mcu', 'cap', 'updater', 'main', 'rpc', 'prepare-update' } + local remote_topic = { 'cap', 'self', 'updater', 'main', 'rpc', 'prepare-update' } + local h = start_bridge(scope, { + outbound_call_rules = { + { local_topic = local_topic, remote_topic = remote_topic }, + }, + }) + assert_true(h.session_tx:send(peer_session_event(3, 'mcu-sid-1'))) + local req = fake_request() + assert_true(h.local_tx:send({ + kind = 'call', + id = 'prepare-1', + topic = local_topic, + payload = { + job_id = 'job-prepare', + expected_image_id = 'mcu-dev-15.3', + }, + request = req, + })) + + local frame = recv_with_timeout(h.rpc_rx, 'outbound prepare call').frame + assert_eq(frame.type, 'call') + assert_eq(frame.id, 'prepare-1') + assert_eq(topic_key(frame.topic), topic_key(remote_topic)) + + local sent = recv_state_with(h.state_rx, function (ev) + local diag = ev.snapshot and ev.snapshot.last_outbound_call + return diag + and diag.event == 'sent' + and diag.call_id == 'prepare-1' + and diag.job_id == 'job-prepare' + end, 'outbound prepare sent diag') + local diag = sent.snapshot.last_outbound_call + assert_eq(diag.local_topic, topic_key(local_topic)) + assert_eq(diag.remote_topic, topic_key(remote_topic)) + assert_eq(diag.expected_image_id, 'mcu-dev-15.3') + assert_eq(diag.peer_sid, 'mcu-sid-1') + assert_eq(diag.session_generation, 3) + assert_eq(diag.frame_sent, true) + + local reply = assert(protocol.reply('prepare-1', true, { accepted = true }, nil)) + assert_true(h.session_tx:send(rpc_event(reply, 3, 'mcu-sid-1'))) + local done = recv_state_with(h.state_rx, function (ev) + local d = ev.snapshot and ev.snapshot.last_outbound_call + return d and d.call_id == 'prepare-1' and d.event == 'reply_ok' + end, 'outbound prepare reply diag') + assert_eq(done.snapshot.last_outbound_call.reply_routed, true) + local deadline = fibers.now() + 0.25 + while req.resolved == false and fibers.now() < deadline do fibers.perform(sleep.sleep_op(0.001)) end + assert_eq(req.resolved, 'reply') + close_bridge(h) + end) +end + function tests.test_bus_adapter_remote_commands_use_local_bus_methods() fibers.run(function (scope) local command_tx, command_rx = mailbox.new(8, { full = 'reject_newest' }) diff --git a/tests/unit/fabric/test_config.lua b/tests/unit/fabric/test_config.lua index 36d2be71e..4f43e8c96 100644 --- a/tests/unit/fabric/test_config.lua +++ b/tests/unit/fabric/test_config.lua @@ -1,11 +1,36 @@ local cfg = require 'services.fabric.config' +local cjson = require 'cjson.safe' local T = {} +local function read_project_file(rel) + local candidates = { + '../' .. rel, + rel, + 'devicecode-lua/' .. rel, + } + for i = 1, #candidates do + local f = io.open(candidates[i], 'rb') + if f then + local text = f:read('*a') + f:close() + return text + end + end + return nil +end + +local function decode_json(text) + local doc, err = cjson.decode(text) + assert(doc ~= nil, tostring(err)) + return doc +end + function T.compile_builds_canonical_runtime_plan() local compiled, err = cfg.compile({ schema = 'devicecode.config/fabric/1', local_node = 'host-a', + trace_io = true, links = { { id = 'uart0', @@ -15,6 +40,7 @@ function T.compile_builds_canonical_runtime_plan() source = 'uart', id = 'main', terminator = '\n', + cap_wait_timeout_s = 12.5, open_opts = { baud = 115200 }, }, session = { @@ -60,6 +86,9 @@ function T.compile_builds_canonical_runtime_plan() }, }, }, + writer = { + flush_each = false, + }, transfer = { chunk_size = 8192, timeout_s = 22.0, @@ -69,6 +98,7 @@ function T.compile_builds_canonical_runtime_plan() }) assert(compiled ~= nil, tostring(err)) + assert(compiled.service.trace_io == true) assert(compiled.service.local_node == 'host-a') assert(#compiled.links == 1) @@ -80,6 +110,7 @@ function T.compile_builds_canonical_runtime_plan() assert(link.transport.class == 'uart') assert(link.transport.id == 'main') assert(link.transport.open_opts.baud == 115200) + assert(link.transport.cap_wait_timeout_s == 12.5) assert(link.session.local_node == 'host-a') assert(link.session.hello_interval_s == 1.5) @@ -99,8 +130,10 @@ function T.compile_builds_canonical_runtime_plan() assert(link.bridge.max_inbound_calls == 7) assert(link.bridge.call_timeout_s == 4.5) + assert(link.writer.flush_each == false) assert(link.transfer.chunk_size == 8192) assert(link.transfer.timeout_s == 22.0) + assert(link.reader.bad_frame_quiet_s == cfg.DEFAULTS.reader.bad_frame_quiet_s) assert(compiled.routing.by_link_id['uart0'] == link) assert(compiled.routing.by_peer_id['peer-main'][1] == link) @@ -131,8 +164,32 @@ function T.compile_applies_defaults() assert(link.session.liveness_timeout_s == cfg.DEFAULTS.session.liveness_timeout_s) assert(link.bridge.max_pending_calls == cfg.DEFAULTS.bridge.max_pending_calls) + assert(link.writer.flush_each == cfg.DEFAULTS.writer.flush_each) assert(link.transfer.chunk_size == cfg.DEFAULTS.transfer.chunk_size) assert(link.transfer.timeout_s == cfg.DEFAULTS.transfer.timeout_s) + assert(compiled.service.trace_io == cfg.DEFAULTS.trace_io) +end + +function T.compile_accepts_trace_io_and_rejects_unknown_root_key() + local compiled, err = cfg.compile({ + schema = 'devicecode.config/fabric/1', + trace_io = false, + links = { + { id = 'uart0', peer_id = 'peer-a' }, + }, + }) + assert(compiled ~= nil, tostring(err)) + assert(compiled.service.trace_io == false) + + local bad, bad_err = cfg.compile({ + schema = 'devicecode.config/fabric/1', + trace_iio = true, + links = { + { id = 'uart0', peer_id = 'peer-a' }, + }, + }) + assert(bad == nil) + assert(tostring(bad_err):match('unknown field: trace_iio')) end function T.compile_rejects_wrong_schema() @@ -259,4 +316,47 @@ function T.compile_rejects_profile_field() assert(tostring(err):match('unknown field: profile')) end +function T.compile_rejects_invalid_transport_cap_wait_timeout() + local compiled, err = cfg.compile({ + schema = 'devicecode.config/fabric/1', + links = { + { + id = 'uart0', + peer_id = 'peer-a', + transport = { + cap_wait_timeout_s = 0, + }, + }, + }, + }) + assert(compiled == nil) + assert(tostring(err):match('transport.cap_wait_timeout_s must be a positive finite number')) +end + +function T.bigbox_mcu_fabric_link_uses_hal_uart_raw_source() + local text = assert(read_project_file('src/configs/bigbox-v1-cm-2.json')) + local doc = decode_json(text) + + local uart_ids = {} + for _, port in ipairs(doc.hal.data.uart.serial_ports or {}) do + uart_ids[port.id] = true + end + + local link + for _, candidate in ipairs(doc.fabric.data.links or {}) do + if candidate.id == 'mcu-uart0' then + link = candidate + break + end + end + + assert(link ~= nil, 'bigbox config must define fabric link mcu-uart0') + assert(link.peer_id == 'mcu') + assert(link.transport.source == 'uart_' .. link.transport.id) + assert(link.transport.class == 'uart') + assert(link.transport.cap_wait_timeout_s >= 30) + assert(link.transport.open_opts == nil, 'uart line settings belong in HAL uart config, not fabric open_opts') + assert(uart_ids[link.transport.id] == true, 'fabric transport id must match configured HAL uart') +end + return T diff --git a/tests/unit/fabric/test_fabric.lua b/tests/unit/fabric/test_fabric.lua index dd8fe270a..247f27b99 100644 --- a/tests/unit/fabric/test_fabric.lua +++ b/tests/unit/fabric/test_fabric.lua @@ -1,6 +1,8 @@ -- tests/unit/fabric/test_fabric.lua local fibers = require 'fibers' +local op = require 'fibers.op' +local sleep = require 'fibers.sleep' local mailbox = require 'fibers.mailbox' local fabric = require 'services.fabric' @@ -122,6 +124,41 @@ function tests.test_composed_link_wires_session_bridge_transfer_and_writer_witho end) end +function tests.test_hal_transport_drain_input_is_cancel_bounded() + fibers.run(function () + local chunks = { 'abc', 'def' } + local i = 0 + local session = { + read_line_op = function () return fibers.always(nil, 'eof') end, + write_op = function () return fibers.always(0, nil) end, + read_some_op = function () + return op.guard(function () + i = i + 1 + if chunks[i] ~= nil then + return fibers.always(chunks[i], nil) + end + return sleep.sleep_op(1) + end) + end, + terminate = function () return true, nil end, + } + + local wrapped = assert(hal_transport.wrap_transport(session)) + local result, err = fibers.perform(wrapped:drain_input_op({ + max_bytes = 64, + total_s = 0.050, + quiet_s = 0.005, + read_s = 0.002, + chunk_size = 8, + })) + + assert_eq(err, nil) + assert_eq(result.bytes, 6) + assert_eq(result.reads, 2) + assert_eq(result.reason, 'quiet') + end) +end + function tests.test_public_fabric_run_uses_supplied_link_runner() fibers.run(function () local called = false diff --git a/tests/unit/fabric/test_io.lua b/tests/unit/fabric/test_io.lua index 774099658..bf9680b6c 100644 --- a/tests/unit/fabric/test_io.lua +++ b/tests/unit/fabric/test_io.lua @@ -6,8 +6,11 @@ local sleep = require 'fibers.sleep' local mailbox = require 'fibers.mailbox' local io_mod = require 'services.fabric.io' +local hal_transport = require 'services.fabric.hal_transport' local link = require 'services.fabric.link' +local protocol = require 'services.fabric.protocol' local queue = require 'devicecode.support.queue' +local xxhash32 = require 'shared.hash.xxhash32' local tests = {} @@ -54,6 +57,43 @@ local function frames_reader(frames) end end +local function read_results(results) + local i = 0 + return function () + return op.guard(function () + i = i + 1 + local item = results[i] + if item == nil then + return op.always(nil, 'eof') + end + if item.err ~= nil then + return op.always(nil, item.err) + end + return op.always(item.frame, nil) + end) + end +end + +local function raw_line_transport(lines) + local i = 0 + return { + read_line_op = function () + return op.guard(function () + i = i + 1 + local line = lines[i] + if line == nil then + return op.always(nil, 'eof') + end + return op.always(line, nil) + end) + end, + write_line_op = function () + return op.always(true, nil) + end, + terminate = function () return true, nil end, + } +end + local function closed_rx(reason) local tx, rx = mailbox.new(1, { full = 'reject_newest' }) tx:close(reason or 'closed') @@ -67,6 +107,22 @@ local function send_frame(tx, frame, label) }, label or 'send_frame') end +local function capture_prints(fn) + local old_print = _G.print + local lines = {} + _G.print = function (...) + local parts = {} + for i = 1, select('#', ...) do + parts[#parts + 1] = tostring(select(i, ...)) + end + lines[#lines + 1] = table.concat(parts, '\t') + end + local ok, err = pcall(fn, lines) + _G.print = old_print + if not ok then error(err, 0) end + return lines +end + ------------------------------------------------------------------------------- -- Reader forwards frames and returns a result table ------------------------------------------------------------------------------- @@ -156,6 +212,486 @@ function tests.test_reader_read_error_fails_scope() end) end +function tests.test_reader_forwards_wire_errors_below_recovery_limit() + fibers.run(function (scope) + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + + local result = io_mod.run_reader(scope, { + read_frame_op = read_results { + { err = 'decode_failed: first' }, + }, + downstream_tx = tx, + bad_frame_limit = 2, + bad_frame_window_s = 10, + }) + + assert_eq(result.role, 'reader') + assert_eq(result.wire_errors, 1) + + local ev = fibers.perform(rx:recv_op()) + assert_eq(ev.kind, 'wire_error') + assert_eq(ev.err, 'decode_failed: first') + assert_eq(ev.wire_errors, 1) + assert_eq(ev.bad_frame_count, 1) + end) +end + +function tests.test_reader_wire_error_includes_bad_line_diagnostics_from_hal_transport() + fibers.run(function (scope) + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + local line = '[mem] boot\r\n{"type":"hello"' + local wrapped = assert(hal_transport.wrap_transport(raw_line_transport { line })) + + local result = io_mod.run_reader(scope, { + read_frame_op = function () return wrapped:read_frame_op() end, + downstream_tx = tx, + bad_frame_limit = 3, + bad_frame_window_s = 10, + }) + + assert_eq(result.role, 'reader') + assert_eq(result.wire_errors, 1) + + local ev = fibers.perform(rx:recv_op()) + assert_eq(ev.kind, 'wire_error') + assert_match(ev.err, '^decode_failed') + assert_match(ev.last_decode_error, '^decode_failed') + assert_eq(ev.last_bad_line_len, #line) + assert_eq(ev.last_bad_line_xxhash32, xxhash32.digest_hex(line)) + assert_eq(ev.last_bad_line_head, '[mem] boot\\r\\n{"type":"hello"') + assert_eq(ev.last_bad_line_tail, '[mem] boot\\r\\n{"type":"hello"') + end) +end + +function tests.test_reader_ignores_blank_lines_from_hal_transport() + fibers.run(function (scope) + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + local hello = assert(protocol.encode_line(assert(protocol.hello_ack('peer-sid', 'mcu')))) + local wrapped = assert(hal_transport.wrap_transport(raw_line_transport { '', ' \t\r', hello })) + + local result = io_mod.run_reader(scope, { + read_frame_op = function () return wrapped:read_frame_op() end, + downstream_tx = tx, + bad_frame_limit = 2, + bad_frame_window_s = 10, + }) + + assert_eq(result.role, 'reader') + assert_eq(result.frames_read, 1) + assert_eq(result.wire_errors, 0) + + local ev = fibers.perform(rx:recv_op()) + assert_eq(ev.kind, 'frame_received') + assert_eq(ev.frame.type, 'hello_ack') + assert_eq(ev.frame.sid, 'peer-sid') + end) +end + +function tests.test_reader_recovers_short_non_printable_prefix_before_hello_ack() + fibers.run(function (scope) + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + local hello = assert(protocol.encode_line(assert(protocol.hello_ack('peer-sid', 'mcu')))) + local line = string.char(0xfe) .. hello + local wrapped = assert(hal_transport.wrap_transport(raw_line_transport { + line, + })) + + local result = io_mod.run_reader(scope, { + read_frame_op = function () return wrapped:read_frame_op() end, + downstream_tx = tx, + bad_frame_limit = 2, + bad_frame_window_s = 10, + }) + + assert_eq(result.role, 'reader') + assert_eq(result.frames_read, 1) + assert_eq(result.wire_errors, 0) + + local ev = fibers.perform(rx:recv_op()) + assert_eq(ev.kind, 'frame_received') + assert_eq(ev.frame.type, 'hello_ack') + assert_eq(ev.frame.sid, 'peer-sid') + assert_eq(ev.line_resync, true) + assert_eq(ev.last_line_resync_prefix_len, 1) + assert_eq(ev.last_line_resync_line_len, #line) + assert_eq(ev.last_line_resync_xxhash32, xxhash32.digest_hex(line)) + assert_eq(ev.last_line_resync_type, 'hello_ack') + assert_eq(ev.last_line_resync_peer_sid, 'peer-sid') + end) +end + +function tests.test_reader_recovers_long_non_printable_prefix_before_hello_ack() + fibers.run(function (scope) + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + local hello = assert(protocol.encode_line(assert(protocol.hello_ack('peer-sid-long', 'mcu')))) + local prefix = string.rep(string.char(0), 1592) + local line = prefix .. hello + local wrapped = assert(hal_transport.wrap_transport(raw_line_transport { line })) + + local result = io_mod.run_reader(scope, { + read_frame_op = function () return wrapped:read_frame_op() end, + downstream_tx = tx, + bad_frame_limit = 2, + bad_frame_window_s = 10, + }) + + assert_eq(result.role, 'reader') + assert_eq(result.frames_read, 1) + assert_eq(result.wire_errors, 0) + + local ev = fibers.perform(rx:recv_op()) + assert_eq(ev.kind, 'frame_received') + assert_eq(ev.frame.type, 'hello_ack') + assert_eq(ev.frame.sid, 'peer-sid-long') + assert_eq(ev.line_resync, true) + assert_eq(ev.last_line_resync_prefix_len, #prefix) + assert_eq(ev.last_line_resync_line_len, #line) + assert_eq(ev.last_line_resync_xxhash32, xxhash32.digest_hex(line)) + assert_eq(ev.last_line_resync_type, 'hello_ack') + assert_eq(ev.last_line_resync_peer_sid, 'peer-sid-long') + end) +end + +function tests.test_reader_recovers_non_printable_prefix_before_hello() + fibers.run(function (scope) + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + local hello = assert(protocol.encode_line(assert(protocol.hello('peer-sid-hello', 'mcu')))) + local prefix = string.char(0xfe, 0x00) + local line = prefix .. hello + local wrapped = assert(hal_transport.wrap_transport(raw_line_transport { line })) + + local result = io_mod.run_reader(scope, { + read_frame_op = function () return wrapped:read_frame_op() end, + downstream_tx = tx, + bad_frame_limit = 2, + bad_frame_window_s = 10, + }) + + assert_eq(result.role, 'reader') + assert_eq(result.frames_read, 1) + assert_eq(result.wire_errors, 0) + + local ev = fibers.perform(rx:recv_op()) + assert_eq(ev.kind, 'frame_received') + assert_eq(ev.frame.type, 'hello') + assert_eq(ev.frame.sid, 'peer-sid-hello') + assert_eq(ev.line_resync, true) + assert_eq(ev.last_line_resync_prefix_len, #prefix) + assert_eq(ev.last_line_resync_type, 'hello') + assert_eq(ev.last_line_resync_peer_sid, 'peer-sid-hello') + end) +end + +function tests.test_reader_resync_rejects_printable_prefix_before_json() + fibers.run(function (scope) + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + local hello = assert(protocol.encode_line(assert(protocol.hello_ack('peer-sid', 'mcu')))) + local line = '[mem] boot ' .. hello + local wrapped = assert(hal_transport.wrap_transport(raw_line_transport { line })) + + local result = io_mod.run_reader(scope, { + read_frame_op = function () return wrapped:read_frame_op() end, + downstream_tx = tx, + bad_frame_limit = 2, + bad_frame_window_s = 10, + }) + + assert_eq(result.role, 'reader') + assert_eq(result.frames_read, 0) + assert_eq(result.wire_errors, 1) + + local ev = fibers.perform(rx:recv_op()) + assert_eq(ev.kind, 'wire_error') + assert_match(ev.err, '^decode_failed') + assert_nil(ev.line_resync) + assert_eq(ev.last_bad_line_len, #line) + end) +end + +function tests.test_reader_resync_rejects_prefix_beyond_scan_window() + fibers.run(function (scope) + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + local hello = assert(protocol.encode_line(assert(protocol.hello_ack('peer-sid', 'mcu')))) + local prefix = string.rep(string.char(0), 4096) + local line = prefix .. hello + local wrapped = assert(hal_transport.wrap_transport(raw_line_transport { line })) + + local result = io_mod.run_reader(scope, { + read_frame_op = function () return wrapped:read_frame_op() end, + downstream_tx = tx, + bad_frame_limit = 2, + bad_frame_window_s = 10, + }) + + assert_eq(result.role, 'reader') + assert_eq(result.frames_read, 0) + assert_eq(result.wire_errors, 1) + + local ev = fibers.perform(rx:recv_op()) + assert_eq(ev.kind, 'wire_error') + assert_match(ev.err, '^decode_failed') + assert_nil(ev.line_resync) + assert_eq(ev.last_bad_line_len, #line) + end) +end + +function tests.test_reader_resync_rejects_non_handshake_frame() + fibers.run(function (scope) + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + local frame = { + type = 'xfer_ready', + xfer_id = 'xfer-resync', + } + local line = string.char(1, 2, 3) .. assert(protocol.encode_line(frame)) + local wrapped = assert(hal_transport.wrap_transport(raw_line_transport { line })) + + local result = io_mod.run_reader(scope, { + read_frame_op = function () return wrapped:read_frame_op() end, + downstream_tx = tx, + bad_frame_limit = 2, + bad_frame_window_s = 10, + }) + + assert_eq(result.role, 'reader') + assert_eq(result.frames_read, 0) + assert_eq(result.wire_errors, 1) + + local ev = fibers.perform(rx:recv_op()) + assert_eq(ev.kind, 'wire_error') + assert_match(ev.err, '^decode_failed') + assert_nil(ev.line_resync) + assert_eq(ev.last_bad_line_len, #line) + assert_eq(ev.last_bad_line_xxhash32, xxhash32.digest_hex(line)) + end) +end + +function tests.test_reader_recovery_includes_last_bad_line_diagnostics() + fibers.run(function (scope) + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + local first = '[value] first' + local second = '{"type":"hello"}{"type":"hello_ack"}' + local wrapped = assert(hal_transport.wrap_transport(raw_line_transport { first, second })) + + local result = io_mod.run_reader(scope, { + read_frame_op = function () return wrapped:read_frame_op() end, + downstream_tx = tx, + bad_frame_limit = 2, + bad_frame_window_s = 10, + bad_frame_quiet_s = 0.01, + }) + + assert_eq(result.role, 'reader') + assert_eq(result.wire_errors, 2) + + local ev1 = fibers.perform(rx:recv_op()) + local ev2 = fibers.perform(rx:recv_op()) + assert_eq(ev1.kind, 'wire_error') + assert_eq(ev2.kind, 'wire_recovery') + assert_eq(ev2.reason, 'bad_frame_limit') + assert_match(ev2.last_decode_error, '^decode_failed') + assert_eq(ev2.last_bad_line_len, #second) + assert_eq(ev2.last_bad_line_xxhash32, xxhash32.digest_hex(second)) + assert_eq(ev2.last_bad_line_head, second) + assert_eq(ev2.last_bad_line_tail, second) + end) +end + +function tests.test_reader_coalesces_deadline_drains_until_quiet() + local logs = capture_prints(function () + fibers.run(function (scope) + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + local gate = { drain_active = false, hello_quiet_until = 0 } + local drain_calls = 0 + local quiet_seen + local drain_results = { + { bytes = 9, reads = 1, reason = 'deadline' }, + { bytes = 8, reads = 1, reason = 'deadline' }, + { bytes = 3, reads = 1, reason = 'quiet' }, + } + + local result = io_mod.run_reader(scope, { + read_frame_op = read_results { + { err = 'decode_failed: first' }, + { err = 'decode_failed: second' }, + }, + downstream_tx = tx, + recovery_gate = gate, + trace_io = true, + bad_frame_limit = 2, + bad_frame_window_s = 10, + bad_frame_quiet_s = 0.20, + drain_input_op = function () + return op.guard(function () + drain_calls = drain_calls + 1 + assert_true(gate.drain_active, 'reader must keep drain_active throughout recovery') + assert_true(gate.hello_quiet_until > fibers.now(), 'reader must set hello quiet before draining') + quiet_seen = gate.hello_quiet_until + return op.always(drain_results[drain_calls], nil) + end) + end, + }) + + assert_eq(result.role, 'reader') + assert_eq(result.wire_errors, 2) + assert_eq(drain_calls, 3) + assert_eq(gate.drain_active, false) + assert_eq(gate.hello_quiet_until, quiet_seen) + + local ev1 = fibers.perform(rx:recv_op()) + local ev2 = fibers.perform(rx:recv_op()) + assert_eq(ev1.kind, 'wire_error') + assert_eq(ev2.kind, 'wire_recovery') + assert_eq(ev2.reason, 'bad_frame_limit') + assert_eq(ev2.wire_errors, 2) + assert_eq(ev2.bad_frame_count, 2) + assert_eq(ev2.drained_bytes, 20) + assert_eq(ev2.drain_attempts, 3) + assert_eq(ev2.drain_reason, 'quiet') + assert_eq(ev2.quiet_until, quiet_seen) + end) + end) + + assert_eq(#logs, 1) + assert_match(logs[1], 'reader_wire_recovery') + assert_match(logs[1], 'drain_attempts') + assert_match(logs[1], '3') +end + +function tests.test_reader_recovery_window_expires_after_repeated_deadlines() + fibers.run(function (scope) + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + local gate = { drain_active = false, hello_quiet_until = 0 } + local drain_calls = 0 + + local result = io_mod.run_reader(scope, { + read_frame_op = read_results { + { err = 'decode_failed: first' }, + { err = 'decode_failed: second' }, + }, + downstream_tx = tx, + recovery_gate = gate, + bad_frame_limit = 2, + bad_frame_window_s = 10, + bad_frame_quiet_s = 0.035, + drain_input_op = function () + return op.guard(function () + drain_calls = drain_calls + 1 + assert_true(gate.drain_active, 'reader must keep drain_active throughout recovery') + return sleep.sleep_op(0.020):wrap(function () + return { bytes = 1, reads = 1, reason = 'deadline' }, nil + end) + end) + end, + }) + + assert_eq(result.role, 'reader') + assert_eq(result.wire_errors, 2) + assert_eq(drain_calls, 2) + assert_eq(gate.drain_active, false) + + local ev1 = fibers.perform(rx:recv_op()) + local ev2 = fibers.perform(rx:recv_op()) + assert_eq(ev1.kind, 'wire_error') + assert_eq(ev2.kind, 'wire_recovery') + assert_eq(ev2.drain_attempts, 2) + assert_eq(ev2.drained_bytes, 2) + assert_eq(ev2.drain_reason, 'recovery_window_expired') + end) +end + +function tests.test_reader_recovery_cancellation_clears_gate_and_propagates() + fibers.run(function (root_scope) + local tx, rx = mailbox.new(8, { full = 'reject_newest' }) + local entered_tx, entered_rx = mailbox.new(1, { full = 'reject_newest' }) + local gate = { drain_active = false, hello_quiet_until = 0 } + local child = assert(root_scope:child()) + + local ok, err = child:spawn(function (scope) + io_mod.run_reader(scope, { + read_frame_op = read_results { + { err = 'decode_failed: first' }, + }, + downstream_tx = tx, + recovery_gate = gate, + bad_frame_limit = 1, + bad_frame_window_s = 10, + bad_frame_quiet_s = 1.0, + drain_input_op = function () + return op.guard(function () + queue.try_admit_required(entered_tx, true, 'drain_entered') + assert_true(gate.drain_active, 'reader must set drain_active before draining') + return op.never() + end) + end, + }) + end) + assert_true(ok, err) + + assert_true(fibers.perform(entered_rx:recv_op())) + assert_eq(gate.drain_active, true) + child:cancel('cancelled_for_test') + + local st, _, primary = fibers.perform(child:join_op()) + assert_eq(st, 'cancelled') + assert_eq(primary, 'cancelled_for_test') + assert_eq(gate.drain_active, false) + + local ev, recv_err = queue.try_recv_now(rx) + assert_nil(ev) + assert_eq(recv_err, 'not_ready') + end) +end + +function tests.test_reader_recovery_downstream_close_and_reject_clear_gate() + fibers.run(function (scope) + local tx = mailbox.new(1, { full = 'reject_newest' }) + local gate = { drain_active = false, hello_quiet_until = 0 } + tx:close('downstream_done') + + local result = io_mod.run_reader(scope, { + read_frame_op = read_results { + { err = 'decode_failed: first' }, + }, + downstream_tx = tx, + recovery_gate = gate, + bad_frame_limit = 1, + bad_frame_window_s = 10, + bad_frame_quiet_s = 0.05, + drain_input_op = function () + return op.always({ bytes = 0, reads = 0, reason = 'quiet' }, nil) + end, + }) + + assert_eq(result.reason, 'downstream_done') + assert_eq(gate.drain_active, false) + end) + + fibers.run(function () + local tx = mailbox.new(0, { full = 'reject_newest' }) + local gate = { drain_active = false, hello_quiet_until = 0 } + + local st, _, primary = fibers.run_scope(function (scope) + io_mod.run_reader(scope, { + read_frame_op = read_results { + { err = 'decode_failed: first' }, + }, + downstream_tx = tx, + recovery_gate = gate, + bad_frame_limit = 1, + bad_frame_window_s = 10, + bad_frame_quiet_s = 0.05, + drain_input_op = function () + return op.always({ bytes = 0, reads = 0, reason = 'quiet' }, nil) + end, + }) + end) + + assert_eq(st, 'failed') + assert_match(primary, 'reader downstream rejected wire event') + assert_eq(gate.drain_active, false) + end) +end + ------------------------------------------------------------------------------- -- Lane writer writes queued frames and flushes on close ------------------------------------------------------------------------------- @@ -237,6 +773,131 @@ function tests.test_lane_writer_can_flush_each_frame() end) end +function tests.test_lane_writer_pauses_writes_while_recovery_drain_is_active() + fibers.run(function (scope) + local rpc_tx, rpc_rx = mailbox.new(8, { full = 'reject_newest' }) + local done_tx, done_rx = mailbox.new(1, { full = 'reject_newest' }) + local gate = { drain_active = true, hello_quiet_until = 0 } + local written = {} + + send_frame(rpc_tx, 'rpc-frame', 'rpc_frame') + rpc_tx:close('rpc_done') + + assert_true(scope:spawn(function () + local result = io_mod.run_lane_writer(scope, { + control_rx = closed_rx('control_done'), + rpc_rx = rpc_rx, + bulk_rx = closed_rx('bulk_done'), + recovery_gate = gate, + write_frame_op = function (frame) + return op.guard(function () + written[#written + 1] = frame + return op.always(true, nil) + end) + end, + }) + queue.try_admit_required(done_tx, result, 'writer_done') + end)) + + fibers.perform(sleep.sleep_op(0.03)) + assert_eq(#written, 0) + gate.drain_active = false + + local result = fibers.perform(done_rx:recv_op()) + assert_eq(result.frames_written, 1) + assert_eq(written[1], 'rpc-frame') + end) +end + +function tests.test_lane_writer_drops_queued_frames_from_stale_session() + fibers.run(function (scope) + local rpc_tx, rpc_rx = mailbox.new(8, { full = 'reject_newest' }) + local written = {} + local session_gate = { + current_session = { + link_id = 'link-a', + link_generation = 1, + session_generation = 2, + peer_sid = 'new-peer', + }, + drop_reason = 'peer_sid_changed', + } + + queue.try_admit_required(rpc_tx, { + kind = 'send_frame', + frame = 'stale-frame', + session = { + link_id = 'link-a', + link_generation = 1, + session_generation = 1, + peer_sid = 'old-peer', + }, + }, 'stale_frame') + queue.try_admit_required(rpc_tx, { + kind = 'send_frame', + frame = 'current-frame', + session = { + link_id = 'link-a', + link_generation = 1, + session_generation = 2, + peer_sid = 'new-peer', + }, + }, 'current_frame') + rpc_tx:close('rpc_done') + + local result = io_mod.run_lane_writer(scope, { + control_rx = closed_rx('control_done'), + rpc_rx = rpc_rx, + bulk_rx = closed_rx('bulk_done'), + session_gate = session_gate, + write_frame_op = function (frame) + return op.guard(function () + written[#written + 1] = frame + return op.always(true, nil) + end) + end, + }) + + assert_eq(result.frames_written, 1) + assert_eq(written[1], 'current-frame') + end) +end + +function tests.test_lane_writer_delays_hello_until_recovery_quiet_expires() + fibers.run(function (scope) + local control_tx, control_rx = mailbox.new(8, { full = 'reject_newest' }) + local done_tx, done_rx = mailbox.new(1, { full = 'reject_newest' }) + local gate = { drain_active = false, hello_quiet_until = fibers.now() + 0.06 } + local written = {} + + send_frame(control_tx, { type = 'hello' }, 'hello_frame') + control_tx:close('control_done') + + assert_true(scope:spawn(function () + local result = io_mod.run_lane_writer(scope, { + control_rx = control_rx, + rpc_rx = closed_rx('rpc_done'), + bulk_rx = closed_rx('bulk_done'), + recovery_gate = gate, + write_frame_op = function (frame) + return op.guard(function () + written[#written + 1] = frame + return op.always(true, nil) + end) + end, + }) + queue.try_admit_required(done_tx, result, 'writer_done') + end)) + + fibers.perform(sleep.sleep_op(0.02)) + assert_eq(#written, 0) + + local result = fibers.perform(done_rx:recv_op()) + assert_eq(result.frames_written, 1) + assert_eq(written[1].type, 'hello') + end) +end + ------------------------------------------------------------------------------- -- Lane writer write errors fail the owning scope ------------------------------------------------------------------------------- diff --git a/tests/unit/fabric/test_link.lua b/tests/unit/fabric/test_link.lua index 55aff4a0c..1f2798f7d 100644 --- a/tests/unit/fabric/test_link.lua +++ b/tests/unit/fabric/test_link.lua @@ -100,7 +100,7 @@ function tests.test_session_control_establishes_from_hello_and_sends_ack() queue.try_admit_required(control_tx, { kind = 'frame_received', - frame = assert(protocol.hello('peer-sid', 'peer-node')), + frame = assert(protocol.hello('peer-sid', 'peer-a')), }, 'remote_hello') local ack = fibers.perform(out_rx:recv_op()) @@ -113,7 +113,7 @@ function tests.test_session_control_establishes_from_hello_and_sends_ack() assert_eq(result.role, 'session') assert_eq(result.snapshot.established, true) assert_eq(result.snapshot.peer_sid, 'peer-sid') - assert_eq(result.snapshot.peer_node, 'peer-node') + assert_eq(result.snapshot.peer_node, 'peer-a') assert_eq(result.snapshot.link_generation, 1) end) end @@ -155,7 +155,7 @@ function tests.test_session_liveness_timeout_resets_to_hello() assert_eq(fibers.perform(out_rx:recv_op()).frame.type, 'hello') queue.try_admit_required(control_tx, { kind = 'frame_received', - frame = assert(protocol.hello('peer-sid', 'peer-node')), + frame = assert(protocol.hello('peer-sid', 'peer-a')), }, 'remote_hello') assert_eq(fibers.perform(out_rx:recv_op()).frame.type, 'hello_ack') @@ -216,7 +216,7 @@ function tests.test_session_ping_is_emitted_before_liveness_deadline() assert_eq(fibers.perform(out_rx:recv_op()).frame.type, 'hello') queue.try_admit_required(control_tx, { kind = 'frame_received', - frame = assert(protocol.hello('peer-sid', 'peer-node')), + frame = assert(protocol.hello('peer-sid', 'peer-a')), }, 'remote_hello') assert_eq(fibers.perform(out_rx:recv_op()).frame.type, 'hello_ack') @@ -265,7 +265,7 @@ function tests.test_session_control_processes_ready_control_before_timer_work() assert_eq(fibers.perform(out_rx:recv_op()).frame.type, 'hello') queue.try_admit_required(control_tx, { kind = 'frame_received', - frame = assert(protocol.hello('peer-sid', 'peer-node')), + frame = assert(protocol.hello('peer-sid', 'peer-a')), }, 'remote_hello') assert_eq(fibers.perform(out_rx:recv_op()).frame.type, 'hello_ack') diff --git a/tests/unit/fabric/test_protocol.lua b/tests/unit/fabric/test_protocol.lua index 13cd3a777..64c98a13a 100644 --- a/tests/unit/fabric/test_protocol.lua +++ b/tests/unit/fabric/test_protocol.lua @@ -111,6 +111,10 @@ function tests.test_ping_and_pong_require_sid() ok, err = protocol.validate({ type = 'pong', sid = 'sid-1' }) assert_not_nil(ok) assert_nil(err) + + ok, err = protocol.validate({ type = 'ping', sid = 'sid-1', ts = 1 }) + assert_nil(ok) + assert_eq(err, 'unknown_frame_field: ts') end function tests.test_pub_requires_dense_scalar_topic_and_boolean_retain() diff --git a/tests/unit/fabric/test_session.lua b/tests/unit/fabric/test_session.lua index 9d9f85cbc..a314e95b2 100644 --- a/tests/unit/fabric/test_session.lua +++ b/tests/unit/fabric/test_session.lua @@ -45,6 +45,13 @@ local function recv_with_timeout(rx, label, timeout) return item end +local function expect_no_item(rx, label, timeout) + timeout = timeout or 0.05 + fibers.perform(sleep.sleep_op(timeout)) + local item = queue.try_recv_now(rx) + assert_nil(item, label or 'expected no queued item') +end + local function start_session(scope, opts) opts = opts or {} local frame_tx, frame_rx = mailbox.new(16, { full = 'reject_newest' }) @@ -57,7 +64,7 @@ local function start_session(scope, opts) local ok, err = scope:spawn(function () local result = session.run(scope, { link_id = opts.link_id or 'link-a', - peer_id = opts.peer_id or 'peer-a', + peer_id = opts.peer_id or 'mcu', local_node = opts.local_node or 'cm5', local_sid = opts.local_sid or 'cm5-sid', identity_claim = opts.identity_claim, @@ -94,12 +101,57 @@ local function admit_frame(tx, frame) assert_true(ok, err) end -local function admit_wire_error(tx, err) - local ok, admit_err = queue.try_admit_required(tx, { +local function admit_frame_event(tx, frame, extra) + local ev = { + kind = 'frame_received', + frame = frame, + } + for k, v in pairs(extra or {}) do + ev[k] = v + end + local ok, err = queue.try_admit_required(tx, ev, 'test_frame_admit_failed') + assert_true(ok, err) +end + +local BAD_LINE_DIAG = { + last_decode_error = 'decode_failed: expected value', + last_bad_line_len = 12, + last_bad_line_xxhash32 = '12345678', + last_bad_line_head = '[mem] boot', + last_bad_line_tail = '[mem] boot', +} + +local function add_bad_line_diag(item, opts) + if type(opts) ~= 'table' then return item end + for k, v in pairs(opts) do + if k:match('^last_') then item[k] = v end + end + return item +end + +local function admit_wire_error(tx, err, opts) + local ok, admit_err = queue.try_admit_required(tx, add_bad_line_diag({ kind = 'wire_error', err = err or 'decode_failed: bad json', at = fibers.now(), - }, 'test_wire_error_admit_failed') + wire_errors = 1, + bad_frame_count = 1, + }, opts), 'test_wire_error_admit_failed') + assert_true(ok, admit_err) +end + +local function admit_wire_recovery(tx, opts) + opts = opts or {} + local ok, admit_err = queue.try_admit_required(tx, add_bad_line_diag({ + kind = 'wire_recovery', + reason = opts.reason or 'bad_frame_limit', + at = fibers.now(), + wire_errors = opts.wire_errors or 2, + bad_frame_count = opts.bad_frame_count or 2, + drained_bytes = opts.drained_bytes or 7, + drain_err = opts.drain_err, + quiet_until = opts.quiet_until or (fibers.now() + 0.05), + }, opts), 'test_wire_recovery_admit_failed') assert_true(ok, admit_err) end @@ -213,6 +265,33 @@ function tests.test_new_peer_sid_drops_old_generation_and_starts_next_generation end) end +function tests.test_session_rejects_wrong_peer_handshake_before_expected_peer() + fibers.run(function (scope) + local h = start_session(scope, { peer_id = 'mcu' }) + recv_with_timeout(h.control_rx, 'initial hello') + + admit_frame(h.frame_tx, assert(protocol.hello_ack('wrong-sid', 'bigbox-cm5'))) + expect_no_item(h.rpc_rx, 'wrong peer ack should not emit rpc peer session') + expect_no_item(h.transfer_rx, 'wrong peer ack should not emit transfer peer session') + + admit_frame(h.frame_tx, assert(protocol.hello('mcu-sid', 'mcu'))) + local rpc_ev = recv_with_timeout(h.rpc_rx, 'rpc peer session after real hello') + local xfer_ev = recv_with_timeout(h.transfer_rx, 'transfer peer session after real hello') + assert_eq(rpc_ev.kind, 'peer_session') + assert_eq(xfer_ev.kind, 'peer_session') + assert_eq(rpc_ev.session.peer_node, 'mcu') + assert_eq(rpc_ev.session.peer_sid, 'mcu-sid') + assert_eq(xfer_ev.session.peer_node, 'mcu') + assert_eq(xfer_ev.session.peer_sid, 'mcu-sid') + + local ack = recv_with_timeout(h.control_rx, 'hello ack for real peer') + assert_eq(ack.frame.type, 'hello_ack') + + h.frame_tx:close('done') + recv_with_timeout(h.done_rx, 'session done') + end) +end + function tests.test_wire_errors_below_limit_are_counted_without_dropping_session() @@ -223,7 +302,7 @@ function tests.test_wire_errors_below_limit_are_counted_without_dropping_session recv_with_timeout(h.rpc_rx, 'peer session') recv_with_timeout(h.transfer_rx, 'transfer peer session') - admit_wire_error(h.frame_tx, 'decode_failed: truncated line') + admit_wire_error(h.frame_tx, 'decode_failed: truncated line', BAD_LINE_DIAG) local stale = queue.try_recv_now(h.rpc_rx) assert_nil(stale, 'single bad frame should not drop session') @@ -236,31 +315,79 @@ function tests.test_wire_errors_below_limit_are_counted_without_dropping_session local done = recv_with_timeout(h.done_rx, 'session done') assert_eq(done.snapshot.wire_errors, 1) assert_eq(done.snapshot.last_wire_error, 'decode_failed: truncated line') + assert_eq(done.snapshot.last_decode_error, BAD_LINE_DIAG.last_decode_error) + assert_eq(done.snapshot.last_bad_line_len, BAD_LINE_DIAG.last_bad_line_len) + assert_eq(done.snapshot.last_bad_line_xxhash32, BAD_LINE_DIAG.last_bad_line_xxhash32) + assert_eq(done.snapshot.last_bad_line_head, BAD_LINE_DIAG.last_bad_line_head) + assert_eq(done.snapshot.last_bad_line_tail, BAD_LINE_DIAG.last_bad_line_tail) + end) +end + +function tests.test_resynced_hello_ack_updates_session_status_without_wire_error() + fibers.run(function (scope) + local h = start_session(scope) + recv_with_timeout(h.control_rx, 'initial hello') + admit_frame_event(h.frame_tx, assert(protocol.hello_ack('mcu-sid-resync', 'mcu')), { + line_resync = true, + last_line_resync_prefix_len = 1592, + last_line_resync_line_len = 1660, + last_line_resync_xxhash32 = 'feed1234', + last_line_resync_type = 'hello_ack', + last_line_resync_peer_sid = 'mcu-sid-resync', + }) + local peer = recv_with_timeout(h.rpc_rx, 'peer session') + assert_eq(peer.kind, 'peer_session') + assert_eq(peer.session.peer_sid, 'mcu-sid-resync') + recv_with_timeout(h.transfer_rx, 'transfer peer session') + + h.frame_tx:close('done') + local done = recv_with_timeout(h.done_rx, 'session done') + assert_eq(done.snapshot.wire_errors, 0) + assert_eq(done.snapshot.bad_frame_count, 0) + assert_eq(done.snapshot.line_resyncs, 1) + assert_eq(done.snapshot.line_resync, true) + assert_eq(done.snapshot.last_line_resync_prefix_len, 1592) + assert_eq(done.snapshot.last_line_resync_line_len, 1660) + assert_eq(done.snapshot.last_line_resync_xxhash32, 'feed1234') + assert_eq(done.snapshot.last_line_resync_type, 'hello_ack') + assert_eq(done.snapshot.last_line_resync_peer_sid, 'mcu-sid-resync') end) end -function tests.test_bad_frame_limit_drops_current_peer_session() +function tests.test_wire_recovery_drops_current_peer_session_and_delays_hello() fibers.run(function (scope) - local h = start_session(scope, { bad_frame_limit = 2, bad_frame_window_s = 10 }) + local h = start_session(scope) recv_with_timeout(h.control_rx, 'initial hello') admit_frame(h.frame_tx, assert(protocol.hello_ack('mcu-sid', 'mcu'))) recv_with_timeout(h.rpc_rx, 'peer session') recv_with_timeout(h.transfer_rx, 'transfer peer session') - admit_wire_error(h.frame_tx, 'decode_failed: first') - assert_nil(queue.try_recv_now(h.rpc_rx), 'first bad frame should be tolerated') - admit_wire_error(h.frame_tx, 'decode_failed: second') + local quiet_until = fibers.now() + 0.08 + admit_wire_recovery(h.frame_tx, { + quiet_until = quiet_until, + last_decode_error = 'decode_failed: joined objects', + last_bad_line_len = 34, + last_bad_line_xxhash32 = '87654321', + last_bad_line_head = '{"type":"hello"}{"type":"hello_ack"}', + last_bad_line_tail = '{"type":"hello"}{"type":"hello_ack"}', + }) local drop = recv_with_timeout(h.rpc_rx, 'peer session drop') assert_eq(drop.kind, 'peer_session_dropped') assert_eq(drop.reason, 'bad_frame_limit') assert_eq(drop.session.peer_sid, 'mcu-sid') + expect_no_item(h.control_rx, 'hello should be delayed during recovery quiet', 0.02) admit_frame(h.frame_tx, assert(protocol.pub({ 'state', 'self' }, { ok = true }, true))) assert_nil(queue.try_recv_now(h.rpc_rx), 'rpc frame should be dropped after bad-frame session reset') h.frame_tx:close('done') - recv_with_timeout(h.done_rx, 'session done') + local done = recv_with_timeout(h.done_rx, 'session done') + assert_eq(done.snapshot.last_wire_error, 'bad_frame_limit') + assert_eq(done.snapshot.last_decode_error, 'decode_failed: joined objects') + assert_eq(done.snapshot.last_bad_line_len, 34) + assert_eq(done.snapshot.last_bad_line_xxhash32, '87654321') + assert_eq(done.snapshot.last_bad_line_head, '{"type":"hello"}{"type":"hello_ack"}') end) end diff --git a/tests/unit/fabric/test_transfer.lua b/tests/unit/fabric/test_transfer.lua index 18dbc483b..bcb93a4d3 100644 --- a/tests/unit/fabric/test_transfer.lua +++ b/tests/unit/fabric/test_transfer.lua @@ -19,7 +19,9 @@ local function fail(msg) error(msg or 'assertion failed', 2) end local function assert_true(v, msg) if v ~= true then fail(msg or ('expected true, got ' .. tostring(v))) end end local function assert_nil(v, msg) if v ~= nil then fail(msg or ('expected nil, got ' .. tostring(v))) end end local function assert_not_nil(v, msg) if v == nil then fail(msg or 'expected non-nil value') end end -local function assert_eq(a, b, msg) if a ~= b then fail(msg or ('expected ' .. tostring(b) .. ', got ' .. tostring(a))) end end +local function assert_eq(a, b, msg) + if a ~= b then fail(msg or ('expected ' .. tostring(b) .. ', got ' .. tostring(a))) end +end local function recv_with_timeout(rx, label, timeout) timeout = timeout or 0.25 @@ -139,6 +141,97 @@ function tests.test_reducer_requires_session_context_for_claims() assert_eq(transfer.snapshot(state).active.session.peer_sid, 'sid-1') end +function tests.test_reducer_tracks_active_send_progress() + local state = transfer.new_state { manager_id = 'm' } + assert_true(transfer.claim_slot(state, { + request_id = 'r-progress', + request_generation = 1, + session = ctx(), + xfer_id = 'xfer-progress', + size = 6, + })) + + local ok, err = transfer.apply_progress(state, { + request_id = 'r-progress', + request_generation = 1, + session = ctx(), + xfer_id = 'xfer-progress', + sent = 3, + size = 6, + status = 'sending', + chunk_size = 3, + pending_offset = 3, + pending_next = 6, + last_transfer_event = 'chunk_tx', + }) + assert_true(ok, err) + local snap = transfer.snapshot(state) + assert_eq(snap.active.sent, 3) + assert_eq(snap.active.size, 6) + assert_eq(snap.active.status, 'sending') + assert_eq(snap.active.chunk_size, 3) + assert_eq(snap.active.pending_offset, 3) + assert_eq(snap.active.pending_next, 6) + assert_eq(snap.active.last_transfer_event, 'chunk_tx') +end + +function tests.test_reducer_requires_xfer_id_for_progress_completion_and_release() + local state = transfer.new_state { manager_id = 'm' } + assert_true(transfer.claim_slot(state, { + request_id = 'r-progress', + request_generation = 1, + session = ctx(), + xfer_id = 'xfer-current', + size = 6, + })) + + local ok, err = transfer.apply_progress(state, { + request_id = 'r-progress', + request_generation = 1, + session = ctx(), + xfer_id = 'xfer-other', + sent = 3, + }) + assert_eq(ok, false) + assert_eq(err, 'stale_transfer_progress') + assert_eq(transfer.snapshot(state).active.xfer_id, 'xfer-current') + + ok, err = transfer.apply_attempt_done(state, { + kind = 'transfer_attempt_done', + request_id = 'r-progress', + request_generation = 1, + session = ctx(), + xfer_id = 'xfer-other', + status = 'ok', + }) + assert_eq(ok, false) + assert_eq(err, 'stale_attempt_completion') + assert_eq(transfer.snapshot(state).active.xfer_id, 'xfer-current') + + ok, err = transfer.release_slot(state, { + kind = 'transfer_slot_released', + request_id = 'r-progress', + request_generation = 1, + session = ctx(), + xfer_id = 'xfer-other', + reason = 'wrong transfer', + }) + assert_eq(ok, false) + assert_eq(err, 'stale_slot_release') + assert_eq(transfer.snapshot(state).active.xfer_id, 'xfer-current') + + ok, err = transfer.apply_attempt_done(state, { + kind = 'transfer_attempt_done', + request_id = 'r-progress', + request_generation = 1, + session = ctx(), + xfer_id = 'xfer-current', + status = 'ok', + }) + assert_true(ok, err) + assert_eq(transfer.snapshot(state).active, nil) +end + function tests.test_slot_admission_without_session_fails_request() fibers.run(function (scope) local h = start_manager(scope) @@ -263,20 +356,20 @@ function tests.test_manager_receives_inbound_transfer_for_registered_target() local received = {} local committed = false local target = {} - function target:open_sink_op(req) + function target.open_sink_op(_, req) assert_eq(req.target, 'updater/main') assert_eq(req.size, 6) local sink = {} - function sink:append_op(chunk) + function sink.append_op(_, chunk) received[#received + 1] = chunk return fibers.always(true, nil) end - function sink:commit_op(req2) + function sink.commit_op(_, req2) committed = true assert_eq(req2.digest, protocol.digest_hex('abcdef')) return fibers.always({ staged = true }, nil) end - function sink:abort(_) return true, nil end + function sink.abort(_) return true, nil end return fibers.always(sink, nil) end @@ -335,21 +428,143 @@ function tests.test_manager_receives_inbound_transfer_for_registered_target() end) end +function tests.test_manager_reacks_stale_inbound_chunk_without_rewriting() + fibers.run(function (scope) + local received = {} + local target = {} + function target.open_sink_op(_, req) + assert_eq(req.target, 'updater/main') + local sink = {} + function sink.append_op(_, chunk) + received[#received + 1] = chunk + return fibers.always(true, nil) + end + function sink.commit_op(_) + return fibers.always({ staged = true }, nil) + end + function sink.abort(_) return true, nil end + return fibers.always(sink, nil) + end + + local h = start_manager(scope, { + chunk_size = 3, + receive_targets = { ['updater/main'] = target }, + }) + local c = ctx() + h.outbound:bind(c) + assert_true(h.session_tx:send(peer_session_event())) + + assert_true(h.session_tx:send(transfer_frame_event(assert(protocol.xfer_begin( + 'xfer-in-stale', 'updater/main', 6, protocol.DIGEST_ALG, protocol.digest_hex('abcdef'), nil + ))))) + + assert_eq(recv_with_timeout(h.control_rx, 'ready').frame.type, 'xfer_ready') + assert_eq(recv_with_timeout(h.control_rx, 'need 0').frame.next, 0) + + assert_true(h.session_tx:send(transfer_frame_event(assert(protocol.xfer_chunk( + 'xfer-in-stale', 0, 'abc', protocol.chunk_digest('abc') + ))))) + assert_eq(recv_with_timeout(h.control_rx, 'need 3').frame.next, 3) + + assert_true(h.session_tx:send(transfer_frame_event(assert(protocol.xfer_chunk( + 'xfer-in-stale', 0, 'abc', protocol.chunk_digest('abc') + ))))) + assert_eq(recv_with_timeout(h.control_rx, 'stale need 3').frame.next, 3) + assert_eq(#received, 1) + + assert_true(h.session_tx:send(transfer_frame_event(assert(protocol.xfer_chunk( + 'xfer-in-stale', 3, 'def', protocol.chunk_digest('def') + ))))) + assert_eq(recv_with_timeout(h.control_rx, 'need 6').frame.next, 6) + + assert_true(h.session_tx:send(transfer_frame_event(assert(protocol.xfer_commit( + 'xfer-in-stale', 6, protocol.DIGEST_ALG, protocol.digest_hex('abcdef') + ))))) + assert_eq(recv_with_timeout(h.control_rx, 'done').frame.type, 'xfer_done') + assert_eq(table.concat(received), 'abcdef') + + h.admission_tx:close('done') + h.session_tx:close('done') + recv_with_timeout(h.done_rx, 'manager done') + end) +end + +function tests.test_manager_reasks_current_offset_for_future_inbound_chunk_and_completes() + fibers.run(function (scope) + local received = {} + local target = {} + function target.open_sink_op(_, req) + assert_eq(req.target, 'updater/main') + local sink = {} + function sink.append_op(_, chunk) + received[#received + 1] = chunk + return fibers.always(true, nil) + end + function sink.commit_op(_) + return fibers.always({ staged = true }, nil) + end + function sink.abort(_) return true, nil end + return fibers.always(sink, nil) + end + + local h = start_manager(scope, { + chunk_size = 3, + receive_targets = { ['updater/main'] = target }, + }) + local c = ctx() + h.outbound:bind(c) + assert_true(h.session_tx:send(peer_session_event())) + + assert_true(h.session_tx:send(transfer_frame_event(assert(protocol.xfer_begin( + 'xfer-in-future', 'updater/main', 6, protocol.DIGEST_ALG, protocol.digest_hex('abcdef'), nil + ))))) + + assert_eq(recv_with_timeout(h.control_rx, 'ready').frame.type, 'xfer_ready') + assert_eq(recv_with_timeout(h.control_rx, 'need 0').frame.next, 0) + + assert_true(h.session_tx:send(transfer_frame_event(assert(protocol.xfer_chunk( + 'xfer-in-future', 3, 'def', protocol.chunk_digest('def') + ))))) + assert_eq(recv_with_timeout(h.control_rx, 'future need 0').frame.next, 0) + assert_eq(#received, 0) + + assert_true(h.session_tx:send(transfer_frame_event(assert(protocol.xfer_chunk( + 'xfer-in-future', 0, 'abc', protocol.chunk_digest('abc') + ))))) + assert_eq(recv_with_timeout(h.control_rx, 'need 3').frame.next, 3) + + assert_true(h.session_tx:send(transfer_frame_event(assert(protocol.xfer_chunk( + 'xfer-in-future', 3, 'def', protocol.chunk_digest('def') + ))))) + assert_eq(recv_with_timeout(h.control_rx, 'need 6').frame.next, 6) + + assert_true(h.session_tx:send(transfer_frame_event(assert(protocol.xfer_commit( + 'xfer-in-future', 6, protocol.DIGEST_ALG, protocol.digest_hex('abcdef') + ))))) + assert_eq(recv_with_timeout(h.control_rx, 'done').frame.type, 'xfer_done') + assert_eq(table.concat(received), 'abcdef') + + h.admission_tx:close('done') + h.session_tx:close('done') + recv_with_timeout(h.done_rx, 'manager done') + end) +end + function tests.test_manager_reasks_same_offset_after_bad_chunk_digest_and_accepts_retry() fibers.run(function (scope) local received = {} local target = {} - function target:open_sink_op(req) + function target.open_sink_op(_, req) assert_eq(req.target, 'updater/main') local sink = {} - function sink:append_op(chunk) + function sink.append_op(_, chunk) received[#received + 1] = chunk return fibers.always(true, nil) end - function sink:commit_op(_) + function sink.commit_op(_) return fibers.always({ staged = true }, nil) end - function sink:abort(_) return true, nil end + function sink.abort(_) return true, nil end return fibers.always(sink, nil) end diff --git a/tests/unit/fabric/test_transfer_sender.lua b/tests/unit/fabric/test_transfer_sender.lua index b4961e869..40f81cfc6 100644 --- a/tests/unit/fabric/test_transfer_sender.lua +++ b/tests/unit/fabric/test_transfer_sender.lua @@ -51,6 +51,22 @@ local function send_frame(tx, frame) assert_true(tx:send({ frame = frame })) end +local function capture_prints(fn) + local old_print = _G.print + local lines = {} + _G.print = function (...) + local parts = {} + for i = 1, select('#', ...) do + parts[#parts + 1] = tostring(select(i, ...)) + end + lines[#lines + 1] = table.concat(parts, '\t') + end + local ok, err = pcall(fn, lines) + _G.print = old_print + if not ok then error(err, 0) end + return lines +end + local function make_sender_caps(control_tx, bulk_tx, frame_rx, opts) opts = opts or {} @@ -66,6 +82,7 @@ local function make_sender_caps(control_tx, bulk_tx, frame_rx, opts) frame_rx = frame_rx, chunk_size = opts.chunk_size or 3, timeout_s = opts.timeout_s or 0.05, + trace_io = opts.trace_io == true, send_control_frame_now = function (frame, label) return admit(control_tx, frame, label) @@ -166,7 +183,16 @@ end function tests.test_sender_sends_begin_chunks_commit_and_returns_after_done() local seen = {} - local req = make_req { data = 'abcdef', size = 6, xfer_id = 'xfer-1' } + local progress = {} + local req = make_req { + data = 'abcdef', + size = 6, + xfer_id = 'xfer-1', + on_progress = function (p) + progress[#progress + 1] = p + return true + end, + } local out = collect_result(req, function (io) local begin = recv_with_timeout(io.control_rx, 'begin') @@ -218,10 +244,163 @@ function tests.test_sender_sends_begin_chunks_commit_and_returns_after_done() assert_eq(seen[2], 'xfer_chunk') assert_eq(seen[3], 'xfer_chunk') assert_eq(seen[4], 'xfer_commit') + assert_eq(progress[1].status, 'waiting_ready') + assert_eq(progress[1].sent, 0) + assert_eq(progress[1].chunk_size, 3) + assert_eq(progress[1].last_transfer_event, 'begin_tx') + assert_eq(progress[2].status, 'sending') + assert_eq(progress[2].sent, 0) + assert_eq(progress[2].chunk_size, 3) + assert_eq(progress[2].pending_offset, 0) + assert_eq(progress[2].pending_next, 3) + assert_eq(progress[2].last_transfer_event, 'chunk_tx') + assert_eq(progress[3].status, 'sending') + assert_eq(progress[3].sent, 3) + assert_eq(progress[3].pending_offset, 0) + assert_eq(progress[3].pending_next, 3) + assert_eq(progress[3].last_transfer_event, 'chunk_ack') + assert_eq(progress[4].status, 'sending') + assert_eq(progress[4].sent, 3) + assert_eq(progress[4].pending_offset, 3) + assert_eq(progress[4].pending_next, 6) + assert_eq(progress[4].last_transfer_event, 'chunk_tx') +end + +function tests.test_sender_retries_begin_while_waiting_ready_and_then_completes() + local progress = {} + local req = make_req { + data = 'abc', + size = 3, + xfer_id = 'xfer-begin-retry', + timeout_s = 2.5, + on_progress = function (p) + progress[#progress + 1] = p + return true + end, + } + + local out = collect_result(req, function (io) + local begin1 = recv_with_timeout(io.control_rx, 'begin') + assert_eq(begin1.frame.type, 'xfer_begin') + assert_eq(begin1.frame.xfer_id, 'xfer-begin-retry') + assert_eq(queue.try_recv_now(io.bulk_rx), nil, 'sender must not send bulk before xfer_ready') + + local begin2 = recv_with_timeout(io.control_rx, 'begin retry', 1.3) + assert_eq(begin2.frame.type, 'xfer_begin') + assert_eq(begin2.frame.xfer_id, 'xfer-begin-retry') + assert_eq(queue.try_recv_now(io.bulk_rx), nil, 'retry must not send bulk before xfer_ready') + + send_frame(io.frame_tx, assert(protocol.xfer_ready('xfer-begin-retry'))) + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-begin-retry', 0))) + + local chunk = recv_with_timeout(io.bulk_rx, 'chunk after ready') + assert_eq(chunk.frame.type, 'xfer_chunk') + assert_eq(chunk.frame.offset, 0) + assert_eq(chunk.frame.data, 'abc') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-begin-retry', 3))) + local commit = recv_with_timeout(io.control_rx, 'commit') + assert_eq(commit.frame.type, 'xfer_commit') + send_frame(io.frame_tx, assert(protocol.xfer_done('xfer-begin-retry'))) + end, { timeout_s = 2.5 }) + + assert_eq(out.status, 'ok', tostring(out.value)) + assert_eq(out.value.sent_bytes, 3) + assert_eq(progress[1].status, 'waiting_ready') + assert_eq(progress[1].sent, 0) + assert_eq(progress[1].last_transfer_event, 'begin_tx') + assert_eq(progress[2].status, 'waiting_ready') + assert_eq(progress[2].sent, 0) + assert_eq(progress[2].last_transfer_event, 'begin_retry_tx') + assert_eq(progress[3].status, 'sending') + assert_eq(progress[3].sent, 0) + assert_eq(progress[3].pending_offset, 0) + assert_eq(progress[3].pending_next, 3) + assert_eq(progress[3].last_transfer_event, 'chunk_tx') +end + +function tests.test_sender_treats_need_zero_as_implicit_ready() + local req = make_req { + data = 'abc', + size = 3, + xfer_id = 'xfer-implicit-ready', + timeout_s = 0.2, + } + + local out = collect_result(req, function (io) + recv_with_timeout(io.control_rx, 'begin') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-implicit-ready', 0))) + local chunk = recv_with_timeout(io.bulk_rx, 'chunk after implicit ready') + assert_eq(chunk.frame.type, 'xfer_chunk') + assert_eq(chunk.frame.offset, 0) + assert_eq(chunk.frame.data, 'abc') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-implicit-ready', 3))) + local commit = recv_with_timeout(io.control_rx, 'commit') + assert_eq(commit.frame.type, 'xfer_commit') + send_frame(io.frame_tx, assert(protocol.xfer_done('xfer-implicit-ready'))) + end, { timeout_s = 0.2 }) + + assert_eq(out.status, 'ok', tostring(out.value)) + assert_eq(out.value.sent_bytes, 3) end -function tests.test_sender_resends_cached_chunk_when_receiver_reasks_same_offset() - local req = make_req { data = 'abcdef', size = 6, xfer_id = 'xfer-retry' } +function tests.test_sender_rejects_nonzero_need_while_waiting_ready() + local req = make_req { + data = 'abc', + size = 3, + xfer_id = 'xfer-waiting-ready-bad-need', + timeout_s = 0.2, + } + + local out = collect_result(req, function (io) + recv_with_timeout(io.control_rx, 'begin') + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-waiting-ready-bad-need', 1))) + end, { timeout_s = 0.2 }) + + assert_eq(out.status, 'failed') + assert_match(out.value, 'unexpected_need') +end + +function tests.test_sender_trace_logs_are_quiet_by_default_and_enabled_by_flag() + local function run_once(trace_io) + local req = make_req { + data = 'abc', + size = 3, + xfer_id = trace_io and 'xfer-trace-on' or 'xfer-trace-off', + } + + return capture_prints(function () + local out = collect_result(req, function (io) + recv_with_timeout(io.control_rx, 'begin') + send_frame(io.frame_tx, assert(protocol.xfer_ready(req.xfer_id))) + send_frame(io.frame_tx, assert(protocol.xfer_need(req.xfer_id, 0))) + recv_with_timeout(io.bulk_rx, 'chunk') + send_frame(io.frame_tx, assert(protocol.xfer_need(req.xfer_id, 3))) + recv_with_timeout(io.control_rx, 'commit') + send_frame(io.frame_tx, assert(protocol.xfer_done(req.xfer_id))) + end, { trace_io = trace_io }) + + assert_eq(out.status, 'ok', tostring(out.value)) + end) + end + + local quiet = run_once(false) + assert_eq(#quiet, 0) + + local noisy = run_once(true) + assert_true(#noisy > 0, 'trace_io=true should enable transfer diagnostics') + assert_match(noisy[1], '%[fabric%-xfer%-tx%]') +end + +function tests.test_sender_resends_cached_chunk_when_receiver_reasks_same_offset_after_delay() + local req = make_req { + data = 'abcdef', + size = 6, + xfer_id = 'xfer-retry', + timeout_s = 1.0, + } local out = collect_result(req, function (io) recv_with_timeout(io.control_rx, 'begin') @@ -233,7 +412,9 @@ function tests.test_sender_resends_cached_chunk_when_receiver_reasks_same_offset assert_eq(chunk1.frame.data, 'abc') -- Same offset means the receiver did not advance. The sender must - -- resend the cached frame without reading from the source again. + -- resend the cached frame without reading from the source again once + -- the previous copy has had time to leave the host. + fibers.perform(sleep.sleep_op(0.3)) send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-retry', 0))) local retry1 = recv_with_timeout(io.bulk_rx, 'chunk1 retry') assert_eq(retry1.frame.offset, 0) @@ -248,13 +429,124 @@ function tests.test_sender_resends_cached_chunk_when_receiver_reasks_same_offset local commit = recv_with_timeout(io.control_rx, 'commit') assert_eq(commit.frame.type, 'xfer_commit') send_frame(io.frame_tx, assert(protocol.xfer_done('xfer-retry'))) - end) + end, { timeout_s = 1.0 }) assert_eq(out.status, 'ok', tostring(out.value)) assert_eq(out.value.sent_bytes, 6) assert_eq(out.value.retransmits, 1) end +function tests.test_sender_coalesces_immediate_duplicate_need_for_pending_offset() + local req = make_req { + data = 'abcdef', + size = 6, + xfer_id = 'xfer-coalesce', + timeout_s = 0.2, + } + + local out = collect_result(req, function (io) + recv_with_timeout(io.control_rx, 'begin') + send_frame(io.frame_tx, assert(protocol.xfer_ready('xfer-coalesce'))) + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-coalesce', 0))) + fibers.perform(sleep.sleep_op(0.01)) + + -- A duplicate request that arrives immediately after the original send + -- should not enqueue an identical chunk while the first copy is still + -- queued or on the wire. + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-coalesce', 0))) + fibers.perform(sleep.sleep_op(0.01)) + + local chunk1 = recv_with_timeout(io.bulk_rx, 'chunk1') + assert_eq(chunk1.frame.offset, 0) + assert_eq(queue.try_recv_now(io.bulk_rx), nil, 'duplicate need should be coalesced') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-coalesce', 3))) + local chunk2 = recv_with_timeout(io.bulk_rx, 'chunk2') + assert_eq(chunk2.frame.offset, 3) + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-coalesce', 6))) + local commit = recv_with_timeout(io.control_rx, 'commit') + assert_eq(commit.frame.type, 'xfer_commit') + send_frame(io.frame_tx, assert(protocol.xfer_done('xfer-coalesce'))) + end, { timeout_s = 0.2 }) + + assert_eq(out.status, 'ok', tostring(out.value)) + assert_eq(out.value.sent_bytes, 6) + assert_eq(out.value.retransmits, 0) +end + +function tests.test_sender_waits_for_transient_bulk_queue_backpressure() + local req = make_req { + data = 'abcdef', + size = 6, + xfer_id = 'xfer-backpressure', + timeout_s = 0.2, + } + + local out = collect_result(req, function (io) + recv_with_timeout(io.control_rx, 'begin') + send_frame(io.frame_tx, assert(protocol.xfer_ready('xfer-backpressure'))) + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-backpressure', 0))) + + fibers.perform(sleep.sleep_op(0.02)) + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-backpressure', 3))) + fibers.perform(sleep.sleep_op(0.02)) + + local chunk1 = recv_with_timeout(io.bulk_rx, 'queued chunk1') + assert_eq(chunk1.frame.offset, 0) + + local chunk2 = recv_with_timeout(io.bulk_rx, 'chunk2 after backpressure') + assert_eq(chunk2.frame.offset, 3) + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-backpressure', 6))) + local commit = recv_with_timeout(io.control_rx, 'commit') + assert_eq(commit.frame.type, 'xfer_commit') + send_frame(io.frame_tx, assert(protocol.xfer_done('xfer-backpressure'))) + end, { + bulk_queue_len = 1, + timeout_s = 0.2, + }) + + assert_eq(out.status, 'ok', tostring(out.value)) + assert_eq(out.value.sent_bytes, 6) +end + +function tests.test_sender_ignores_stale_need_for_already_advanced_offset() + local req = make_req { data = 'abcdefghi', size = 9, xfer_id = 'xfer-stale' } + + local out = collect_result(req, function (io) + recv_with_timeout(io.control_rx, 'begin') + send_frame(io.frame_tx, assert(protocol.xfer_ready('xfer-stale'))) + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-stale', 0))) + + local chunk1 = recv_with_timeout(io.bulk_rx, 'chunk1') + assert_eq(chunk1.frame.offset, 0) + assert_eq(chunk1.frame.data, 'abc') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-stale', 3))) + local chunk2 = recv_with_timeout(io.bulk_rx, 'chunk2') + assert_eq(chunk2.frame.offset, 3) + assert_eq(chunk2.frame.data, 'def') + + -- A delayed duplicate for an already accepted offset must not abort the + -- transfer or rewind the source. The current pending chunk remains 3..6. + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-stale', 0))) + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-stale', 6))) + local chunk3 = recv_with_timeout(io.bulk_rx, 'chunk3') + assert_eq(chunk3.frame.offset, 6) + assert_eq(chunk3.frame.data, 'ghi') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-stale', 9))) + local commit = recv_with_timeout(io.control_rx, 'commit') + assert_eq(commit.frame.type, 'xfer_commit') + send_frame(io.frame_tx, assert(protocol.xfer_done('xfer-stale'))) + end) + + assert_eq(out.status, 'ok', tostring(out.value)) + assert_eq(out.value.sent_bytes, 9) +end + function tests.test_sender_timeout_sends_abort_and_fails_attempt() local req = make_req { data = 'abc', size = 3, xfer_id = 'xfer-timeout', timeout_s = 0.01 } @@ -267,6 +559,38 @@ function tests.test_sender_timeout_sends_abort_and_fails_attempt() assert_match(out.value, 'timeout') end +function tests.test_sender_waiting_ready_timeout_reports_begin_attempts() + local req = make_req { + data = 'abc', + size = 3, + xfer_id = 'xfer-waiting-ready-timeout', + timeout_s = 10, + begin_retry_interval_s = 0.01, + begin_max_attempts = 3, + begin_startup_timeout_s = 0.05, + } + + local out = collect_result(req, function (io) + local begin1 = recv_with_timeout(io.control_rx, 'begin') + assert_eq(begin1.frame.type, 'xfer_begin') + + local begin2 = recv_with_timeout(io.control_rx, 'begin retry 2', 0.1) + assert_eq(begin2.frame.type, 'xfer_begin') + assert_eq(begin2.frame.xfer_id, 'xfer-waiting-ready-timeout') + + local begin3 = recv_with_timeout(io.control_rx, 'begin retry 3', 0.1) + assert_eq(begin3.frame.type, 'xfer_begin') + assert_eq(begin3.frame.xfer_id, 'xfer-waiting-ready-timeout') + assert_eq(queue.try_recv_now(io.bulk_rx), nil, 'waiting_ready timeout must not send bulk') + end, { timeout_s = 10 }) + + assert_eq(out.status, 'failed') + assert_match(out.value, 'waiting_ready_timeout') + assert_match(out.value, 'state=waiting_ready') + assert_match(out.value, 'sent=0') + assert_match(out.value, 'begin_attempts=3') +end + function tests.test_sender_remote_abort_fails_without_echoing_abort() local req = make_req { data = 'abc', size = 3, xfer_id = 'xfer-abort' } @@ -279,17 +603,150 @@ function tests.test_sender_remote_abort_fails_without_echoing_abort() assert_match(out.value, 'remote denied') end -function tests.test_sender_unexpected_offset_sends_abort_and_fails() - local req = make_req { data = 'abc', size = 3, xfer_id = 'xfer-offset' } +function tests.test_sender_recovers_from_future_need_before_pending_chunk() + local req = make_req { data = 'abc', size = 3, xfer_id = 'xfer-future-need' } + + local out = collect_result(req, function (io) + recv_with_timeout(io.control_rx, 'begin') + send_frame(io.frame_tx, assert(protocol.xfer_ready('xfer-future-need'))) + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-future-need', 1))) + + local chunk = recv_with_timeout(io.bulk_rx, 'chunk after future need') + assert_eq(chunk.frame.type, 'xfer_chunk') + assert_eq(chunk.frame.offset, 0) + assert_eq(chunk.frame.data, 'abc') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-future-need', 3))) + local commit = recv_with_timeout(io.control_rx, 'commit') + assert_eq(commit.frame.type, 'xfer_commit') + send_frame(io.frame_tx, assert(protocol.xfer_done('xfer-future-need'))) + end) + + assert_eq(out.status, 'ok', tostring(out.value)) + assert_eq(out.value.sent_bytes, 3) +end + +function tests.test_sender_recovers_from_future_need_while_chunk_pending() + local req = make_req { data = 'abcdef', size = 6, xfer_id = 'xfer-pending-future' } local out = collect_result(req, function (io) recv_with_timeout(io.control_rx, 'begin') - send_frame(io.frame_tx, assert(protocol.xfer_ready('xfer-offset'))) - send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-offset', 1))) + send_frame(io.frame_tx, assert(protocol.xfer_ready('xfer-pending-future'))) + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-pending-future', 0))) + + local chunk1 = recv_with_timeout(io.bulk_rx, 'chunk1') + assert_eq(chunk1.frame.offset, 0) + assert_eq(chunk1.frame.data, 'abc') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-pending-future', 6))) + assert_eq(queue.try_recv_now(io.bulk_rx), nil, 'future need should be coalesced while pending') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-pending-future', 3))) + local chunk2 = recv_with_timeout(io.bulk_rx, 'chunk2 after recovery') + assert_eq(chunk2.frame.offset, 3) + assert_eq(chunk2.frame.data, 'def') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-pending-future', 6))) + local commit = recv_with_timeout(io.control_rx, 'commit') + assert_eq(commit.frame.type, 'xfer_commit') + send_frame(io.frame_tx, assert(protocol.xfer_done('xfer-pending-future'))) end) + assert_eq(out.status, 'ok', tostring(out.value)) + assert_eq(out.value.sent_bytes, 6) +end + +function tests.test_sender_resends_commit_on_duplicate_need_size_after_interval() + local req = make_req { + data = 'abc', + size = 3, + xfer_id = 'xfer-commit-resend', + timeout_s = 1.0, + } + + local out = collect_result(req, function (io) + recv_with_timeout(io.control_rx, 'begin') + send_frame(io.frame_tx, assert(protocol.xfer_ready('xfer-commit-resend'))) + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-commit-resend', 0))) + recv_with_timeout(io.bulk_rx, 'chunk') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-commit-resend', 3))) + local commit1 = recv_with_timeout(io.control_rx, 'commit') + assert_eq(commit1.frame.type, 'xfer_commit') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-commit-resend', 3))) + fibers.perform(sleep.sleep_op(0.01)) + assert_eq(queue.try_recv_now(io.control_rx), nil, 'immediate duplicate commit need should be coalesced') + + fibers.perform(sleep.sleep_op(0.3)) + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-commit-resend', 3))) + local commit2 = recv_with_timeout(io.control_rx, 'commit retry') + assert_eq(commit2.frame.type, 'xfer_commit') + assert_eq(commit2.frame.xfer_id, 'xfer-commit-resend') + + send_frame(io.frame_tx, assert(protocol.xfer_done('xfer-commit-resend'))) + end, { timeout_s = 1.0 }) + + assert_eq(out.status, 'ok', tostring(out.value)) + assert_eq(out.value.sent_bytes, 3) + assert_eq(out.value.commit_resends, 1) +end + +function tests.test_sender_ignores_stale_need_while_committing() + local req = make_req { + data = 'abc', + size = 3, + xfer_id = 'xfer-commit-stale-need', + timeout_s = 0.2, + } + + local out = collect_result(req, function (io) + recv_with_timeout(io.control_rx, 'begin') + send_frame(io.frame_tx, assert(protocol.xfer_ready('xfer-commit-stale-need'))) + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-commit-stale-need', 0))) + recv_with_timeout(io.bulk_rx, 'chunk') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-commit-stale-need', 3))) + local commit = recv_with_timeout(io.control_rx, 'commit') + assert_eq(commit.frame.type, 'xfer_commit') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-commit-stale-need', 0))) + fibers.perform(sleep.sleep_op(0.01)) + assert_eq(queue.try_recv_now(io.control_rx), nil, 'stale commit need should not resend commit') + + send_frame(io.frame_tx, assert(protocol.xfer_done('xfer-commit-stale-need'))) + end, { timeout_s = 0.2 }) + + assert_eq(out.status, 'ok', tostring(out.value)) + assert_eq(out.value.sent_bytes, 3) + assert_eq(out.value.commit_resends, 0) +end + +function tests.test_sender_future_need_while_committing_times_out_without_refresh() + local req = make_req { + data = 'abc', + size = 3, + xfer_id = 'xfer-commit-future-need', + timeout_s = 0.06, + } + + local out = collect_result(req, function (io) + recv_with_timeout(io.control_rx, 'begin') + send_frame(io.frame_tx, assert(protocol.xfer_ready('xfer-commit-future-need'))) + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-commit-future-need', 0))) + recv_with_timeout(io.bulk_rx, 'chunk') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-commit-future-need', 3))) + local commit = recv_with_timeout(io.control_rx, 'commit') + assert_eq(commit.frame.type, 'xfer_commit') + + send_frame(io.frame_tx, assert(protocol.xfer_need('xfer-commit-future-need', 6))) + end, { timeout_s = 0.06 }) + assert_eq(out.status, 'failed') - assert_match(out.value, 'unexpected_offset') + assert_match(out.value, 'timeout') + assert_match(out.value, 'state=committing') + assert_match(out.value, 'last_need_next=6') end function tests.test_sender_source_read_error_sends_abort_and_fails() diff --git a/tests/unit/hal/control_store_provider_spec.lua b/tests/unit/hal/control_store_provider_spec.lua index 94a6a8b9d..151c646bb 100644 --- a/tests/unit/hal/control_store_provider_spec.lua +++ b/tests/unit/hal/control_store_provider_spec.lua @@ -135,6 +135,24 @@ function T.get_op_returns_not_found_for_missing_key() rm_rf(root) end +function T.get_op_returns_not_found_for_stale_index_entry() + local root = mk_tmpdir('csp-stale-index') + local provider = fresh_provider(root) + + local index = assert(io.open(root .. '/.control_store_index', 'wb')) + index:write('stale\n') + index:close() + + runfibers.run(function() + local fibers = require 'fibers' + local ok, err = fibers.perform(provider:get_op(assert(cap_args.new.ControlStoreGetOpts('stale')))) + assert(ok == false) + assert(tostring(err):match('not found')) + end) + + rm_rf(root) +end + function T.delete_op_removes_key_from_index_and_truncates_file() local root = mk_tmpdir('csp-delete') local provider = fresh_provider(root) diff --git a/tests/unit/hal/uart_manager_spec.lua b/tests/unit/hal/uart_manager_spec.lua index 012864b18..60cdfb6a9 100644 --- a/tests/unit/hal/uart_manager_spec.lua +++ b/tests/unit/hal/uart_manager_spec.lua @@ -68,6 +68,39 @@ function T.apply_config_op_adds_uart_driver_and_emits_added_event() assert(type(ev.capabilities) == 'table' and #ev.capabilities == 1) assert(ev.capabilities[1].class == 'uart') assert(ev.meta.path == port.slave_name) + assert(ev.meta.source_id == 'uart_mcu') + + local ok_stop, err_stop = fibers.perform(M.shutdown_op()) + assert(ok_stop == true, tostring(err_stop)) + end) +end + +function T.apply_config_op_accepts_hal_uart_serial_ports_shape() + local M = fresh_manager() + + runfibers.run(function(scope) + local port = pty.open(scope) + local dev_ev_ch = channel.new(8) + local cap_emit_ch = channel.new(16) + + local ok_start, err_start = fibers.perform(M.start_op(nil, dev_ev_ch, cap_emit_ch)) + assert(ok_start == true, tostring(err_start)) + + local ok_cfg, err_cfg = fibers.perform(M.apply_config_op({ + serial_ports = { + { id = 'uart0', path = port.slave_name, baud = 115200, mode = '8N1' }, + }, + })) + assert(ok_cfg == true, tostring(err_cfg)) + + local ev = recv_or_fail(dev_ev_ch) + assert(ev.event_type == 'added') + assert(ev.class == 'uart') + assert(ev.id == 'uart0') + assert(type(ev.capabilities) == 'table' and #ev.capabilities == 1) + assert(ev.capabilities[1].class == 'uart') + assert(ev.capabilities[1].id == 'uart0') + assert(ev.meta.source_id == 'uart_uart0') local ok_stop, err_stop = fibers.perform(M.shutdown_op()) assert(ok_stop == true, tostring(err_stop)) diff --git a/tests/unit/main/service_spec.lua b/tests/unit/main/service_spec.lua index a0bd146e5..05c6b87e5 100644 --- a/tests/unit/main/service_spec.lua +++ b/tests/unit/main/service_spec.lua @@ -44,4 +44,66 @@ function T.main_fails_boot_when_service_load_fails() assert(tostring(err):match('boot failed')) end +function T.main_builds_ui_admin_auth_from_env() + local opts = mainmod._test.build_service_opts({}, function(name) + if name == 'DEVICECODE_UI_ADMIN_PASSWORD' then return 'e2e' end + return nil + end) + + local admin = assert(opts.ui.auth_opts.users.admin) + assert(admin.password == 'e2e') + assert(admin.principal.kind == 'user') + assert(admin.principal.id == 'admin') + assert(admin.principal.roles[1] == 'admin') +end + +function T.main_preserves_explicit_ui_auth_opts() + local explicit = { users = { tester = { password = 'test-password' } } } + local opts = mainmod._test.build_service_opts({ + ui = { auth_opts = explicit }, + }, function(name) + if name == 'DEVICECODE_UI_ADMIN_PASSWORD' then return 'e2e' end + return nil + end) + + assert(opts.ui.auth_opts == explicit) + assert(opts.ui.auth_opts.users.admin == nil) +end + +function T.main_forwards_service_opts_to_service_start() + local auth_opts = { users = { admin = { password = 'e2e' } } } + local started = mainmod._test.service_start_opts('ui', 'dev', function() end, { + auth_opts = auth_opts, + run_http = true, + }) + + assert(started.name == 'ui') + assert(started.env == 'dev') + assert(started.auth_opts == auth_opts) + assert(started.run_http == true) + assert(type(started.connect) == 'function') +end + +function T.main_summarises_ui_auth_without_passwords() + local count, first = mainmod._test.auth_user_summary({ + auth_opts = { + users = { + admin = { password = 'e2e' }, + }, + }, + }) + + assert(count == 1) + assert(first == 'admin') +end + +function T.main_bootstrap_prefers_local_source_in_dev() + local f = assert(io.open('../src/main.lua', 'r')) + local src = f:read('*a') + f:close() + + local dev_branch = src:match("else(.*)end") + assert(dev_branch and dev_branch:find("add_path%('%./'%)"), 'dev bootstrap should prepend local source path') +end + return T diff --git a/tests/unit/ui/test_http_request.lua b/tests/unit/ui/test_http_request.lua index 92b4afb53..f82e4d037 100644 --- a/tests/unit/ui/test_http_request.lua +++ b/tests/unit/ui/test_http_request.lua @@ -54,6 +54,16 @@ local function fake_ctx(method, path) } end +local function body_from_chunks(chunks) + return { + _chunks = chunks, + read_chunk_op = function (self) + local chunk = table.remove(self._chunks, 1) + return fibers.always(chunk, nil) + end, + } +end + function tests.test_http_read_request_uses_model_and_replies_once() run_fibers.run(function (scope) local model = read_model.new() @@ -67,6 +77,78 @@ function tests.test_http_read_request_uses_model_and_replies_once() end) end +function tests.test_fabric_link_route_returns_link_component_projection() + run_fibers.run(function (scope) + local model = read_model.new() + model:set({ 'state', 'fabric', 'link', 'mcu0', 'component', 'session' }, { + kind = 'fabric.component', + link_id = 'mcu0', + component = 'session', + snapshot = { + phase = 'established', + established = true, + peer_node = 'mcu-1', + }, + }) + model:set({ 'state', 'fabric', 'link', 'mcu0', 'component', 'transfer_manager' }, { + kind = 'fabric.component', + link_id = 'mcu0', + component = 'transfer_manager', + state = 'ready', + snapshot = { + active = nil, + last = nil, + }, + }) + + local ctx = fake_ctx('GET', '/api/fabric/link/mcu0') + local result = request.run(scope, ctx, { model = model }) + assert_eq(result.status, 'ok') + assert_eq(#ctx.replies, 1) + assert_eq(ctx.replies[1].status, 200) + local decoded = assert(cjson.decode(ctx.replies[1].body)) + assert_eq(decoded.session.link_id, 'mcu0') + assert_eq(decoded.session.status.ready, true) + assert_eq(decoded.session.status.state, 'ready') + assert_eq(decoded.session.status.phase, 'established') + assert_eq(decoded.session.status.established, true) + assert_eq(decoded.session.status.peer_node, 'mcu-1') + assert_eq(decoded.transfer_manager.link_id, 'mcu0') + end) +end + +function tests.test_login_returns_nested_session_and_compat_session_id() + run_fibers.run(function (scope) + local ctx = fake_ctx('POST', '/api/login') + ctx.body = { username = 'admin', password = 'e2e' } + local sessions = { + create = function (_, principal, opts) + assert_eq(principal.id, 'admin') + assert_not_nil(opts and opts.data) + return { id = 'sid-1', principal = principal } + end, + } + + local result = request.run(scope, ctx, { + auth = function (credentials) + if credentials.username == 'admin' and credentials.password == 'e2e' then + return { kind = 'user', id = 'admin' }, nil + end + return nil, 'unauthenticated' + end, + sessions = sessions, + }) + + assert_eq(result.status, 'ok') + assert_eq(result.session_id, 'sid-1') + assert_eq(#ctx.replies, 1) + assert_eq(ctx.replies[1].status, 200) + local decoded = assert(cjson.decode(ctx.replies[1].body)) + assert_eq(decoded.session.id, 'sid-1') + assert_eq(decoded.session_id, 'sid-1') + end) +end + function tests.test_http_response_writer_may_yield_inside_request_scope_without_blocking_peers() run_fibers.run(function (scope) @@ -160,6 +242,105 @@ function tests.test_http_command_route_rejects_non_json_body() end) end +function tests.test_update_upload_route_uses_session_principal_for_bus_calls() + run_fibers.run(function (scope) + local captured_principal + local calls = {} + local conn = { + call_op = function (_, topic, payload) + calls[#calls + 1] = { topic = topic, payload = payload } + local method = topic[#topic] + if topic[2] == 'artifact-ingest' and method == 'create' then + return fibers.always({ ingest = { ingest_id = payload.ingest_id or 'ing-upload' } }, nil) + elseif topic[2] == 'artifact-ingest' and method == 'append' then + return fibers.always({ ok = true }, nil) + elseif topic[2] == 'artifact-ingest' and method == 'commit' then + return fibers.always({ commit = { artifact = { artifact_ref = 'artifact-1' } } }, nil) + elseif topic[2] == 'update-manager' and method == 'create-job' then + assert_eq(payload.component, 'mcu') + assert_eq(payload.metadata.image_id, 'mcu-dev-15.0') + assert_eq(payload.metadata.expected_image_id, 'mcu-dev-15.0') + assert_eq(payload.metadata.version, '15.0') + assert_eq(payload.metadata.build, 'fw-update-e2e-15.0') + assert_eq(payload.metadata.transfer_chunk_raw, 1024) + return fibers.always({ job = { job_id = 'job-1', component = payload.component } }, nil) + elseif topic[2] == 'update-manager' and method == 'start-job' then + assert_eq(payload.job_id, 'job-1') + return fibers.always({ ok = true }, nil) + end + return fibers.always(nil, 'unexpected method') + end, + disconnect = function () return true end, + } + local bus = { + connect = function (_, opts) + captured_principal = opts and opts.principal + return conn, nil + end, + } + local sessions = { + get = function (_, sid) + if sid == 'sid-1' then + return { id = sid, principal = { kind = 'user', id = 'tester', roles = { 'admin' } } } + end + return nil + end, + } + local ctx = fake_ctx('POST', '/api/update/upload') + ctx.headers = { + ['x-session-id'] = 'sid-1', + ['x-artifact-component'] = 'mcu', + ['x-artifact-name'] = 'devicecode.dcmcu', + ['x-artifact-version'] = '15.0', + ['x-artifact-build'] = 'fw-update-e2e-15.0', + ['x-artifact-image-id'] = 'mcu-dev-15.0', + ['x-transfer-chunk-raw'] = '1024', + } + ctx.body_stream = body_from_chunks({ 'hello', 'world' }) + + local result = request.run(scope, ctx, { + sessions = sessions, + update = { + bus = bus, + ingest_id = 'ing-upload', + }, + encode_json = function (v) return assert(cjson.encode(v)) end, + }) + + assert_eq(result.status, 'ok') + assert_eq(captured_principal.id, 'tester') + assert_eq(calls[1].topic[5], 'create') + assert_eq(calls[2].topic[5], 'append') + assert_eq(calls[3].topic[5], 'append') + assert_eq(calls[4].topic[5], 'commit') + assert_eq(calls[5].topic[5], 'create-job') + assert_eq(calls[6].topic[5], 'start-job') + assert_eq(result.job.job_id, 'job-1') + assert_eq(#ctx.replies, 1) + assert_eq(ctx.replies[1].status, 200) + end) +end + +function tests.test_update_upload_route_rejects_missing_session() + run_fibers.run(function (scope) + local ctx = fake_ctx('POST', '/api/update/upload') + ctx.body_stream = body_from_chunks({ 'hello' }) + local result = request.run(scope, ctx, { + sessions = { get = function () return nil end }, + update = { + ingest = { + open_ingest_op = function () error('upload must not start without a principal') end, + }, + }, + encode_json = function (v) return assert(cjson.encode(v)) end, + }) + + assert_eq(result.status, 'unauthenticated') + assert_eq(#ctx.replies, 1) + assert_eq(ctx.replies[1].status, 401) + end) +end + function tests.test_unknown_route_replies_not_found_once() run_fibers.run(function (scope) local ctx = fake_ctx('GET', '/api/nope') diff --git a/tests/unit/ui/test_update_upload.lua b/tests/unit/ui/test_update_upload.lua index 16268ae84..11166f235 100644 --- a/tests/unit/ui/test_update_upload.lua +++ b/tests/unit/ui/test_update_upload.lua @@ -78,6 +78,145 @@ function tests.test_committed_artifact_is_not_aborted_when_update_job_create_fai end) end +function tests.test_upload_discards_created_job_when_auto_start_fails() + fibers.run(function () + local handle = { + append_chunk_op = function () return fibers.always(true, nil) end, + commit_op = function () return fibers.always('artifact-3', nil) end, + abort_now = function () error('abort must not be called after commit') end, + } + local calls = {} + local conn = { + call_op = function (_, topic, payload) + local method = topic[5] + calls[#calls + 1] = { method = method, payload = payload } + if method == 'create-job' then return fibers.always({ job_id = 'job-1' }, nil) end + if method == 'start-job' then return fibers.always(nil, 'slot_busy') end + if method == 'discard-job' then return fibers.always({ ok = true }, nil) end + return fibers.always(nil, 'unexpected_method:' .. tostring(method)) + end, + } + + local st, _, primary = fibers.perform(upload.run_op({ + body_stream = body_from_chunks({ 'abc' }), + }, { + ingest = ingest_client(handle), + create_job = true, + start_job = true, + conn = conn, + })) + + assert_eq(st, 'failed') + assert_eq(primary, 'slot_busy') + assert_eq(#calls, 3) + assert_eq(calls[1].method, 'create-job') + assert_eq(calls[2].method, 'start-job') + assert_eq(calls[3].method, 'discard-job') + assert_eq(calls[3].payload.job_id, 'job-1') + assert_eq(calls[3].payload.reason, 'upload_start_failed:slot_busy') + end) +end + +function tests.test_upload_preserves_start_error_when_discard_cleanup_fails() + fibers.run(function () + local handle = { + append_chunk_op = function () return fibers.always(true, nil) end, + commit_op = function () return fibers.always('artifact-4', nil) end, + abort_now = function () error('abort must not be called after commit') end, + } + local conn = { + call_op = function (_, topic) + local method = topic[5] + if method == 'create-job' then return fibers.always({ job_id = 'job-2' }, nil) end + if method == 'start-job' then return fibers.always(nil, 'slot_busy') end + if method == 'discard-job' then return fibers.always(nil, 'discard_denied') end + return fibers.always(nil, 'unexpected_method:' .. tostring(method)) + end, + } + + local st, _, primary = fibers.perform(upload.run_op({ + body_stream = body_from_chunks({ 'abc' }), + }, { + ingest = ingest_client(handle), + create_job = true, + start_job = true, + conn = conn, + })) + + assert_eq(st, 'failed') + assert_eq(primary, 'slot_busy; discard_job_failed:discard_denied') + end) +end + +function tests.test_upload_does_not_discard_created_job_on_ambiguous_start_timeout() + fibers.run(function () + local handle = { + append_chunk_op = function () return fibers.always(true, nil) end, + commit_op = function () return fibers.always('artifact-timeout-start', nil) end, + abort_now = function () error('abort must not be called after commit') end + } + local calls = {} + local conn = { + call_op = function (_, topic, payload) + local method = topic[5] + calls[#calls + 1] = { method = method, payload = payload } + if method == 'create-job' then return fibers.always({ job_id = 'job-timeout-start' }, nil) end + if method == 'start-job' then return fibers.always(nil, 'timeout') end + if method == 'discard-job' then return fibers.always({ ok = true }, nil) end + return fibers.always(nil, 'unexpected_method:' .. tostring(method)) + end, + } + + local st, _, primary = fibers.perform(upload.run_op({ + body_stream = body_from_chunks({ 'abc' }), + }, { + ingest = ingest_client(handle), + create_job = true, + start_job = true, + conn = conn, + })) + + assert_eq(st, 'failed') + assert_eq(primary, 'timeout') + assert_eq(#calls, 2) + assert_eq(calls[1].method, 'create-job') + assert_eq(calls[2].method, 'start-job') + end) +end + +function tests.test_upload_auto_start_success_does_not_discard_job() + fibers.run(function () + local handle = { + append_chunk_op = function () return fibers.always(true, nil) end, + commit_op = function () return fibers.always('artifact-5', nil) end, + abort_now = function () error('abort must not be called after commit') end, + } + local discard_called = false + local conn = { + call_op = function (_, topic) + local method = topic[5] + if method == 'create-job' then return fibers.always({ job_id = 'job-3' }, nil) end + if method == 'start-job' then return fibers.always({ ok = true, accepted = true }, nil) end + if method == 'discard-job' then discard_called = true; return fibers.always({ ok = true }, nil) end + return fibers.always(nil, 'unexpected_method:' .. tostring(method)) + end, + } + + local st, _, result = fibers.perform(upload.run_op({ + body_stream = body_from_chunks({ 'abc' }), + }, { + ingest = ingest_client(handle), + create_job = true, + start_job = true, + conn = conn, + })) + + assert_eq(st, 'ok') + assert_eq(result.artifact_id, 'artifact-5') + assert_eq(discard_called, false) + end) +end + function tests.test_upload_disconnects_owned_update_connection_after_success() fibers.run(function () local handle = { diff --git a/tests/unit/update/test_active_runtime.lua b/tests/unit/update/test_active_runtime.lua index 62e56be38..18e4ee267 100644 --- a/tests/unit/update/test_active_runtime.lua +++ b/tests/unit/update/test_active_runtime.lua @@ -8,6 +8,11 @@ local function assert_eq(a,b,msg) if a ~= b then fail(msg or ('expected '..tostr local function assert_true(v,msg) if v ~= true then fail(msg or ('expected true, got '..tostring(v))) end end local function assert_nil(v,msg) if v ~= nil then fail(msg or ('expected nil, got '..tostring(v))) end end local function assert_not_nil(v,msg) if v == nil then fail(msg or 'expected non-nil') end end +local function assert_contains(s, needle, msg) + if tostring(s or ''):find(tostring(needle), 1, true) == nil then + fail(msg or ('expected '..tostring(s)..' to contain '..tostring(needle))) + end +end local function stage_backend(result) return { stage_op = function () return op.always(result or { ok = true }, nil) end } @@ -185,4 +190,87 @@ function tests.test_component_apply_failure_after_start_keeps_completion_and_rep end) end +function tests.test_cleanup_releases_terminal_reconcile_active_and_reports_change() + fibers.run(function (scope) + local service_tx, service_rx = mailbox.new(8, { full = 'reject_newest' }) + local cancelled + local fake_jobs = { + get = function (_, job_id) + if job_id == 'j1' then return { job_id='j1', state='cancelled' } end + return nil + end, + list = function () return {} end, + } + local component = assert(active.start_component(scope, { + service_id = 'update', + done_tx = service_tx, + work_scope = scope, + jobs = fake_jobs, + })) + local lease = assert(component:claim({ job_id='j1', generation=1, phase='reconcile', token='tok-reconcile' })) + assert_true(lease:handoff()) + component:state().active.handle = { cancel = function (_, reason) cancelled = reason end } + + local ok, reason = component:consider_jobs() + assert_eq(ok, false) + assert_eq(reason, 'no_reconcile_adoption') + assert_nil(component:state().active) + assert_contains(cancelled, 'stale_reconcile_terminal_job') + + local ev = fibers.perform(service_rx:recv_op()) + assert_eq(ev.kind, 'active_runtime_changed') + assert_eq(ev.reason, 'stale_active_cleanup') + assert_eq(ev.action, 'released') + assert_eq(ev.phase, 'reconcile') + assert_nil(ev.snapshot.active) + component:cancel('test complete') + end) +end + +function tests.test_cleanup_releases_missing_reconcile_active() + local state = active.new_state() + local cancelled + local lease = assert(active.claim(state, { job_id='missing', generation=1, phase='reconcile', token='tok-missing' })) + assert_true(lease:handoff()) + state.active.handle = { cancel = function (_, reason) cancelled = reason end } + local changed, reason, extra = active.cleanup_stale_active(state, { get = function () return nil end }) + assert_true(changed) + assert_eq(reason, 'stale_reconcile_missing_job') + assert_eq(extra.action, 'released') + assert_nil(state.active) + assert_eq(cancelled, 'stale_reconcile_missing_job') + + local ok, err = active.apply_completion(state, { + kind='active_job_done', + job_id='missing', + generation=1, + phase='reconcile', + token='tok-missing', + status='ok', + }) + assert_eq(ok, false) + assert_eq(err, 'stale') +end + +function tests.test_cleanup_stage_or_commit_terminal_job_only_requests_cancel() + local state = active.new_state() + local cancelled + local lease = assert(active.claim(state, { job_id='j1', generation=1, phase='commit', token='tok-commit' })) + assert_true(lease:handoff()) + state.active.handle = { cancel = function (_, reason) cancelled = reason end } + local jobs = { get = function () return { job_id='j1', state='cancelled' } end } + + local changed, reason, extra = active.cleanup_stale_active(state, jobs) + assert_true(changed) + assert_contains(reason, 'stale_commit_terminal_job') + assert_eq(extra.action, 'cancel_requested') + assert_not_nil(state.active) + assert_eq(state.active.status, 'cancelling') + assert_contains(cancelled, 'stale_commit_terminal_job') + + local changed_again = active.cleanup_stale_active(state, jobs) + assert_eq(changed_again, false) + assert_not_nil(state.active) +end + return tests diff --git a/tests/unit/update/test_artifact_store_update_adapters.lua b/tests/unit/update/test_artifact_store_update_adapters.lua index 6703abfe4..bcc8a46b9 100644 --- a/tests/unit/update/test_artifact_store_update_adapters.lua +++ b/tests/unit/update/test_artifact_store_update_adapters.lua @@ -15,6 +15,54 @@ local function assert_true(v, msg) if v ~= true then error(msg or ('expected true, got ' .. tostring(v)), 2) end end +local function fabric_fact(peer_sid, session_generation, link_id, link_generation) + return { + seen = true, + updated_at = 10, + fabric = { + peer_sid = peer_sid or 'mcu-sid-1', + session_generation = session_generation or 7, + link_id = link_id or 'mcu-uart0', + link_generation = link_generation or 3, + }, + } +end + +local function coherent_control_plane() + return { + kind = 'mcu_control_plane', + facts = { + software = fabric_fact(), + updater = fabric_fact(), + health = fabric_fact(), + }, + source = { kind = 'member', member = 'mcu' }, + } +end + +local function fake_observer(software, overrides) + local state = { + software = software, + updater = { state = 'running' }, + health = { state = 'ok' }, + actions = { ['prepare-update'] = true, ['stage-update'] = true, ['commit-update'] = true }, + control_plane = coherent_control_plane(), + } + for k, v in pairs(overrides or {}) do state[k] = v end + if overrides and overrides.control_plane == false then state.control_plane = nil end + return { + snapshot = function() + return { + components = { + mcu = { + state = state, + }, + }, + } + end, + } +end + function T.artifact_store_bus_unwraps_hal_reply_envelopes() runfibers.run(function() local sink = { terminated = 0 } @@ -48,7 +96,10 @@ function T.artifact_store_bus_unwraps_hal_reply_envelopes() } local store = store_bus.new(conn) - local got_sink, sink_err = fibers.perform(store:create_sink_op({ meta = { component = 'mcu' }, policy = 'prefer_durable' })) + local got_sink, sink_err = fibers.perform(store:create_sink_op({ + meta = { component = 'mcu' }, + policy = 'prefer_durable', + })) assert_eq(got_sink, sink, tostring(sink_err)) local got_source, source_err = fibers.perform(store:open_source_op('artifact-1')) @@ -88,33 +139,44 @@ function T.component_backend_stage_op_runs_preflight_prepare_and_stage() return op.always(artifact, nil) end, open_source_op = function(_, ref) - assert_eq(ref, 'artifact-1') - return op.always(source, nil) + error('component backend must not open artifact source before Device stage action: ' .. tostring(ref), 0) end, } local seen_payload local seen_prepare + local seen_stage_opts local conn = { - call_op = function(_, topic, payload) + call_op = function(_, topic, payload, opts) assert_eq(topic[1], 'cap') assert_eq(topic[2], 'component') assert_eq(topic[4], 'rpc') if topic[5] == 'prepare-update' then seen_prepare = payload assert_eq(payload.target, 'mcu') - return op.always({ ok = true }, nil) + return op.always({ ok = true, max_chunk_size = 512 }, nil) end if topic[5] == 'stage-update' then seen_payload = payload + seen_stage_opts = opts return op.always({ ok = true, public_status = 'succeeded', value = { transferred = true } }, nil) end return op.always({ ok = true }, nil) end, } - local backend = component_backend.new({ conn = conn, artifact_store = artifact_store, component = 'mcu' }) - local job = { job_id = 'job-1', component = 'mcu', artifact_ref = 'artifact-1', metadata = { image_id = 'img-new' } } + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = fake_observer({ image_id = 'img-old', version = '1.0', boot_id = 'boot-old' }), + }) + local job = { + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + metadata = { image_id = 'img-new', transfer_chunk_raw = 1024 }, + } local staged, serr = fibers.perform(backend:stage_op(job, {})) assert_eq(type(staged), 'table', tostring(serr)) @@ -122,9 +184,616 @@ function T.component_backend_stage_op_runs_preflight_prepare_and_stage() assert_eq(seen_prepare.target, 'mcu') assert_eq(staged.preflight.size, 12) assert_eq(staged.transfer.size, 12) - assert_eq(seen_payload.source, source) + assert_eq(seen_payload.source, nil) + assert_eq(seen_payload.artifact_ref, 'artifact-1') assert_eq(seen_payload.size, 12) assert_eq(seen_payload.digest, 'abcd') + assert_eq(seen_payload.chunk_size, 1024) + assert_eq(seen_stage_opts.timeout, false) + end) +end + +function T.component_backend_commit_op_requires_explicit_acceptance() + runfibers.run(function() + local replies = { + accepted = { + reply = { accepted = true }, + ok = true, + }, + wrapped = { + reply = { ok = true, public_status = 'succeeded', value = { accepted = true } }, + ok = true, + }, + ok_false = { + reply = { ok = false, reason = 'commit_refused' }, + err = 'commit_refused', + }, + failed_public_status = { + reply = { ok = true, public_status = 'failed', err = 'commit_failed' }, + err = 'commit_failed', + }, + accepted_false = { + reply = { ok = true, public_status = 'succeeded', value = { accepted = false, reason = 'busy' } }, + err = 'busy', + }, + missing_accepted = { + reply = { ok = true, public_status = 'succeeded', value = { state = 'queued' } }, + err = 'component_commit_acceptance_missing', + }, + } + + for _, name in ipairs({ + 'accepted', + 'wrapped', + 'ok_false', + 'failed_public_status', + 'accepted_false', + 'missing_accepted', + }) do + local case = replies[name] + local conn = { + call_op = function(_, topic) + assert_eq(topic[5], 'commit-update') + return op.always(case.reply, nil) + end, + } + local backend = component_backend.new({ conn = conn, component = 'mcu' }) + local got, err = fibers.perform(backend:commit_op({ + job_id = 'job-commit', + component = 'mcu', + metadata = { image_id = 'img-new' }, + }, {})) + if case.ok then + assert_eq(type(got), 'table', tostring(err)) + assert_true(got.accepted) + else + assert_eq(got, nil) + assert_eq(err, case.err, name) + end + end + end) +end + +function T.component_backend_commit_op_treats_mcu_timeout_as_commit_failure() + runfibers.run(function() + local conn = { + call_op = function(_, topic) + assert_eq(topic[5], 'commit-update') + return op.always(nil, 'timeout') + end, + } + local backend = component_backend.new({ conn = conn, component = 'mcu' }) + + local got, err = fibers.perform(backend:commit_op({ + job_id = 'job-commit-timeout', + component = 'mcu', + metadata = { image_id = 'img-new' }, + }, {})) + + assert_eq(got, nil) + assert_eq(err, 'component_commit_update_failed:timeout') + end) +end + +function T.component_backend_stage_op_clamps_metadata_chunk_size_to_prepare_max() + runfibers.run(function() + local source = {} + function source:read_chunk_op() return op.always(nil, nil) end + local artifact = {} + function artifact:describe() + return { artifact_ref = 'artifact-1', size = 12, digest = 'abcd', meta = { image_id = 'img-new' } } + end + local artifact_store = { + open_op = function() return op.always(artifact, nil) end, + open_source_op = function() return op.always(source, nil) end, + } + local seen_payload + local conn = { + call_op = function(_, topic, payload) + if topic[5] == 'prepare-update' then + return op.always({ ok = true, max_chunk_size = 512 }, nil) + end + if topic[5] == 'stage-update' then + seen_payload = payload + return op.always({ ok = true, public_status = 'succeeded' }, nil) + end + return op.always({ ok = true }, nil) + end, + } + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = fake_observer({ image_id = 'img-old', version = '1.0', boot_id = 'boot-old' }), + }) + + local staged, serr = fibers.perform(backend:stage_op({ + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + metadata = { image_id = 'img-new', chunk_size = 2048 }, + }, {})) + assert_eq(type(staged), 'table', tostring(serr)) + assert_eq(seen_payload.chunk_size, 512) + end) +end + +function T.component_backend_stage_op_clamps_default_chunk_size_to_prepare_max() + runfibers.run(function() + local source = {} + function source:read_chunk_op() return op.always(nil, nil) end + local artifact = {} + function artifact:describe() + return { artifact_ref = 'artifact-1', size = 12, digest = 'abcd', meta = { image_id = 'img-new' } } + end + local artifact_store = { + open_op = function() return op.always(artifact, nil) end, + open_source_op = function() return op.always(source, nil) end, + } + local seen_payload + local conn = { + call_op = function(_, topic, payload) + if topic[5] == 'prepare-update' then + return op.always({ ok = true, max_chunk_size = 512 }, nil) + end + if topic[5] == 'stage-update' then + seen_payload = payload + return op.always({ ok = true, public_status = 'succeeded' }, nil) + end + return op.always({ ok = true }, nil) + end, + } + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = fake_observer({ image_id = 'img-old', version = '1.0', boot_id = 'boot-old' }), + chunk_size = 2048, + }) + + local staged, serr = fibers.perform(backend:stage_op({ + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + metadata = { image_id = 'img-new' }, + }, {})) + assert_eq(type(staged), 'table', tostring(serr)) + assert_eq(seen_payload.chunk_size, 512) + end) +end + +function T.component_backend_stage_op_uses_prepare_max_when_no_chunk_size_selected() + runfibers.run(function() + local artifact = {} + function artifact:describe() + return { artifact_ref = 'artifact-1', size = 12, digest = 'abcd', meta = { image_id = 'img-new' } } + end + local artifact_store = { + open_op = function() return op.always(artifact, nil) end, + } + local seen_payload + local conn = { + call_op = function(_, topic, payload) + if topic[5] == 'prepare-update' then + return op.always({ ok = true, max_chunk_size = 512 }, nil) + end + if topic[5] == 'stage-update' then + seen_payload = payload + return op.always({ ok = true, public_status = 'succeeded' }, nil) + end + return op.always({ ok = true }, nil) + end, + } + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = fake_observer({ image_id = 'img-old', version = '1.0', boot_id = 'boot-old' }), + }) + + local staged, serr = fibers.perform(backend:stage_op({ + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + metadata = { image_id = 'img-new' }, + }, {})) + assert_eq(type(staged), 'table', tostring(serr)) + assert_eq(seen_payload.chunk_size, 512) + end) +end + +function T.component_backend_stage_op_keeps_configured_chunk_when_prepare_max_absent() + runfibers.run(function() + local source = {} + function source:read_chunk_op() return op.always(nil, nil) end + local artifact = {} + function artifact:describe() + return { artifact_ref = 'artifact-1', size = 12, digest = 'abcd', meta = { image_id = 'img-new' } } + end + local artifact_store = { + open_op = function() return op.always(artifact, nil) end, + open_source_op = function() return op.always(source, nil) end, + } + local seen_payload + local conn = { + call_op = function(_, topic, payload) + if topic[5] == 'prepare-update' then + return op.always({ ok = true }, nil) + end + if topic[5] == 'stage-update' then + seen_payload = payload + return op.always({ ok = true, public_status = 'succeeded' }, nil) + end + return op.always({ ok = true }, nil) + end, + } + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = fake_observer({ image_id = 'img-old', version = '1.0', boot_id = 'boot-old' }), + chunk_size = 2048, + }) + + local staged, serr = fibers.perform(backend:stage_op({ + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + metadata = { image_id = 'img-new' }, + }, {})) + assert_eq(type(staged), 'table', tostring(serr)) + assert_eq(seen_payload.chunk_size, 2048) + end) +end + +function T.component_backend_stage_op_requires_component_boot_id_before_prepare() + runfibers.run(function() + local calls = {} + local artifact_store = { + open_op = function() + calls.open = true + return op.always({}, nil) + end, + open_source_op = function() + calls.source = true + return op.always({}, nil) + end, + } + local conn = { + call_op = function(_, topic) + calls[topic[5]] = true + return op.always({ ok = true }, nil) + end, + } + + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = fake_observer({ image_id = 'img-old', version = '1.0' }), + }) + local job = { + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + metadata = { image_id = 'img-new' }, + } + + local staged, serr = fibers.perform(backend:stage_op(job, {})) + assert_eq(staged, nil) + assert_eq(serr, 'mcu_control_plane_not_ready:software_boot_id_unavailable') + assert_eq(calls.open, nil, 'artifact should not open before boot_id readiness') + assert_eq(calls['prepare-update'], nil, 'prepare-update should not be called before boot_id readiness') + assert_eq(calls['stage-update'], nil, 'stage-update should not be called before boot_id readiness') + end) +end + +function T.component_backend_stage_op_requires_mcu_critical_facts_before_artifact_open() + runfibers.run(function() + local calls = {} + local artifact_store = { + open_op = function() calls.open = true; return op.always({}, nil) end, + open_source_op = function() calls.source = true; return op.always({}, nil) end, + } + local conn = { + call_op = function(_, topic) + calls[topic[5]] = true + return op.always({ ok = true }, nil) + end, + } + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = fake_observer({ image_id = 'img-old', version = '1.0', boot_id = 'boot-old' }, { + updater = {}, + health = {}, + }), + }) + + local staged, serr = fibers.perform(backend:stage_op({ + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + }, {})) + assert_eq(staged, nil) + assert_eq(serr, 'mcu_control_plane_not_ready:missing_critical_facts:updater,health') + assert_eq(calls.open, nil, 'artifact should not open before critical state readiness') + assert_eq(calls['prepare-update'], nil, 'prepare-update should not be called before critical state readiness') + end) +end + +function T.component_backend_stage_op_requires_prepare_route_before_artifact_open() + runfibers.run(function() + local calls = {} + local artifact_store = { + open_op = function() calls.open = true; return op.always({}, nil) end, + open_source_op = function() calls.source = true; return op.always({}, nil) end, + } + local conn = { + call_op = function(_, topic) + calls[topic[5]] = true + return op.always({ ok = true }, nil) + end, + } + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = fake_observer({ image_id = 'img-old', version = '1.0', boot_id = 'boot-old' }, { + actions = { ['stage-update'] = true, ['commit-update'] = true }, + }), + }) + + local staged, serr = fibers.perform(backend:stage_op({ + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + }, {})) + assert_eq(staged, nil) + assert_eq(serr, 'mcu_control_plane_not_ready:prepare_route_missing') + assert_eq(calls.open, nil, 'artifact should not open before prepare route readiness') + assert_eq(calls['prepare-update'], nil, 'prepare-update should not be called before prepare route readiness') + end) +end + +function T.component_backend_stage_op_requires_mcu_fact_fabric_metadata() + runfibers.run(function() + local calls = {} + local artifact_store = { + open_op = function() calls.open = true; return op.always({}, nil) end, + } + local conn = { + call_op = function(_, topic) + calls[topic[5]] = true + return op.always({ ok = true }, nil) + end, + } + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = fake_observer({ image_id = 'img-old', version = '1.0', boot_id = 'boot-old' }, { + control_plane = false, + }), + }) + + local staged, serr = fibers.perform(backend:stage_op({ + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + }, {})) + assert_eq(staged, nil) + assert_eq(serr, 'mcu_control_plane_not_ready:fact_origin_missing:software,updater,health') + assert_eq(calls.open, nil, 'artifact should not open before fact session readiness') + assert_eq(calls['prepare-update'], nil, 'prepare-update should not be called before fact session readiness') + end) +end + +function T.component_backend_stage_op_rejects_mixed_mcu_fact_sessions() + runfibers.run(function() + local calls = {} + local artifact_store = { + open_op = function() calls.open = true; return op.always({}, nil) end, + } + local conn = { + call_op = function(_, topic) + calls[topic[5]] = true + return op.always({ ok = true }, nil) + end, + } + local cp = coherent_control_plane() + cp.facts.updater = fabric_fact('mcu-sid-2', 8) + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = fake_observer({ image_id = 'img-old', version = '1.0', boot_id = 'boot-old' }, { + control_plane = cp, + }), + }) + + local staged, serr = fibers.perform(backend:stage_op({ + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + }, {})) + assert_eq(staged, nil) + assert_eq(serr, 'mcu_control_plane_not_ready:mixed_fact_sessions') + assert_eq(calls.open, nil, 'artifact should not open with mixed critical fact sessions') + end) +end + +function T.component_backend_stage_op_rejects_mixed_mcu_fact_links_when_present() + runfibers.run(function() + local calls = {} + local artifact_store = { + open_op = function() calls.open = true; return op.always({}, nil) end, + } + local conn = { + call_op = function(_, topic) + calls[topic[5]] = true + return op.always({ ok = true }, nil) + end, + } + local cp = coherent_control_plane() + cp.facts.health = fabric_fact('mcu-sid-1', 7, 'mcu-uart1', 3) + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = fake_observer({ image_id = 'img-old', version = '1.0', boot_id = 'boot-old' }, { + control_plane = cp, + }), + }) + + local staged, serr = fibers.perform(backend:stage_op({ + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + }, {})) + assert_eq(staged, nil) + assert_eq(serr, 'mcu_control_plane_not_ready:mixed_fact_links') + assert_eq(calls.open, nil, 'artifact should not open with mixed critical fact links') + end) +end + +function T.component_backend_stage_op_admits_real_component_projection_shape() + runfibers.run(function() + local opened = false + local artifact = {} + function artifact:describe() + return { artifact_ref = 'artifact-1', size = 12, digest = 'abcd', meta = { image_id = 'img-new' } } + end + local artifact_store = { + open_op = function(_, ref) + opened = true + assert_eq(ref, 'artifact-1') + return op.always(artifact, nil) + end, + } + local conn = { + call_op = function(_, topic) + if topic[5] == 'prepare-update' then + return op.always({ ok = true, max_chunk_size = 512 }, nil) + end + if topic[5] == 'stage-update' then + return op.always({ ok = true, public_status = 'succeeded', value = { transferred = true } }, nil) + end + return op.always({ ok = true }, nil) + end, + } + local observer = { + snapshot = function() + return { + components = { + mcu = { + kind = 'device.component', + component = 'mcu', + software = { image_id = 'img-old', version = '1.0', boot_id = 'boot-old' }, + updater = { state = 'ready' }, + health = 'ok', + actions = { ['prepare-update'] = true, ['stage-update'] = true }, + source = { kind = 'member', member = 'mcu' }, + control_plane = coherent_control_plane(), + }, + }, + } + end, + } + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = observer, + }) + + local staged, serr = fibers.perform(backend:stage_op({ + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + metadata = { image_id = 'img-new' }, + }, {})) + + assert_eq(type(staged), 'table', tostring(serr)) + assert_true(opened, 'artifact metadata should open after real-shape admission passes') + end) +end + +function T.component_backend_stage_op_rejects_source_reason_before_artifact_open() + runfibers.run(function() + local calls = {} + local artifact_store = { + open_op = function() calls.open = true; return op.always({}, nil) end, + open_source_op = function() calls.source = true; return op.always({}, nil) end, + } + local conn = { + call_op = function(_, topic) + calls[topic[5]] = true + return op.always({ ok = true }, nil) + end, + } + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = fake_observer({ image_id = 'img-old', version = '1.0', boot_id = 'boot-old' }, { + source = { reason = 'liveness_timeout' }, + }), + }) + + local staged, serr = fibers.perform(backend:stage_op({ + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + }, {})) + assert_eq(staged, nil) + assert_eq(serr, 'mcu_control_plane_not_ready:liveness_timeout') + assert_eq(calls.open, nil, 'artifact should not open before source readiness') + assert_eq(calls['prepare-update'], nil, 'prepare-update should not be called before source readiness') + end) +end + +function T.component_backend_stage_op_labels_prepare_timeout() + runfibers.run(function() + local artifact = {} + function artifact:describe() + return { artifact_ref = 'artifact-1', size = 12, meta = { image_id = 'img-new' } } + end + + local artifact_store = { + open_op = function() + return op.always(artifact, nil) + end, + open_source_op = function() + error('open_source_op should not run after prepare timeout', 0) + end, + } + local conn = { + call_op = function(_, topic) + if topic[5] == 'prepare-update' then + return op.always(nil, 'timeout') + end + return op.always({ ok = true }, nil) + end, + } + + local backend = component_backend.new({ + conn = conn, + artifact_store = artifact_store, + component = 'mcu', + observer = fake_observer({ image_id = 'img-old', version = '1.0', boot_id = 'boot-old' }), + }) + local job = { + job_id = 'job-1', + component = 'mcu', + artifact_ref = 'artifact-1', + metadata = { image_id = 'img-new' }, + } + + local staged, serr = fibers.perform(backend:stage_op(job, {})) + assert_eq(staged, nil) + assert_eq(serr, 'component_prepare_update_failed:timeout') end) end diff --git a/tests/unit/update/test_job_runtime.lua b/tests/unit/update/test_job_runtime.lua index 0fbd1c7fa..54f72c7cb 100644 --- a/tests/unit/update/test_job_runtime.lua +++ b/tests/unit/update/test_job_runtime.lua @@ -1,5 +1,7 @@ local fibers = require 'fibers' +local sleep = require 'fibers.sleep' local op = require 'fibers.op' +local cond = require 'fibers.cond' local job_runtime = require 'services.update.job_runtime' local store_mod = require 'services.update.job_store_memory' @@ -9,6 +11,17 @@ local function assert_eq(a,b,msg) if a ~= b then fail(msg or ('expected '..tostr local function assert_true(v,msg) if v ~= true then fail(msg or ('expected true, got '..tostring(v))) end end local function assert_not_nil(v,msg) if v == nil then fail(msg or 'expected non-nil') end end +local function wait_until(fn, opts) + opts = opts or {} + local deadline = fibers.now() + (opts.timeout or 0.5) + local interval = opts.interval or 0.005 + while fibers.now() < deadline do + if fn() then return true end + fibers.perform(sleep.sleep_op(interval)) + end + return false +end + local function start_runtime(scope, params) local rt = assert(job_runtime.start(scope, params or {})) local ready, err = fibers.perform(rt:ready_op()) @@ -119,13 +132,277 @@ function tests.test_failed_after_admission_keeps_admitted_lifecycle_marker() end) assert_eq(st, 'ok') assert_eq(result.failed.status, 'failed') + assert_eq(result.failed.reason, 'job_store_save_failed:save_failed') + assert_eq(result.failed.store_op, 'save_job_op') + assert_eq(result.failed.store_err, 'job_store_save_failed:save_failed') local rec = result.transitions.by_id[result.failed.transition_id] assert_eq(rec.state, 'failed') - assert_eq(rec.error, 'save_failed') + assert_eq(rec.error, 'job_store_save_failed:save_failed') + assert_eq(rec.store_op, 'save_job_op') + assert_eq(rec.store_err, 'job_store_save_failed:save_failed') assert_true(rec.admitted, 'failed-after-admission should remain distinguishable from rejection') end) end +function tests.test_retryable_save_failure_stays_pending_and_recovers() + fibers.run(function () + local allow_save = false + local attempts = 0 + local saved + local store = { + save_job_op = function (_, job) + attempts = attempts + 1 + if not allow_save then return op.always(nil, 'control_store_put_timeout') end + saved = job + return op.always(true, nil) + end, + } + + local st, _, result = fibers.run_scope(function (scope) + local rt = start_runtime(scope, { + service_id = 'update', + store = store, + persistence_retry_backoff = 0.005, + }) + local handle, admit_err = rt:admit_transition({ + kind = 'create_job', + generation = 1, + payload = { job_id = 'j-pending', component = 'cm5', artifact_ref = 'artifact-1' }, + }) + assert_not_nil(handle, admit_err) + + assert_true(wait_until(function () + local pending = rt:persistence_snapshot() + return pending and pending.retry_attempts and pending.retry_attempts >= 1 + end), 'expected retryable persistence failure to become visible') + + local pending = rt:persistence_snapshot() + local snapshot = rt:model_snapshot() + local transition = snapshot.transitions.by_id[handle:transition_id()] + assert_eq(rt:persistence_pending(), true) + assert_eq(pending.store_op, 'save_job_op') + assert_eq(pending.store_err, 'job_store_save_failed:control_store_put_timeout') + assert_eq(pending.job_id, 'j-pending') + assert_eq(pending.retry_mode, 'indefinite') + assert_eq(pending.dependency, 'job_store') + assert_not_nil(pending.first_failed_at) + assert_not_nil(pending.last_attempt_at) + assert_true(pending.retry_attempts >= 1) + assert_eq(transition.state, 'persisting') + assert_eq(transition.persistence_pending, true) + assert_eq(transition.retry_mode, 'indefinite') + assert_eq(transition.dependency, 'job_store') + + local second, second_err = rt:admit_transition({ + kind = 'create_job', + generation = 1, + payload = { job_id = 'j-blocked', component = 'cm5', artifact_ref = 'artifact-2' }, + }) + assert_eq(second, nil) + assert_eq(second_err, 'job_persistence_pending') + + allow_save = true + local persisted = fibers.perform(handle:outcome_op()) + local after = rt:model_snapshot() + rt:cancel('test complete') + return { persisted = persisted, after = after } + end) + + assert_eq(st, 'ok') + assert_eq(result.persisted.status, 'persisted') + assert_eq(result.persisted.job_id, 'j-pending') + assert_eq(result.after.persistence, nil) + assert_not_nil(saved, 'expected job to be saved after retry recovery') + assert_true(attempts >= 2, 'expected at least one retry before recovery') + end) +end + +function tests.test_route_missing_store_failure_is_not_persistence_retry() + fibers.run(function () + local store = { + save_job_op = function () + return op.always(nil, 'no_route') + end, + } + + local st, _, result = fibers.run_scope(function (scope) + local rt = start_runtime(scope, { + service_id = 'update', + store = store, + persistence_retry_backoff = 0.005, + }) + local failed = perform_transition(rt, { + kind = 'create_job', + generation = 1, + payload = { job_id = 'j-route', component = 'cm5', artifact_ref = 'artifact-route' }, + }) + local transitions = rt:transition_snapshot() + local persistence = rt:persistence_snapshot() + rt:cancel('test complete') + return { failed = failed, transitions = transitions, persistence = persistence } + end) + + assert_eq(st, 'ok') + assert_eq(result.failed.status, 'failed') + assert_eq(result.failed.reason, 'job_store_save_failed:no_route') + assert_eq(result.persistence, nil) + local rec = result.transitions.by_id[result.failed.transition_id] + assert_eq(rec.state, 'failed') + assert_eq(rec.store_err, 'job_store_save_failed:no_route') + assert_eq(rec.persistence_pending, nil) + end) +end + +function tests.test_discard_job_allows_only_unowned_created_jobs_by_default() + fibers.run(function () + local st, _, result = fibers.run_scope(function (scope) + local rt = start_runtime(scope, { service_id = 'update', store = store_mod.new() }) + local created = perform_transition(rt, { + kind = 'create_job', + generation = 1, + payload = { job_id = 'j-created', component = 'cm5', artifact_ref = 'artifact-created' }, + }) + local discarded = perform_transition(rt, { + kind = 'discard_job', + generation = 1, + job_id = 'j-created', + reason = 'upload_start_failed:slot_busy', + }) + + perform_transition(rt, { + kind = 'create_job', + generation = 1, + payload = { job_id = 'j-active', component = 'cm5', artifact_ref = 'artifact-active' }, + }) + perform_transition(rt, { + kind = 'start_job', + generation = 1, + job_id = 'j-active', + }) + local active_rejected = perform_transition(rt, { + kind = 'discard_job', + generation = 1, + job_id = 'j-active', + reason = 'unsafe_active_discard', + }) + + perform_transition(rt, { + kind = 'create_job', + generation = 1, + payload = { job_id = 'j-staging', component = 'cm5', artifact_ref = 'artifact-staging' }, + }) + perform_transition(rt, { + kind = 'patch_job', + generation = 1, + job_id = 'j-staging', + patch = { state = 'staging', next_step = nil }, + }) + local staging_rejected = perform_transition(rt, { + kind = 'discard_job', + generation = 1, + job_id = 'j-staging', + }) + + perform_transition(rt, { + kind = 'create_job', + generation = 1, + payload = { job_id = 'j-return', component = 'cm5', artifact_ref = 'artifact-return' }, + }) + perform_transition(rt, { + kind = 'patch_job', + generation = 1, + job_id = 'j-return', + patch = { state = 'awaiting_return', next_step = 'reconcile' }, + }) + local return_rejected = perform_transition(rt, { + kind = 'discard_job', + generation = 1, + job_id = 'j-return', + }) + local forced = perform_transition(rt, { + kind = 'discard_job', + generation = 1, + job_id = 'j-return', + force = true, + reason = 'admin_force_discard', + }) + + local snapshot = rt:snapshot() + rt:cancel('test complete') + return { + created = created, + discarded = discarded, + active_rejected = active_rejected, + staging_rejected = staging_rejected, + return_rejected = return_rejected, + forced = forced, + snapshot = snapshot, + } + end) + + assert_eq(st, 'ok') + assert_eq(result.created.status, 'persisted') + assert_eq(result.discarded.status, 'persisted') + assert_eq(result.snapshot.by_id['j-created'], nil) + assert_eq(result.active_rejected.status, 'rejected') + assert_eq(result.active_rejected.reason, 'job_not_discardable') + assert_eq(result.staging_rejected.status, 'rejected') + assert_eq(result.staging_rejected.reason, 'job_not_discardable') + assert_eq(result.return_rejected.status, 'rejected') + assert_eq(result.return_rejected.reason, 'job_not_discardable') + assert_eq(result.forced.status, 'persisted') + assert_not_nil(result.snapshot.by_id['j-active']) + assert_not_nil(result.snapshot.by_id['j-staging']) + assert_eq(result.snapshot.by_id['j-return'], nil) + end) +end + +function tests.test_healthy_inflight_transition_queueing_is_intentional() + fibers.run(function () + local save_entered = cond.new() + local release_save = cond.new() + local saved = {} + local store = { + save_job_op = function (_, job) + saved[#saved + 1] = job.job_id + if job.job_id == 'j-first' then + save_entered:signal() + return release_save:wait_op():wrap(function () return true, nil end) + end + return op.always(true, nil) + end, + } + + local st, _, result = fibers.run_scope(function (scope) + local rt = start_runtime(scope, { service_id = 'update', store = store }) + local first = assert(rt:admit_transition({ + kind = 'create_job', + generation = 1, + payload = { job_id = 'j-first', component = 'cm5', artifact_ref = 'artifact-1' }, + })) + fibers.perform(save_entered:wait_op()) + local second, second_err = rt:admit_transition({ + kind = 'create_job', + generation = 1, + payload = { job_id = 'j-second', component = 'cm5', artifact_ref = 'artifact-2' }, + }) + assert_not_nil(second, second_err) + assert_eq(rt:persistence_pending(), false) + release_save:signal() + local first_result = fibers.perform(first:outcome_op()) + local second_result = fibers.perform(second:outcome_op()) + rt:cancel('test complete') + return { first = first_result, second = second_result } + end) + + assert_eq(st, 'ok') + assert_eq(result.first.status, 'persisted') + assert_eq(result.second.status, 'persisted') + assert_eq(saved[1], 'j-first') + assert_eq(saved[2], 'j-second') + end) +end + function tests.test_submit_transition_rejects_before_ready_without_recording_admission() fibers.run(function () local st, _, result = fibers.run_scope(function (scope) diff --git a/tests/unit/update/test_job_store_control_store.lua b/tests/unit/update/test_job_store_control_store.lua index 3f1172eeb..430cab066 100644 --- a/tests/unit/update/test_job_store_control_store.lua +++ b/tests/unit/update/test_job_store_control_store.lua @@ -118,4 +118,86 @@ function T.accepts_hal_void_success_replies_for_put_and_delete() end) end +function T.load_all_ignores_stale_index_entries() + runfibers.run(function() + local data = { + ['update-job-present'] = require('cjson.safe').encode({ + job_id = 'present', + component = 'mcu', + state = 'created', + }), + } + local conn = { + call_op = function(_, topic, payload) + local method = topic[5] + if method == 'list' then + return require('fibers.op').always({ + ok = true, + reason = { 'update-job-missing', 'update-job-present' }, + }, nil) + elseif method == 'get' then + if data[payload.key] == nil then + return require('fibers.op').always({ ok = false, reason = 'not found' }, nil) + end + return require('fibers.op').always({ ok = true, reason = data[payload.key] }, nil) + end + return require('fibers.op').always({ ok = false, reason = 'bad method' }, nil) + end, + } + + local store = store_mod.new(conn) + local snapshot, load_err = fibers.perform(store:load_all_op()) + assert(snapshot ~= nil, tostring(load_err)) + assert(snapshot.jobs.present.component == 'mcu') + assert(snapshot.jobs.missing == nil) + assert(#snapshot.order == 1) + assert(snapshot.order[1] == 'present') + end) +end + +function T.maps_plain_timeouts_to_method_specific_reasons() + runfibers.run(function() + local op_mod = require('fibers.op') + local method + local conn = { + call_op = function(_, topic) + method = topic[5] + return op_mod.always(nil, 'timeout') + end, + } + local store = store_mod.new(conn) + + local ok_save, save_err = fibers.perform(store:save_job_op({ + job_id = 'job-timeout', + component = 'mcu', + state = 'created', + })) + assert(ok_save == nil) + assert(save_err == 'control_store_put_timeout') + assert(method == 'put') + + local snapshot, list_err = fibers.perform(store:load_all_op()) + assert(snapshot == nil) + assert(list_err == 'control_store_list_timeout') + + local ok_delete, delete_err = fibers.perform(store:delete_job_op('job-timeout')) + assert(ok_delete == nil) + assert(delete_err == 'control_store_delete_timeout') + + local conn2 = { + call_op = function(_, topic) + local m = topic[5] + if m == 'list' then + return op_mod.always({ ok = true, reason = { 'update-job-missing' } }, nil) + end + return op_mod.always(nil, 'timeout') + end, + } + local store2 = store_mod.new(conn2) + local snapshot2, get_err = fibers.perform(store2:load_all_op()) + assert(snapshot2 == nil) + assert(get_err == 'control_store_get_timeout') + end) +end + return T diff --git a/tests/unit/update/test_manager_requests.lua b/tests/unit/update/test_manager_requests.lua index 02e147996..bac4e9ef2 100644 --- a/tests/unit/update/test_manager_requests.lua +++ b/tests/unit/update/test_manager_requests.lua @@ -201,6 +201,46 @@ function tests.test_start_job_caller_cancellation_after_transition_admission_doe end) end +function tests.test_discard_job_uses_explicit_reason() + fibers.run(function () + local captured + local req = request({ method = 'discard_job', job_id = 'j1' }) + local jobs = { + admit_transition = function (_, cmd) + captured = cmd + return { + outcome_op = function () + return op.always({ + status = 'persisted', + job = { job_id = 'j1', component = 'mcu', state = 'discarded' }, + }, nil) + end, + }, nil + end, + } + + local st, _, result = fibers.run_scope(function (scope) + return manager_requests.discard_job(scope, { + request = req, + jobs = jobs, + job_id = 'j1', + generation = 7, + reason = 'upload_start_failed:slot_busy', + }) + end) + + assert_eq(st, 'ok') + assert_eq(result.status, 'persisted') + assert_eq(captured.kind, 'discard_job') + assert_eq(captured.generation, 7) + assert_eq(captured.job_id, 'j1') + assert_eq(captured.reason, 'upload_start_failed:slot_busy') + local ok, value = fibers.perform(req:wait_op()) + assert_true(ok) + assert_eq(value.discarded, true) + end) +end + function tests.test_create_job_requires_artifact_ref() fibers.run(function () diff --git a/tests/unit/update/test_observe_reconcile.lua b/tests/unit/update/test_observe_reconcile.lua index 2866d9eb1..c5c5ae4de 100644 --- a/tests/unit/update/test_observe_reconcile.lua +++ b/tests/unit/update/test_observe_reconcile.lua @@ -3,11 +3,37 @@ local sleep = require 'fibers.sleep' local op = require 'fibers.op' local observe = require 'services.update.observe' local active_job = require 'services.update.active_job' +local active_policy = require 'services.update.active_policy' +local component_backend = require 'services.update.backends.component' local tests = {} local function fail(msg) error(msg or 'assertion failed', 2) end local function assert_eq(a,b,msg) if a ~= b then fail(msg or ('expected '..tostring(b)..', got '..tostring(a))) end end local function assert_true(v,msg) if v ~= true then fail(msg or ('expected true, got '..tostring(v))) end end +local function assert_not_nil(v,msg) if v == nil then fail(msg or 'expected non-nil') end end + +local function assert_list_eq(got, want) + assert_eq(type(got), 'table', 'expected list table') + assert_eq(#got, #want, 'list length mismatch') + for i = 1, #want do + assert_eq(got[i], want[i], 'list item '..i) + end +end + +local function mcu_job() + return { + job_id = 'job-mcu', + component = 'mcu', + metadata = { expected_image_id = 'mcu-image-new' }, + commit_attempt = { + pre_commit = { + expected_image_id = 'mcu-image-new', + pre_commit_image_id = 'mcu-image-old', + pre_commit_boot_id = 'mcu-boot-old', + }, + }, + } +end function tests.test_observer_changed_op_wakes_with_snapshot_copy() fibers.run(function () @@ -54,6 +80,157 @@ function tests.test_reconcile_worker_waits_on_component_observer() end) end +function tests.test_component_reconcile_completes_after_post_reboot_mcu_state_reimport() + fibers.run(function (scope) + local obs = observe.new({ components = { mcu = { component = 'mcu' } } }) + local backend = component_backend.new({ component = 'mcu', observer = obs }) + local job = mcu_job() + + obs:update_component('mcu', { + software = { image_id = 'mcu-image-old', boot_id = 'mcu-boot-old' }, + updater = { state = 'rebooting' }, + health = { state = 'ok' }, + }) + + fibers.spawn(function () + fibers.perform(sleep.sleep_op(0.02)) + obs:update_component('mcu', { + software = { image_id = 'mcu-image-new', version = '15.2', boot_id = 'mcu-boot-new' }, + updater = { state = 'running' }, + health = { state = 'ok' }, + }) + end) + + local result = active_job.reconcile(scope, { + backend = backend, + job = job, + observer = obs, + deadline = fibers.now() + 1, + }) + assert_eq(result.tag, 'reconciled_success') + assert_eq(result.state.software.image_id, 'mcu-image-new') + assert_eq(result.state.software.boot_id, 'mcu-boot-new') + end) +end + +function tests.test_component_reconcile_waits_for_mcu_critical_state_even_when_software_matches() + fibers.run(function () + local obs = observe.new({ components = { mcu = { component = 'mcu' } } }) + local backend = component_backend.new({ component = 'mcu', observer = obs }) + local job = mcu_job() + local snapshot = { + by_id = { + mcu = { + state = { + software = { image_id = 'mcu-image-new', boot_id = 'mcu-boot-new' }, + runtime = { memory = { free = 1234 } }, + }, + }, + }, + } + + local result = backend:evaluate_reconcile(job, snapshot, {}) + assert_eq(result.done, false) + assert_eq(result.reason, 'waiting_for_mcu_critical_state') + assert_list_eq(result.missing_facts, { 'updater', 'health' }) + end) +end + +function tests.test_component_reconcile_timeout_reports_missing_mcu_critical_state() + fibers.run(function (scope) + local obs = observe.new({ components = { mcu = { component = 'mcu' } } }) + local backend = component_backend.new({ component = 'mcu', observer = obs }) + local job = mcu_job() + + obs:update_component('mcu', { + runtime = { memory = { free = 1234 } }, + }) + + local result = active_job.reconcile(scope, { + backend = backend, + job = job, + observer = obs, + deadline = fibers.now() + 0.02, + poll_s = 0.005, + }) + + assert_eq(result.tag, 'reconcile_timeout') + assert_eq(result.reason, 'mcu_critical_state_timeout') + assert_eq(result.last_reason, 'waiting_for_mcu_critical_state') + assert_list_eq(result.missing_facts, { 'software', 'updater', 'health' }) + assert_not_nil(result.state) + assert_not_nil(result.state.runtime) + end) +end + +function tests.test_component_reconcile_preserves_explicit_updater_failure() + fibers.run(function () + local backend = component_backend.new({ component = 'mcu' }) + local result = backend:evaluate_reconcile(mcu_job(), { + by_id = { + mcu = { + state = { + software = { image_id = 'mcu-image-new', boot_id = 'mcu-boot-new' }, + updater = { state = 'failed' }, + }, + }, + }, + }, {}) + + assert_eq(result.done, true) + assert_eq(result.ok, false) + assert_eq(result.reason, 'failed') + end) +end + +function tests.test_component_reconcile_prefers_updater_last_error_on_failed_mcu() + fibers.run(function () + local backend = component_backend.new({ component = 'mcu' }) + local result = backend:evaluate_reconcile(mcu_job(), { + by_id = { + mcu = { + state = { + software = { image_id = 'mcu-image-new', boot_id = 'mcu-boot-new' }, + updater = { + state = 'failed', + last_error = 'abupdate_buy_failed', + boot_buy_rc = -42, + }, + health = { state = 'ok' }, + }, + }, + }, + }, {}) + + assert_eq(result.done, true) + assert_eq(result.ok, false) + assert_eq(result.reason, 'abupdate_buy_failed') + assert_eq(result.state.updater.boot_buy_rc, -42) + end) +end + +function tests.test_active_policy_persists_meaningful_reconcile_timeout_reason() + local job = { + job_id = 'job-timeout', + component = 'mcu', + state = 'awaiting_return', + history = {}, + } + local ok, err = active_policy.apply_completion(job, { + kind = 'active_job_done', + status = 'ok', + result = { + tag = 'reconcile_timeout', + reason = 'mcu_critical_state_timeout', + missing_facts = { 'software', 'updater', 'health' }, + }, + }, 42) + assert_true(ok, err) + assert_eq(job.state, 'timed_out') + assert_eq(job.error, 'mcu_critical_state_timeout') + assert_list_eq(job.result.missing_facts, { 'software', 'updater', 'health' }) +end + function tests.test_stage_worker_runs_single_stage_op() fibers.run(function (scope) @@ -163,6 +340,106 @@ function tests.test_commit_worker_persists_attempt_before_backend_commit_and_pas end) end +function tests.test_commit_worker_rejected_backend_commit_does_not_persist_awaiting_return() + fibers.run(function (scope) + local order = {} + local jobs = {} + function jobs:admit_transition(cmd) + order[#order + 1] = cmd.kind + if cmd.kind == 'commit_accepted' then + error('commit_accepted must not persist after backend rejection', 0) + end + return { + outcome_op = function () + return op.always({ status = 'persisted' }, nil) + end, + }, nil + end + local backend = {} + function backend:commit_capabilities() + return { policy = 'idempotent_by_token' } + end + function backend:commit_op() + order[#order + 1] = 'backend_commit' + return op.always(nil, 'component_commit_rejected') + end + + local ok, err = pcall(function () + active_job.commit(scope, { + backend = backend, + jobs = jobs, + lease = { token = 'active-token', generation = 1 }, + job = { job_id = 'j1', component = 'cm5', state = 'committing', active_token = 'active-token' }, + }) + end) + + assert_eq(ok, false) + assert_true(tostring(err):find('component_commit_rejected', 1, true) ~= nil) + assert_eq(table.concat(order, ','), 'begin_commit_attempt,backend_commit') + end) +end + +function tests.test_commit_worker_persists_ambiguous_backend_commit_acceptance() + fibers.run(function (scope) + local accepted_payload + local jobs = {} + function jobs:admit_transition(cmd) + if cmd.kind == 'begin_commit_attempt' then + return { + outcome_op = function () + return op.always({ + status = 'persisted', + commit_token = cmd.commit_token, + commit_policy = cmd.commit_policy, + }, nil) + end + }, nil + end + if cmd.kind == 'commit_accepted' then + accepted_payload = cmd.accepted + return { + outcome_op = function () + return op.always({ + status = 'persisted', + job = { + job_id = cmd.job_id, + component = 'mcu', + state = 'awaiting_return', + }, + }, nil) + end + }, nil + end + error('unexpected transition '..tostring(cmd.kind), 0) + end + local backend = {} + function backend:commit_capabilities() + return { policy = 'idempotent_by_token' } + end + function backend:commit_op(_, ctx) + return op.always({ + accepted = true, + ambiguous = true, + reason = 'timeout', + token = ctx.commit_token, + }, nil) + end + + local result = active_job.commit(scope, { + backend = backend, + jobs = jobs, + lease = { token = 'active-token', generation = 1 }, + job = { job_id = 'j1', component = 'mcu', state = 'committing', active_token = 'active-token' }, + }) + + assert_eq(result.tag, 'commit_started') + assert_true(result.accepted) + assert_true(result.commit.ambiguous) + assert_eq(accepted_payload.ambiguous, true) + assert_eq(accepted_payload.reason, 'timeout') + end) +end + function tests.test_commit_worker_rejects_backend_without_commit_policy() fibers.run(function (scope) local backend = { commit_op = function () return op.always({ accepted = true }, nil) end } diff --git a/tests/unit/update/test_service.lua b/tests/unit/update/test_service.lua index 3b33fa159..f6dc39e76 100644 --- a/tests/unit/update/test_service.lua +++ b/tests/unit/update/test_service.lua @@ -2,17 +2,22 @@ local fibers = require 'fibers' local sleep = require 'fibers.sleep' +local op = require 'fibers.op' local busmod = require 'bus' +local tablex = require 'shared.table' local update = require 'services.update' local service = require 'services.update.service' local topics = require 'services.update.topics' +local service_base = require 'devicecode.service_base' local probe = require 'tests.support.bus_probe' local tests = {} local function fail(msg) error(msg or 'assertion failed', 2) end -local function assert_eq(a, b, msg) if a ~= b then fail(msg or ('expected ' .. tostring(b) .. ', got ' .. tostring(a))) end end +local function assert_eq(a, b, msg) + if a ~= b then fail(msg or ('expected ' .. tostring(b) .. ', got ' .. tostring(a))) end +end local function assert_true(v, msg) if v ~= true then fail(msg or ('expected true, got ' .. tostring(v))) end end local function assert_not_nil(v, msg) if v == nil then fail(msg or 'expected non-nil value') end end @@ -202,11 +207,245 @@ function tests.test_publisher_failure_is_supervised_component_failure() end) end +function tests.test_initial_runtime_reconcile_failure_publishes_last_failure() + fibers.run(function () + local bus = busmod.new() + local svc_conn = bus:connect() + local lifecycle = service_base.new(svc_conn, { name = 'update' }) + local fake_scope = { + finally = function () end, + status = function () return 'running' end, + admission = function () return 'open' end, + child = function () return nil, 'runtime_scope_child_failed' end, + } + + local ok, err = pcall(function () + service.run(fake_scope, { + publish = false, + service_id = 'update', + svc = lifecycle, + conn = svc_conn, + bind_manager = false, + job_store_kind = 'memory', + watch_config = false, + }) + end) + assert_eq(ok, false) + assert_true(tostring(err):find('runtime_scope_child_failed', 1, true) ~= nil) + + local status = probe.wait_retained_payload(svc_conn, topics.lifecycle_status(), { timeout = 0.2 }) + local failure = status and status.last_failure + assert_not_nil(failure, 'expected service status to publish last_failure') + assert_eq(failure.source, 'runtime_reconcile') + assert_eq(failure.reason, 'runtime_scope_child_failed') + assert_eq(failure.event_kind, 'initial_runtime_reconcile') + assert_eq(failure.event_status, 'failed') + assert_eq(failure.event_primary, 'runtime_scope_child_failed') + end) +end + +function tests.test_active_runtime_failure_publishes_last_failure_with_current_job() + fibers.run(function () + local bus = busmod.new() + local svc_conn = bus:connect() + local lifecycle = service_base.new(svc_conn, { name = 'update' }) + local saves = 0 + local store = { + load_all_op = function () + return op.always({ + jobs = { + ['job-active-fail'] = { + job_id = 'job-active-fail', + component = 'cm5', + state = 'staging', + generation = 1, + active_intent = { token = 'tok-stage', phase = 'stage', generation = 1 }, + created_seq = 1, + updated_seq = 1, + history = {}, + }, + }, + order = { 'job-active-fail' }, + next_seq = 2, + }, nil) + end, + save_job_op = function () + saves = saves + 1 + if saves == 1 then return op.always(true, nil) end + return op.always(nil, 'save_failed') + end, + } + local backend = { + stage_op = function (_, job) + return op.always({ job_id = job.job_id }, nil) + end, + } + + local st, _, primary = fibers.run_scope(function (scope) + return service.run(scope, { + publish = false, + service_id = 'update', + svc = lifecycle, + conn = svc_conn, + job_store = store, + watch_config = false, + backend = backend, + }) + end) + assert_eq(st, 'failed') + assert_true(tostring(primary):find('job_store_save_failed:save_failed', 1, true) ~= nil) + + local status = probe.wait_retained_payload(svc_conn, topics.lifecycle_status(), { timeout = 0.2 }) + local failure = status and status.last_failure + assert_not_nil(failure, 'expected service status to publish active runtime last_failure') + assert_eq(failure.source, 'active_runtime') + assert_eq(failure.reason, 'job_store_save_failed:save_failed') + assert_eq(failure.event_kind, 'component_done') + assert_eq(failure.event_status, 'failed') + assert_eq(failure.component, 'active_runtime') + assert_not_nil(failure.current_job, 'expected current job context') + assert_eq(failure.current_job.job_id, 'job-active-fail') + assert_eq(failure.current_job.state, 'staging') + end) +end + +function tests.test_retryable_active_persistence_failure_degrades_without_crashing() + fibers.run(function (root_scope) + local bus = busmod.new() + local svc_conn = bus:connect() + local caller = bus:connect() + local stored = { jobs = {}, order = {}, next_seq = 1 } + local allow_awaiting_commit = false + local save_attempts = 0 + local function upsert(job) + stored.jobs[job.job_id] = tablex.deep_copy(job) + local found = false + for _, id in ipairs(stored.order) do + if id == job.job_id then found = true; break end + end + if not found then stored.order[#stored.order + 1] = job.job_id end + return true, nil + end + local store = { + load_all_op = function () + return op.always(tablex.deep_copy(stored), nil) + end, + save_job_op = function (_, job) + save_attempts = save_attempts + 1 + if job.state == 'awaiting_commit' and not allow_awaiting_commit then + return op.always(nil, 'control_store_put_timeout') + end + upsert(job) + return op.always(true, nil) + end, + delete_job_op = function (_, job_id) + stored.jobs[job_id] = nil + return op.always(true, nil) + end, + } + local backend = { + stage_op = function (_, job) + return op.always({ job_id = job.job_id, staged = true }, nil) + end, + } + + local child = assert(root_scope:child()) + local ok, err = child:spawn(function (scope) + service.run(scope, { + publish = false, + service_id = 'update', + conn = svc_conn, + watch_config = false, + config = { schema = 'devicecode.update/1', components = { { component = 'cm5' } } }, + job_store = store, + backend = backend, + persistence_retry_backoff = 0.01, + }) + end) + assert_true(ok, err) + fibers.perform(sleep.sleep_op(0.02)) + + local created, create_err = caller:call(topics.update_manager_rpc('create-job'), { + job_id = 'j-persist', + component = 'cm5', + artifact_ref = 'artifact-persist', + }, { timeout = 0.5 }) + assert_not_nil(created, create_err) + + local started, start_err = caller:call(topics.update_manager_rpc('start-job'), { + job_id = 'j-persist', + }, { timeout = 0.5 }) + assert_not_nil(started, start_err) + assert_eq(started.accepted, true) + + local status + assert_true(probe.wait_until(function () + status = caller:call(topics.update_manager_rpc('status'), {}, { timeout = 0.05 }) + local snap = status and status.snapshot + return snap + and snap.reason == 'update_persistence_pending' + and snap.last_warning + and snap.last_warning.store_err == 'job_store_save_failed:control_store_put_timeout' + end, { timeout = 0.5, interval = 0.01 }), 'expected retryable persistence pending status') + + assert_eq(status.snapshot.ready, false) + assert_eq(status.snapshot.last_warning.source, 'job_runtime') + assert_eq(status.snapshot.last_warning.reason, 'job_store_save_failed:control_store_put_timeout') + assert_eq(status.snapshot.last_warning.retry_mode, 'indefinite') + assert_eq(status.snapshot.last_warning.dependency, 'job_store') + assert_not_nil(status.snapshot.last_warning.first_failed_at) + assert_not_nil(status.snapshot.last_warning.last_attempt_at) + assert_eq(status.snapshot.job_runtime.persistence.job_id, 'j-persist') + assert_eq(status.snapshot.job_runtime.persistence.retry_mode, 'indefinite') + assert_eq(status.snapshot.job_runtime.persistence.dependency, 'job_store') + assert_true(status.snapshot.job_runtime.persistence.retry_attempts >= 1) + + local listed, list_err = caller:call(topics.update_manager_rpc('list-jobs'), {}, { timeout = 0.2 }) + assert_not_nil(listed, list_err) + assert_eq(listed.ok, true) + local got, get_err = caller:call(topics.update_manager_rpc('get-job'), { + job_id = 'j-persist', + }, { timeout = 0.2 }) + assert_not_nil(got, get_err) + assert_eq(got.job.job_id, 'j-persist') + + local blocked, block_err = caller:call(topics.update_manager_rpc('create-job'), { + job_id = 'j-blocked', + component = 'cm5', + artifact_ref = 'artifact-blocked', + }, { timeout = 0.2 }) + assert_eq(blocked, nil) + assert_eq(block_err, 'update_persistence_pending') + + allow_awaiting_commit = true + assert_true(probe.wait_until(function () + status = caller:call(topics.update_manager_rpc('status'), {}, { timeout = 0.05 }) + local job = status and status.snapshot and status.snapshot.jobs.by_id['j-persist'] + return job and job.state == 'awaiting_commit' and status.snapshot.reason == nil + end, { timeout = 0.8, interval = 0.01 }), 'expected persistence recovery to advance staged job') + + assert_eq(status.snapshot.ready, true) + assert_eq(status.snapshot.last_warning, nil) + assert_eq(status.snapshot.pending.persistence, nil) + assert_true(save_attempts >= 3, 'expected retry attempts around awaiting_commit persistence') + child:cancel('test complete') + end) +end + function tests.test_service_uses_shared_config_watch_helper() local source = assert(io.open('../src/services/update/service.lua', 'r')):read('*a') - assert_true(source:find("devicecode.support.config_watch", 1, true) ~= nil, 'update service should require shared config_watch helper') - assert_true(source:find('config_watch.open', 1, true) ~= nil, 'update service should open cfg/update through shared config_watch') - assert_true(source:find('watch_retained(conn, topics.config()', 1, true) == nil, 'update service should not own a bespoke retained config watcher') + assert_true( + source:find("devicecode.support.config_watch", 1, true) ~= nil, + 'update service should require shared config_watch helper' + ) + assert_true( + source:find('config_watch.open', 1, true) ~= nil, + 'update service should open cfg/update through shared config_watch' + ) + assert_true( + source:find('watch_retained(conn, topics.config()', 1, true) == nil, + 'update service should not own a bespoke retained config watcher' + ) end function tests.test_service_start_path_allows_injected_config_watch_for_harnesses() diff --git a/tests/unit/update/test_service_phase2.lua b/tests/unit/update/test_service_phase2.lua index a792ad95b..d4c47a595 100644 --- a/tests/unit/update/test_service_phase2.lua +++ b/tests/unit/update/test_service_phase2.lua @@ -154,6 +154,71 @@ function tests.test_restart_adoption_starts_reconcile_for_awaiting_return() end) end +function tests.test_cancelled_reconcile_active_cleanup_unblocks_new_start() + fibers.run(function (root_scope) + local reconcile_started = false + local initial_jobs = { + jobs = { + j1 = { + job_id='j1', + component='cm5', + state='awaiting_return', + created_seq=1, + updated_seq=1, + }, + }, + } + local active_backend = {} + function active_backend:stage_op(job) return op.always({ job_id=job.job_id }, nil) end + function active_backend:evaluate_reconcile() + reconcile_started = true + return { done=false, reason='waiting_for_test' } + end + local child, caller = start_service(root_scope, { + config={ schema='devicecode.update/1', components={ { component='cm5' } } }, + initial_jobs = initial_jobs, + backend = active_backend, + }) + + assert_true(probe.wait_until(function() + local status = caller:call(topics.update_manager_rpc('status'), {}, { timeout=0.05 }) + local active = status and status.snapshot and status.snapshot.update_active + return reconcile_started and active and active.job_id == 'j1' and active.phase == 'reconcile' + end, { timeout=0.8, interval=0.01 }), 'expected reconcile active slot') + + local cancelled = assert(caller:call(topics.update_manager_rpc('cancel-job'), { + job_id='j1', + reason='test_cancel', + }, { timeout=0.5 })) + assert_eq(cancelled.ok, true) + + assert_true(probe.wait_until(function() + local status = caller:call(topics.update_manager_rpc('status'), {}, { timeout=0.05 }) + local job = status and status.snapshot and status.snapshot.jobs.by_id.j1 + local active = status and status.snapshot and status.snapshot.update_active + return job and job.state == 'cancelled' and active == nil + end, { timeout=0.8, interval=0.01 }), 'expected stale reconcile active cleanup') + + assert(caller:call(topics.update_manager_rpc('create-job'), { + job_id='j2', + component='cm5', + artifact_ref='artifact-j2', + }, { timeout=0.5 })) + local started, start_err = caller:call( + topics.update_manager_rpc('start-job'), + { job_id='j2' }, + { timeout=0.5 } + ) + assert_not_nil(started, start_err) + assert_eq(started.accepted, true) + assert_true(probe.wait_until(function() + local status = caller:call(topics.update_manager_rpc('status'), {}, { timeout=0.05 }) + local job = status and status.snapshot and status.snapshot.jobs.by_id.j2 + return job and job.state == 'awaiting_commit' + end, { timeout=0.8, interval=0.01 }), 'expected new start after cleanup') + child:cancel('test complete') + end) +end function tests.test_slow_job_runtime_load_keeps_public_service_responsive() fibers.run(function (root_scope)