From d9fd55c3139915a7715dc2ed4fb9eff9d3f42405 Mon Sep 17 00:00:00 2001 From: Rich Thanki Date: Wed, 17 Jun 2026 14:13:41 +0000 Subject: [PATCH 1/6] makes switch driver architecture compliant --- src/services/hal/managers/wired.lua | 85 +++++++++++++++++++++++------ src/services/wired/service.lua | 1 - 2 files changed, 67 insertions(+), 19 deletions(-) diff --git a/src/services/hal/managers/wired.lua b/src/services/hal/managers/wired.lua index b6074430..6b419518 100644 --- a/src/services/hal/managers/wired.lua +++ b/src/services/hal/managers/wired.lua @@ -99,11 +99,9 @@ local function provider_poll_interval_s(config) end -local function driver_result(provider_id, method, opts) - local driver = state.drivers[provider_id] - if not driver then return { ok = false, err = 'wired provider not configured', code = 'not_configured' } end +local function perform_driver_method(driver, method, opts) local opname = tostring(method) .. '_op' - local fn = driver[opname] + local fn = driver and driver[opname] if type(fn) ~= 'function' then return { ok = false, err = 'wired driver missing ' .. 
opname } end local ok, driver_op = pcall(function () return fn(driver, opts or {}) end) if not ok then return { ok = false, err = tostring(driver_op) } end @@ -114,12 +112,22 @@ local function driver_result(provider_id, method, opts) return { ok = result == true, result = result } end +local function driver_result(provider_id, method, opts) + local driver = state.drivers[provider_id] + if not driver then return { ok = false, err = 'wired provider not configured', code = 'not_configured' } end + return perform_driver_method(driver, method, opts) +end + +local function poller_is_current(provider_id, driver) + local rec = state.pollers and state.pollers[provider_id] or nil + return rec ~= nil and rec.driver == driver and state.drivers[provider_id] == driver +end + local function poll_loop(provider_id, driver, interval_s) - fibers.perform(sleep.sleep_op(interval_s)) - while state.drivers[provider_id] == driver do + while poller_is_current(provider_id, driver) do local started = runtime.now() - local result = driver_result(provider_id, 'snapshot', {}) - if state.drivers[provider_id] ~= driver then return end + local result = perform_driver_method(driver, 'snapshot', {}) + if not poller_is_current(provider_id, driver) then return end if result and result.ok == true then local ok, err = emit_snapshot_now(provider_id, result) if ok ~= true then log('error', { what = 'wired_provider_poll_emit_failed', provider = provider_id, err = err }) end @@ -138,9 +146,44 @@ local function poll_loop(provider_id, driver, interval_s) end end +local function cancel_pollers(reason) + local pollers = state.pollers or {} + state.pollers = {} + for _, rec in pairs(pollers) do + if rec and rec.scope then rec.scope:cancel(reason or 'wired provider poller cancelled') end + end +end + +local function cancel_provider_poller(provider_id, reason) + local rec = state.pollers and state.pollers[provider_id] or nil + if not rec then return end + state.pollers[provider_id] = nil + if rec.scope then 
rec.scope:cancel(reason or 'wired provider poller cancelled') end +end + local function spawn_poll_loop(provider_id, driver, interval_s) - state.pollers[provider_id] = true - state.scope:spawn(function () poll_loop(provider_id, driver, interval_s) end) + if not state.scope then return nil, 'wired manager scope not started' end + cancel_provider_poller(provider_id, 'wired provider poller replaced') + local poll_scope, scope_err = state.scope:child() + if not poll_scope then return nil, scope_err or 'wired provider poller scope create failed' end + + local rec = { + scope = poll_scope, + driver = driver, + interval_s = interval_s, + } + state.pollers[provider_id] = rec + poll_scope:finally(function () + if state.pollers and state.pollers[provider_id] == rec then state.pollers[provider_id] = nil end + end) + + local ok, err = poll_scope:spawn(function () poll_loop(provider_id, driver, interval_s) end) + if not ok then + if state.pollers[provider_id] == rec then state.pollers[provider_id] = nil end + poll_scope:cancel(tostring(err or 'wired provider poller spawn failed')) + return nil, err or 'wired provider poller spawn failed' + end + return true, nil end local function handle_request(provider_id, req) @@ -293,6 +336,7 @@ function M.apply_config_op(config) local provider_ids, perr = normalise_provider_ids(config or {}) if not provider_ids then return false, perr end + cancel_pollers('reconfigured') stop_drivers('reconfigured') local ok, cerr = reconcile_device_caps(provider_ids) if ok ~= true then return false, cerr end @@ -313,15 +357,19 @@ function M.apply_config_op(config) local driver, err = driver_mod.new(driver_config, driver_opts) if not driver then return false, ('wired provider %s create failed: %s'):format(id, tostring(err)) end state.drivers[id] = driver - local result = driver_result(id, 'snapshot', {}) - if result.ok == true then - local eok, eerr = emit_snapshot_now(id, result) - if eok ~= true then return false, eerr or 'wired provider emit failed' end 
- else - local eok, eerr = emit_status_now(id, { state = 'unavailable', available = false, err = result.err }) - if eok ~= true then return false, eerr or 'wired provider status emit failed' end + local eok, eerr = emit_status_now(id, { + state = 'observing', + available = false, + driver = driver_config.provider, + polling = true, + }) + if eok ~= true then return false, eerr or 'wired provider status emit failed' end + local spawned, spawn_err = spawn_poll_loop(id, driver, poll_interval_s) + if spawned ~= true then + state.drivers[id] = nil + if driver and type(driver.terminate) == 'function' then driver:terminate('poller spawn failed') end + return false, ('wired provider %s poller failed: %s'):format(id, tostring(spawn_err)) end - spawn_poll_loop(id, driver, poll_interval_s) end end log('info', { what = 'wired_manager_configured', providers = provider_ids }) @@ -341,6 +389,7 @@ function M.shutdown_op(_timeout_s) end function M.terminate(reason) + cancel_pollers(reason or 'terminated') stop_drivers(reason or 'terminated') close_control_channels() state.provider_ids = {} diff --git a/src/services/wired/service.lua b/src/services/wired/service.lua index 5f1dd01f..0360bc15 100644 --- a/src/services/wired/service.lua +++ b/src/services/wired/service.lua @@ -399,7 +399,6 @@ local function apply_config(state, ev) snap.generation = (ev and ev.generation) or (snap.generation + 1) snap.config = { rev = intent.rev, schema = intent.schema, config_schema = intent.config_schema, version = intent.version } snap.config_intent = intent - snap.dependencies = {} snap.stats.config_updates = (snap.stats.config_updates or 0) + 1 return rebuild_derived(snap) end) From 2f7fc708d7801882472fad545a956239a24f8a1c Mon Sep 17 00:00:00 2001 From: Rich Thanki Date: Wed, 17 Jun 2026 23:24:28 +0000 Subject: [PATCH 2/6] polling groups, pollers are owned child workers --- docs/switch.md | 46 +- src/configs/bigbox-v1-cm-2.json | 25 +- .../wired/providers/rtl8380m_http.lua | 400 ++++++++++++++-- 
src/services/hal/managers/wired.lua | 230 ++++++++-- .../devhost/rtl8380m_switch_spec.lua | 432 +++++++++++++++++- 5 files changed, 1054 insertions(+), 79 deletions(-) diff --git a/docs/switch.md b/docs/switch.md index 41ef79fc..e96d240b 100644 --- a/docs/switch.md +++ b/docs/switch.md @@ -81,7 +81,11 @@ The only supported switch HTTP provider configuration spellings on this path are username = "$SWITCH_USERNAME", password = "$SWITCH_PASSWORD", timeout_s = 0.8, - poll_interval_s = 1.0, + poll = { + fast = { interval_s = 1.0, groups = { "panel", "poe", "counters" } }, + medium = { interval_s = 5.0, groups = { "vlan", "lldp" } }, + slow = { interval_s = 30.0, groups = { "identity", "runtime" } }, + }, http = { capability = "main", response_parser = "legacy-http1-close", @@ -185,7 +189,35 @@ lldp_local lldp_neighbor ``` -The driver captures these into normalised provider observations: +The provider also has narrower read paths for timing and grouped polling. Surface-bearing groups include `home_main` so that rows can be attached to the canonical switch surface names (`GE1` ... `GE10`): + +```text +panel/link path: home_main, panel_info +identity path: sys_sysinfo +port path: home_main, port_port +vlan path: home_main, vlan_create, vlan_conf, vlan_port, vlan_membership +poe path: home_main, poe_poe +lldp path: lldp_local, lldp_neighbor +runtime path: sys_cpumem +counters path: home_main, rmon_statistics +stats path: sys_cpumem, rmon_statistics +full path: all read-side commands +``` + +The panel/link path is the cheapest source of the switch front-panel state: which GE/SFP surfaces are present and whether they are connected. It deliberately avoids `sys_cpumem` and `rmon_statistics`, which can be slower on this switch. 
The devhost timing sweep can time each group separately against the fixed test switch: + +```sh +cd tests +SWITCH_TEST_FIXED_SWITCH_TIMING=1 \ +SWITCH_TEST_TIMING_TIMEOUT_S=2.5 \ +SWITCH_TEST_TIMING_ITERATIONS=3 \ +TEST_FILTER=rtl8380m_fixed_switch_admin_command_timing_sweep \ +luajit run.lua +``` + +Use `SWITCH_TEST_TIMING_GROUPS=panel,poe,runtime,counters` to restrict the sweep, and `SWITCH_TEST_TIMING_REQUIRE_ALL=1` when failures should fail the test rather than only being reported. + +The driver captures the full snapshot into normalised provider observations: ```text raw/host/wired/provider/switch-main/status @@ -198,7 +230,15 @@ raw/host/wired/provider/switch-main/state/topology If `include_raw = true` is set in a test, the snapshot also keeps the source command payloads for parser debugging. Full raw CGI bodies should not be promoted to public retained state by default. -The HAL wired manager owns scheduling. For the RTL8380M provider it takes an immediate snapshot when configured and then polls at `poll_interval_s`, which is `1.0` seconds in the Big Box configuration. Polls are non-overlapping: if a read is slow or fails, the next poll is not queued behind it. Poll failures update only the provider status and leave the last good identity/runtime/power/surfaces/topology retained facts in place. +The HAL wired manager owns scheduling. For the RTL8380M provider, manager apply admits the provider and starts owned poller work; switch observation is not part of configuration admission. The Big Box poll plan is grouped and sequential: + +```text +fast, 1 Hz: panel, poe, counters +medium, 5 s: vlan, lldp +slow, 30 s: identity, runtime +``` + +Each poll loop is non-overlapping. A slow runtime read therefore cannot queue behind, block, or mark the fast link-state path unavailable. Successful groups merge into the retained raw observation cache, so `state/surfaces` carries last-known link, PoE, counter and VLAN facts together. 
Group failures update provider status but leave the last good identity/runtime/power/surfaces/topology retained facts in place. Canonical observation names are deliberately strict. CPU and memory are published as `runtime.cpu` and `runtime.memory`; PoE device-level power and temperature are published as `power.poe`; port counters are published under each surface as `counters`. The switch path must not publish `telemetry.cpu`, `telemetry.mem`, `telemetry.poe`, or any compatibility topic for `state/telemetry`. diff --git a/src/configs/bigbox-v1-cm-2.json b/src/configs/bigbox-v1-cm-2.json index 87454588..23d7dfa5 100644 --- a/src/configs/bigbox-v1-cm-2.json +++ b/src/configs/bigbox-v1-cm-2.json @@ -62,7 +62,30 @@ "username": "$SWITCH_USERNAME", "password": "$SWITCH_PASSWORD", "timeout_s": 0.8, - "poll_interval_s": 1.0, + "poll": { + "fast": { + "interval_s": 1.0, + "groups": [ + "panel", + "poe", + "counters" + ] + }, + "medium": { + "interval_s": 5.0, + "groups": [ + "vlan", + "lldp" + ] + }, + "slow": { + "interval_s": 30.0, + "groups": [ + "identity", + "runtime" + ] + } + }, "http": { "response_parser": "legacy-http1-close", "capability": "main", diff --git a/src/services/hal/backends/wired/providers/rtl8380m_http.lua b/src/services/hal/backends/wired/providers/rtl8380m_http.lua index 07edd546..a499b738 100644 --- a/src/services/hal/backends/wired/providers/rtl8380m_http.lua +++ b/src/services/hal/backends/wired/providers/rtl8380m_http.lua @@ -39,6 +39,58 @@ local READ_COMMANDS = { 'rmon_statistics', } +local STATS_COMMANDS = { + 'sys_cpumem', + 'rmon_statistics', +} + +local PANEL_COMMANDS = { + 'home_main', + 'panel_info', +} + +local COMMAND_GROUPS = { + home_main = { 'home_main' }, + panel_info = { 'panel_info' }, + panel = PANEL_COMMANDS, + identity = { 'sys_sysinfo' }, + port = { 'home_main', 'port_port' }, + vlan_create = { 'home_main', 'vlan_create' }, + vlan_conf = { 'home_main', 'vlan_conf' }, + vlan_port = { 'home_main', 'vlan_port' }, + vlan_membership 
= { 'home_main', 'vlan_membership' }, + vlan = { 'home_main', 'vlan_create', 'vlan_conf', 'vlan_port', 'vlan_membership' }, + poe = { 'home_main', 'poe_poe' }, + lldp_local = { 'lldp_local' }, + lldp_neighbor = { 'lldp_neighbor' }, + lldp = { 'lldp_local', 'lldp_neighbor' }, + runtime = { 'sys_cpumem' }, + counters = { 'home_main', 'rmon_statistics' }, + stats = STATS_COMMANDS, + full = READ_COMMANDS, +} + +local COMMAND_GROUP_ORDER = { + 'home_main', + 'panel_info', + 'panel', + 'identity', + 'port', + 'vlan_create', + 'vlan_conf', + 'vlan_port', + 'vlan_membership', + 'vlan', + 'poe', + 'lldp_local', + 'lldp_neighbor', + 'lldp', + 'runtime', + 'counters', + 'stats', + 'full', +} + local VLAN_MODE = { [0] = 'hybrid', [1] = 'access', @@ -162,6 +214,18 @@ local function header_values(headers, name) return values end +local function reset_session(self) + self.jar = CookieJar.new({ cookie_language = 'defLang_en' }) + self.logged_in = false +end + +local function auth_invalid_body(body) + local s = tostring(body or ''):lower() + return s:find('login.html', 1, true) ~= nil + or s:find('home_login', 1, true) ~= nil + or s:find('loginstatus', 1, true) ~= nil +end + local function request(self, method, url, body, extra_headers) if not self.http_ref or type(self.http_ref.exchange_op) ~= 'function' then return nil, 'http capability ref not configured' @@ -317,19 +381,21 @@ local function response_status_ok(body) return tostring(body or ''):find('"status"%s*:%s*"ok"') ~= nil end -local function login(self) - if self.disable_login or self.logged_in then return true, nil end +local function login(self, opts) + opts = opts or {} + if self.disable_login then return true, nil end + if self.logged_in and not opts.force then return true, nil end if not self.username or not self.password then return false, 'switch username/password not configured' end - self.jar:set('cookie_language', 'defLang_en') + reset_session(self) local info_url = cgi_url(self.base_url, 'get', 'home_login') 
local info, err = request(self, 'GET', info_url) - if not info or info.status ~= 200 then return false, err or ('home_login HTTP ' .. tostring(info and info.status)) end + if not info or info.status ~= 200 then reset_session(self); return false, err or ('home_login HTTP ' .. tostring(info and info.status)) end local info_json, jerr = json_decode(info.body) - if not info_json then return false, 'home_login invalid JSON: ' .. tostring(jerr) end + if not info_json then reset_session(self); return false, 'home_login invalid JSON: ' .. tostring(jerr) end local modulus = info_json.data and info_json.data.modulus - if not modulus then return false, 'home_login response missing RSA modulus' end + if not modulus then reset_session(self); return false, 'home_login response missing RSA modulus' end local encrypted, enc_err = openssl_encrypt_b64(self, modulus, self.password) - if not encrypted then return false, enc_err end + if not encrypted then reset_session(self); return false, enc_err end local form_data = '_ds=1&' .. form_encode({ username = self.username, password = encrypted }) .. '&_de=1' local auth_url = cgi_url(self.base_url, 'set', 'home_loginAuth', os.time()) local origin = parse_origin(self.base_url) @@ -353,6 +419,7 @@ local function login(self) if status and response_status_ok(status.body) then self.logged_in = true; return true, nil end end end + reset_session(self) return false, 'RTL8380 RSA login was not confirmed' end @@ -360,12 +427,26 @@ local function get_cmd(self, cmd) local url = cgi_url(self.base_url, 'get', cmd) if cmd == 'rmon_statistics' then url = url .. 
'&time=0' end local r, err = request(self, 'GET', url) - if not r then return nil, err end - if r.status ~= 200 then return nil, ('%s HTTP %s'):format(cmd, tostring(r.status)) end + if not r then return nil, err, 'transport' end + if r.status == 401 or r.status == 403 then return nil, ('%s HTTP %s'):format(cmd, tostring(r.status)), 'auth_invalid' end + if r.status ~= 200 then return nil, ('%s HTTP %s'):format(cmd, tostring(r.status)), 'http' end local parsed, perr = json_decode(r.body) - if not parsed then return nil, cmd .. ' invalid JSON: ' .. tostring(perr) end - if parsed.logout then return nil, cmd .. ' returned logout=' .. tostring(parsed.logout) .. ' reason=' .. tostring(parsed.reason) end - return parsed.data or parsed, nil + if not parsed then + if auth_invalid_body(r.body) then return nil, cmd .. ' returned login page', 'auth_invalid' end + return nil, cmd .. ' invalid JSON: ' .. tostring(perr), 'parse' + end + if parsed.logout then return nil, cmd .. ' returned logout=' .. tostring(parsed.logout) .. ' reason=' .. tostring(parsed.reason), 'auth_invalid' end + return parsed.data or parsed, nil, nil +end + +local function read_commands(self, commands) + local data = {} + for _, cmd in ipairs(commands or {}) do + local d, err, code = get_cmd(self, cmd) + if not d then return nil, err or ('failed to read ' .. 
cmd), code end + data[cmd] = d + end + return data, nil, nil end local function parse_speed_mbps(v) @@ -565,11 +646,11 @@ local function build_surfaces(data) return surfaces end -local function build_snapshot(self, data) +local function build_identity(data) + data = data or {} local sys = data.sys_sysinfo or {} local home = data.home_main or {} - local poe = data.poe_poe or {} - local identity = { + return { model = home.model or home.title, hostname = sys.hostname, mac = sys.sysMac, @@ -581,20 +662,28 @@ local function build_snapshot(self, data) management_ipv4 = sys.currIpv4, management_ipv6 = sys.currIpv6, } +end + +local function base_status(self) + return { + state = 'available', + available = true, + mode = self.mode, + driver = DRIVER, + base_url = self.base_url, + login = self.logged_in and 'confirmed' or (self.disable_login and 'disabled' or 'attempted'), + } +end + +local function build_snapshot(self, data) + local poe = data.poe_poe or {} return { ok = true, provider_id = self.id, mode = self.mode, writable = false, - status = { - state = 'available', - available = true, - mode = self.mode, - driver = DRIVER, - base_url = self.base_url, - login = self.logged_in and 'confirmed' or (self.disable_login and 'disabled' or 'attempted'), - }, - identity = identity, + status = base_status(self), + identity = build_identity(data), surfaces = build_surfaces(data), topology = { lldp_local = data.lldp_local, @@ -606,6 +695,55 @@ local function build_snapshot(self, data) } end +local function build_panel(self, data) + return { + ok = true, + provider_id = self.id, + status = base_status(self), + surfaces = build_surfaces(data), + raw = self.include_raw and data or nil, + } +end + +local function build_group_observation(self, group, data) + group = tostring(group or '') + if group == 'full' then return build_snapshot(self, data) end + local out = { + ok = true, + provider_id = self.id, + group = group, + status = base_status(self), + } + + if group == 'identity' then 
+ out.identity = build_identity(data) + elseif group == 'runtime' then + out.runtime = parse_runtime(data.sys_cpumem) + elseif group == 'poe' then + out.power = parse_power(data.poe_poe) + out.surfaces = build_surfaces(data) + elseif group == 'lldp' or group == 'lldp_local' or group == 'lldp_neighbor' then + out.topology = { + lldp_local = data.lldp_local, + lldp_neighbor = data.lldp_neighbor, + } + elseif group == 'panel' or group == 'home_main' or group == 'panel_info' or group == 'port' + or group == 'vlan' or group == 'vlan_create' or group == 'vlan_conf' + or group == 'vlan_port' or group == 'vlan_membership' + or group == 'counters' + then + out.surfaces = build_surfaces(data) + elseif group == 'stats' then + out.runtime = parse_runtime(data.sys_cpumem) + out.surfaces = build_surfaces(data) + else + out.raw = self.include_raw and data or nil + end + + if self.include_raw then out.raw = data end + return out +end + local function require_http_config(config) local http = config and config.http or nil if type(http) ~= 'table' then return nil, 'http table is required' end @@ -692,18 +830,123 @@ function Provider:fetch_snapshot() local ok_login, lerr = login(self) if not ok_login then - return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, err = lerr or 'login failed' } } + return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = lerr or 'login failed' } } end - local data = {} - for _, cmd in ipairs(READ_COMMANDS) do - local d, err = get_cmd(self, cmd) - if not d then - return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, err = err or ('failed to read ' .. 
cmd) } } + local data, err, code = read_commands(self, READ_COMMANDS) + if data then return build_snapshot(self, data) end + + if code == 'auth_invalid' then + reset_session(self) + local ok_relogin, relogin_err = login(self, { force = true }) + if not ok_relogin then + return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = relogin_err or 're-login failed' } } end - data[cmd] = d + data, err, code = read_commands(self, READ_COMMANDS) + if data then return build_snapshot(self, data) end end - return build_snapshot(self, data) + + return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = self.logged_in and 'confirmed' or 'failed', err = err or 'switch snapshot failed' } } +end + +function Provider:fetch_command_group(group_name) + local group = tostring(group_name or '') + local commands = COMMAND_GROUPS[group] + if not commands then + return { + ok = false, + provider_id = self.id, + status = { state = 'unavailable', available = false, driver = DRIVER, err = 'unknown command group: ' .. 
group }, + } + end + + local ok_login, lerr = login(self) + if not ok_login then + return { ok = false, provider_id = self.id, group = group, commands = commands, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = lerr or 'login failed' } } + end + + local data, err, code = read_commands(self, commands) + if code == 'auth_invalid' then + reset_session(self) + local ok_relogin, relogin_err = login(self, { force = true }) + if not ok_relogin then + return { ok = false, provider_id = self.id, group = group, commands = commands, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = relogin_err or 're-login failed' } } + end + data, err, code = read_commands(self, commands) + end + + if not data then + return { ok = false, provider_id = self.id, group = group, commands = commands, status = { state = 'unavailable', available = false, driver = DRIVER, login = self.logged_in and 'confirmed' or 'failed', err = err or ('switch command group failed: ' .. 
group) } } + end + + return { + ok = true, + provider_id = self.id, + group = group, + commands = commands, + status = { state = 'available', available = true, driver = DRIVER, login = self.logged_in and 'confirmed' or 'disabled' }, + raw = data, + } +end + +function Provider:fetch_panel() + if self.disable_login then + local data, err = read_commands(self, PANEL_COMMANDS) + if not data then return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'disabled', err = err or 'panel read failed' } } end + return build_panel(self, data) + end + + local ok_login, lerr = login(self) + if not ok_login then + return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = lerr or 'login failed' } } + end + + local data, err, code = read_commands(self, PANEL_COMMANDS) + if code == 'auth_invalid' then + reset_session(self) + local ok_relogin, relogin_err = login(self, { force = true }) + if not ok_relogin then + return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = relogin_err or 're-login failed' } } + end + data, err, code = read_commands(self, PANEL_COMMANDS) + end + if not data then + return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = self.logged_in and 'confirmed' or 'failed', err = err or 'switch panel read failed' } } + end + return build_panel(self, data) +end + +function Provider:fetch_stats() + if self.client and type(self.client.stats) == 'function' then + local data, err = self.client:stats(self) + if not data then return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, err = err } } end + return { ok = true, provider_id = self.id, runtime = parse_runtime(data.sys_cpumem), raw = self.include_raw and data or nil } + end 
+ + local ok_login, lerr = login(self) + if not ok_login then + return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = lerr or 'login failed' } } + end + + local data, err, code = read_commands(self, STATS_COMMANDS) + if code == 'auth_invalid' then + reset_session(self) + local ok_relogin, relogin_err = login(self, { force = true }) + if not ok_relogin then + return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = relogin_err or 're-login failed' } } + end + data, err, code = read_commands(self, STATS_COMMANDS) + end + if not data then + return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = self.logged_in and 'confirmed' or 'failed', err = err or 'switch stats read failed' } } + end + return { + ok = true, + provider_id = self.id, + status = { state = 'available', available = true, driver = DRIVER, login = self.logged_in and 'confirmed' or 'disabled' }, + runtime = parse_runtime(data.sys_cpumem), + raw = self.include_raw and data or nil, + } end function Provider:fetch_snapshot_op(_req) @@ -728,10 +971,95 @@ end function Provider:snapshot_op(req) return self:fetch_snapshot_op(req) end function Provider:watch_op(req) return self:fetch_snapshot_op(req) end +function Provider:command_group_op(req) + req = req or {} + return op.guard(function () + return fibers.run_scope_op(function () + return self:fetch_command_group(req.group) + end):wrap(function (status, _report, result_or_primary, err) + if status == 'ok' then return result_or_primary, err end + return { + ok = false, + provider_id = self.id, + group = req.group, + status = { + state = 'unavailable', + available = false, + driver = DRIVER, + err = tostring(result_or_primary or status or 'command group failed'), + }, + }, nil + end) + end) +end + +function 
Provider:group_observation_op(req) + req = req or {} + return op.guard(function () + return fibers.run_scope_op(function () + local group = tostring(req.group or '') + local result = self:fetch_command_group(group) + if result.ok ~= true then return result end + return build_group_observation(self, group, result.raw or {}) + end):wrap(function (status, _report, result_or_primary, err) + if status == 'ok' then return result_or_primary, err end + return { + ok = false, + provider_id = self.id, + group = req.group, + status = { + state = 'unavailable', + available = false, + driver = DRIVER, + err = tostring(result_or_primary or status or 'group observation failed'), + }, + }, nil + end) + end) +end + +function Provider:panel_op(_req) + return op.guard(function () + return fibers.run_scope_op(function () + return self:fetch_panel() + end):wrap(function (status, _report, result_or_primary, err) + if status == 'ok' then return result_or_primary, err end + return { + ok = false, + provider_id = self.id, + status = { + state = 'unavailable', + available = false, + driver = DRIVER, + err = tostring(result_or_primary or status or 'panel read failed'), + }, + }, nil + end) + end) +end +function Provider:stats_op(_req) + return op.guard(function () + return fibers.run_scope_op(function () + return self:fetch_stats() + end):wrap(function (status, _report, result_or_primary, err) + if status == 'ok' then return result_or_primary, err end + return { + ok = false, + provider_id = self.id, + status = { + state = 'unavailable', + available = false, + driver = DRIVER, + err = tostring(result_or_primary or status or 'stats failed'), + }, + }, nil + end) + end) +end function Provider:apply_attachments_op(_req) return op.always(contract.read_only('apply_attachments')) end function Provider:set_poe_op(_req) return op.always(contract.read_only('set_poe')) end function Provider:bounce_op(_req) return op.always(contract.read_only('bounce')) end -function Provider:terminate(_reason) return 
true end +function Provider:terminate(_reason) reset_session(self); return true end M._test = { VLAN_MODE = VLAN_MODE, @@ -743,6 +1071,12 @@ M._test = { parse_runtime = parse_runtime, parse_power = parse_power, parse_surface_counters = parse_surface_counters, + build_group_observation = function(provider_like, group, data) return build_group_observation(provider_like or { id = 'switch-main', mode = 'read_only' }, group, data or {}) end, + auth_invalid_body = auth_invalid_body, + STATS_COMMANDS = STATS_COMMANDS, + PANEL_COMMANDS = PANEL_COMMANDS, + COMMAND_GROUPS = COMMAND_GROUPS, + COMMAND_GROUP_ORDER = COMMAND_GROUP_ORDER, } return M diff --git a/src/services/hal/managers/wired.lua b/src/services/hal/managers/wired.lua index 6b419518..6c3cf844 100644 --- a/src/services/hal/managers/wired.lua +++ b/src/services/hal/managers/wired.lua @@ -8,8 +8,10 @@ local fibers = require 'fibers' local op = require 'fibers.op' local channel = require 'fibers.channel' +local cond = require 'fibers.cond' local sleep = require 'fibers.sleep' local runtime = require 'fibers.runtime' +local tablex = require 'shared.table' local strict = require 'services.hal.support.strict_manager' local hal_types = require 'services.hal.types.core' @@ -29,6 +31,7 @@ local state = { controls = {}, provider_ids = {}, pollers = {}, + observations = {}, device_registered = false, } @@ -58,6 +61,58 @@ local function shallow_copy(t) return out end +local function copy(v) return tablex.deep_copy(v) end + +local function merge_table(dst, src) + dst = dst or {} + if type(src) ~= 'table' then return dst end + for k, v in pairs(src) do + if v ~= nil then + if type(v) == 'table' and type(dst[k]) == 'table' then + merge_table(dst[k], v) + else + dst[k] = copy(v) + end + end + end + return dst +end + +local function observation_cache(provider_id) + state.observations = state.observations or {} + local cache = state.observations[provider_id] + if cache == nil then + cache = { + status = {}, + identity = {}, + 
runtime = {}, + power = {}, + surfaces = {}, + topology = {}, + } + state.observations[provider_id] = cache + end + return cache +end + +local function merge_observation(provider_id, snapshot) + local cache = observation_cache(provider_id) + snapshot = snapshot or {} + if type(snapshot.status) == 'table' then merge_table(cache.status, snapshot.status) end + for _, key in ipairs({ 'identity', 'runtime', 'power', 'topology' }) do + if type(snapshot[key]) == 'table' then merge_table(cache[key], snapshot[key]) end + end + if type(snapshot.surfaces) == 'table' then + for surface_id, surface in pairs(snapshot.surfaces) do + local id = tostring(surface_id or '') + if id ~= '' and type(surface) == 'table' then + cache.surfaces[id] = merge_table(cache.surfaces[id] or {}, surface) + end + end + end + return cache +end + local function max(a, b) if a > b then return a end return b end local function list_signature(list) @@ -98,6 +153,42 @@ local function provider_poll_interval_s(config) return n, nil end +local function normalise_groups(groups, path) + if type(groups) ~= 'table' then return nil, path .. '.groups must be a non-empty array' end + local out = {} + for i = 1, #groups do + local group = groups[i] + if type(group) ~= 'string' or group == '' then return nil, path .. '.groups[' .. tostring(i) .. '] must be a non-empty string' end + out[#out + 1] = group + end + if #out == 0 then return nil, path .. '.groups must be a non-empty array' end + return out, nil +end + +local function provider_poll_plan(config) + config = config or {} + if config.poll ~= nil then + if config.poll_interval_s ~= nil then return nil, 'use poll, not poll_interval_s, for grouped wired polling' end + if type(config.poll) ~= 'table' then return nil, 'poll must be a table' end + local out = {} + for _, name in ipairs(sorted_keys(config.poll)) do + local rec = config.poll[name] + local path = 'poll.' .. tostring(name) + if type(rec) ~= 'table' then return nil, path .. 
' must be a table' end + local interval_s = tonumber(rec.interval_s) + if interval_s == nil or interval_s <= 0 then return nil, path .. '.interval_s must be a positive number' end + local groups, gerr = normalise_groups(rec.groups, path) + if not groups then return nil, gerr end + out[#out + 1] = { name = tostring(name), interval_s = interval_s, groups = groups } + end + if #out == 0 then return nil, 'poll must contain at least one poll group' end + return out, nil + end + + local interval_s, err = provider_poll_interval_s(config) + if not interval_s then return nil, err end + return { { name = 'snapshot', interval_s = interval_s, method = 'snapshot' } }, nil +end local function perform_driver_method(driver, method, opts) local opname = tostring(method) .. '_op' @@ -123,26 +214,80 @@ local function poller_is_current(provider_id, driver) return rec ~= nil and rec.driver == driver and state.drivers[provider_id] == driver end -local function poll_loop(provider_id, driver, interval_s) - while poller_is_current(provider_id, driver) do - local started = runtime.now() +local function emit_observing_once(provider_id, driver) + local rec = state.pollers and state.pollers[provider_id] or nil + if not rec or rec.observing_emitted then return true, nil end + local ok, err = emit_status_now(provider_id, { + state = 'observing', + available = false, + driver = driver.provider or driver.driver or 'wired-provider', + polling = true, + }) + if ok == true then rec.observing_emitted = true end + return ok, err +end + +local function publish_observation(provider_id, snapshot) + local cache = merge_observation(provider_id, snapshot) + return emit_snapshot_now(provider_id, cache) +end + +local function failure_status_for_plan(plan, result) + local err = result and result.err or (result and result.status and result.status.err) or 'wired provider observation failed' + local unavailable = false + if plan and plan.groups then + for _, group in ipairs(plan.groups) do + if group == 'panel' 
then unavailable = true end + end + else + unavailable = true + end + return { + state = unavailable and 'unavailable' or 'degraded', + available = not unavailable, + err = err, + poll = plan and plan.name or nil, + polling = true, + } +end + +local function perform_poll_plan(provider_id, driver, plan) + if plan.method == 'snapshot' then local result = perform_driver_method(driver, 'snapshot', {}) - if not poller_is_current(provider_id, driver) then return end + if result and result.ok == true then return publish_observation(provider_id, result) end + return emit_status_now(provider_id, failure_status_for_plan(plan, result)) + end + + for _, group in ipairs(plan.groups or {}) do + if not poller_is_current(provider_id, driver) then return true, nil end + local result = perform_driver_method(driver, 'group_observation', { group = group }) + if not poller_is_current(provider_id, driver) then return true, nil end if result and result.ok == true then - local ok, err = emit_snapshot_now(provider_id, result) - if ok ~= true then log('error', { what = 'wired_provider_poll_emit_failed', provider = provider_id, err = err }) end + local ok, err = publish_observation(provider_id, result) + if ok ~= true then return nil, err end else - local status = { - state = 'unavailable', - available = false, - err = result and result.err or 'switch snapshot failed', - polling = true, - } - local ok, err = emit_status_now(provider_id, status) - if ok ~= true then log('error', { what = 'wired_provider_poll_status_emit_failed', provider = provider_id, err = err }) end + local ok, err = emit_status_now(provider_id, failure_status_for_plan({ name = plan.name, groups = { group } }, result)) + if ok ~= true then return nil, err end end + end + return true, nil +end + +local function poll_loop(provider_id, driver, plan, ready_cond) + if ready_cond ~= nil then + fibers.perform(ready_cond:wait_op()) + if not poller_is_current(provider_id, driver) then return end + end + + local ok, err = 
emit_observing_once(provider_id, driver) + if ok ~= true then log('error', { what = 'wired_provider_initial_status_emit_failed', provider = provider_id, err = err }) end + + while poller_is_current(provider_id, driver) do + local started = runtime.now() + local ok, err = perform_poll_plan(provider_id, driver, plan) + if ok ~= true then log('error', { what = 'wired_provider_poll_emit_failed', provider = provider_id, poll = plan.name, err = err }) end local elapsed = runtime.now() - started - fibers.perform(sleep.sleep_op(max(0, interval_s - elapsed))) + fibers.perform(sleep.sleep_op(max(0, plan.interval_s - elapsed))) end end @@ -161,7 +306,7 @@ local function cancel_provider_poller(provider_id, reason) if rec.scope then rec.scope:cancel(reason or 'wired provider poller cancelled') end end -local function spawn_poll_loop(provider_id, driver, interval_s) +local function spawn_provider_poller(provider_id, driver, poll_plan, ready_cond) if not state.scope then return nil, 'wired manager scope not started' end cancel_provider_poller(provider_id, 'wired provider poller replaced') local poll_scope, scope_err = state.scope:child() @@ -170,18 +315,22 @@ local function spawn_poll_loop(provider_id, driver, interval_s) local rec = { scope = poll_scope, driver = driver, - interval_s = interval_s, + poll_plan = poll_plan, + ready_cond = ready_cond, + observing_emitted = false, } state.pollers[provider_id] = rec poll_scope:finally(function () if state.pollers and state.pollers[provider_id] == rec then state.pollers[provider_id] = nil end end) - local ok, err = poll_scope:spawn(function () poll_loop(provider_id, driver, interval_s) end) - if not ok then - if state.pollers[provider_id] == rec then state.pollers[provider_id] = nil end - poll_scope:cancel(tostring(err or 'wired provider poller spawn failed')) - return nil, err or 'wired provider poller spawn failed' + for _, plan in ipairs(poll_plan or {}) do + local ok, err = poll_scope:spawn(function () poll_loop(provider_id, 
driver, plan, ready_cond) end) + if not ok then + if state.pollers[provider_id] == rec then state.pollers[provider_id] = nil end + poll_scope:cancel(tostring(err or 'wired provider poller spawn failed')) + return nil, err or 'wired provider poller spawn failed' + end end return true, nil end @@ -221,12 +370,12 @@ local function make_caps(provider_ids) return caps end -local function device_event_op(event_type, caps) +local function device_event_op(event_type, caps, ready_cond) local ev = assert(hal_types.new.DeviceEvent(event_type, 'wired', 'main', { source = 'host', source_id = 'wired', manager = 'wired', - }, caps or {})) + }, caps or {}, ready_cond)) return state.dev_ev_ch:put_op(ev):wrap(function () return true, nil end) end @@ -278,7 +427,7 @@ end local function reconcile_device_caps(provider_ids) local new_sig = list_signature(provider_ids) local old_sig = list_signature(state.provider_ids) - if new_sig == old_sig then return true, nil end + if new_sig == old_sig then return true, nil, nil end if state.device_registered then local ok, err = fibers.perform(device_event_op('removed', {})) @@ -289,11 +438,12 @@ local function reconcile_device_caps(provider_ids) close_control_channels() state.provider_ids = {} - if #provider_ids == 0 then return true, nil end + if #provider_ids == 0 then return true, nil, nil end local caps = make_caps(provider_ids) spawn_control_loops(provider_ids) - local ok, err = fibers.perform(device_event_op('added', caps)) + local ready_cond = cond.new() + local ok, err = fibers.perform(device_event_op('added', caps, ready_cond)) if ok == false or ok == nil then close_control_channels() return nil, err or 'wired device add event failed' @@ -301,7 +451,7 @@ local function reconcile_device_caps(provider_ids) state.provider_ids = provider_ids state.device_registered = true - return true, nil + return true, nil, ready_cond end function M.start_op(logger, dev_ev_ch, cap_emit_ch, opts) @@ -320,6 +470,7 @@ function M.start_op(logger, dev_ev_ch, 
cap_emit_ch, opts) state.drivers = {} state.provider_ids = {} state.pollers = {} + state.observations = {} state.device_registered = false child:finally(function (_, status, primary) M.terminate(primary or status or 'wired manager closed') end) @@ -338,33 +489,28 @@ function M.apply_config_op(config) cancel_pollers('reconfigured') stop_drivers('reconfigured') - local ok, cerr = reconcile_device_caps(provider_ids) + state.observations = {} + local ok, cerr, caps_ready_cond = reconcile_device_caps(provider_ids) if ok ~= true then return false, cerr end for i = 1, #provider_ids do local id = provider_ids[i] local pcfg = configured_provider(config or {}, id) if not pcfg then - local eok, eerr = emit_snapshot_now(id, { status = { state = 'not_configured', available = false }, surfaces = {}, topology = {} }) - if eok ~= true then return false, eerr or 'wired provider status emit failed' end + return false, ('wired provider %s missing configuration'):format(id) else local driver_config = {} for k, v in pairs(pcfg) do driver_config[k] = v end local driver_opts = { logger = state.logger, cap_emit_ch = state.cap_emit_ch, provider_id = id } if driver_config.provider == 'rtl8380m_http' then driver_opts.http_client_for = state.http_client_for end - local poll_interval_s, poll_err = provider_poll_interval_s(driver_config) - if not poll_interval_s then return false, poll_err end + local poll_plan, poll_err = provider_poll_plan(driver_config) + if not poll_plan then return false, poll_err end + driver_config.poll = nil local driver, err = driver_mod.new(driver_config, driver_opts) if not driver then return false, ('wired provider %s create failed: %s'):format(id, tostring(err)) end + driver.provider = driver_config.provider state.drivers[id] = driver - local eok, eerr = emit_status_now(id, { - state = 'observing', - available = false, - driver = driver_config.provider, - polling = true, - }) - if eok ~= true then return false, eerr or 'wired provider status emit failed' end - 
local spawned, spawn_err = spawn_poll_loop(id, driver, poll_interval_s) + local spawned, spawn_err = spawn_provider_poller(id, driver, poll_plan, caps_ready_cond) if spawned ~= true then state.drivers[id] = nil if driver and type(driver.terminate) == 'function' then driver:terminate('poller spawn failed') end @@ -394,6 +540,7 @@ function M.terminate(reason) close_control_channels() state.provider_ids = {} state.pollers = {} + state.observations = {} state.device_registered = false if state.scope then local scope = state.scope; state.scope = nil; scope:cancel(reason or 'terminated') end state.started = false @@ -411,6 +558,7 @@ end M._test = { normalise_provider_ids = normalise_provider_ids, provider_poll_interval_s = provider_poll_interval_s, + provider_poll_plan = provider_poll_plan, } return M diff --git a/tests/integration/devhost/rtl8380m_switch_spec.lua b/tests/integration/devhost/rtl8380m_switch_spec.lua index 009ce329..8bc1503f 100644 --- a/tests/integration/devhost/rtl8380m_switch_spec.lua +++ b/tests/integration/devhost/rtl8380m_switch_spec.lua @@ -12,10 +12,13 @@ -- lua tests/run.lua -- -- The tests exercise the production provider through cap/http/main. They do --- not submit VLAN, PoE, save, reboot or other configuration writes. +-- not submit VLAN, PoE, save, reboot or other configuration writes. The fixed timing tests use http://192.168.1.1/ with admin/admin when their +-- SWITCH_TEST_FIXED_SWITCH_* flag is set. 
local busmod = require 'bus' local fibers = require 'fibers' +local pulse = require 'fibers.pulse' +local runtime = require 'fibers.runtime' local runfibers = require 'tests.support.run_fibers' local probe = require 'tests.support.bus_probe' @@ -29,6 +32,34 @@ local hal_deps = require 'services.hal.dependencies' local T = {} +local DEFAULT_TIMING_GROUPS = { + 'home_main', + 'panel_info', + 'panel', + 'identity', + 'port', + 'vlan_create', + 'vlan_conf', + 'vlan_port', + 'vlan_membership', + 'vlan', + 'poe', + 'lldp_local', + 'lldp_neighbor', + 'lldp', + 'runtime', + 'counters', + 'stats', + 'full', +} + +local DEFAULT_CONCURRENT_TIMING_GROUPS = { + 'panel', + 'poe', + 'counters', + 'runtime', +} + local function skip(reason) return { skip = true, reason = reason } end @@ -67,6 +98,82 @@ local function required_env() } end +local function fixed_stats_env() + if os.getenv('SWITCH_TEST_FIXED_SWITCH_STATS') ~= '1' then + return nil, 'set SWITCH_TEST_FIXED_SWITCH_STATS=1 to run the fixed 192.168.1.1 stats timing test' + end + return { + base_url = 'http://192.168.1.1/', + username = 'admin', + password = 'admin', + timeout_s = tonumber(os.getenv('SWITCH_TEST_STATS_TIMEOUT_S') or '1.2') or 1.2, + run_timeout_s = tonumber(os.getenv('SWITCH_TEST_RUN_TIMEOUT_S') or '30') or 30, + stats_budget_s = tonumber(os.getenv('SWITCH_TEST_STATS_BUDGET_S') or '2.5') or 2.5, + openssl_bin = os.getenv('SWITCH_TEST_OPENSSL') or os.getenv('SWITCH_OPENSSL') or 'openssl', + } +end + +local function fixed_panel_env() + if os.getenv('SWITCH_TEST_FIXED_SWITCH_PANEL_TIMING') ~= '1' then + return nil, 'set SWITCH_TEST_FIXED_SWITCH_PANEL_TIMING=1 to run the fixed 192.168.1.1 panel timing test' + end + return { + base_url = 'http://192.168.1.1/', + username = 'admin', + password = 'admin', + timeout_s = tonumber(os.getenv('SWITCH_TEST_PANEL_TIMEOUT_S') or '0.75') or 0.75, + run_timeout_s = tonumber(os.getenv('SWITCH_TEST_RUN_TIMEOUT_S') or '30') or 30, + panel_budget_s = 
tonumber(os.getenv('SWITCH_TEST_PANEL_BUDGET_S') or '1.0') or 1.0, + panel_iterations = tonumber(os.getenv('SWITCH_TEST_PANEL_ITERATIONS') or '10') or 10, + openssl_bin = os.getenv('SWITCH_TEST_OPENSSL') or os.getenv('SWITCH_OPENSSL') or 'openssl', + } +end + +local function split_csv(s) + local out = {} + for token in tostring(s or ''):gmatch('[^,%s]+') do out[#out + 1] = token end + return out +end + +local function fixed_timing_env() + if os.getenv('SWITCH_TEST_FIXED_SWITCH_TIMING') ~= '1' then + return nil, 'set SWITCH_TEST_FIXED_SWITCH_TIMING=1 to run the fixed 192.168.1.1 command timing sweep' + end + local groups = split_csv(os.getenv('SWITCH_TEST_TIMING_GROUPS')) + if #groups == 0 then groups = DEFAULT_TIMING_GROUPS end + return { + base_url = 'http://192.168.1.1/', + username = 'admin', + password = 'admin', + timeout_s = tonumber(os.getenv('SWITCH_TEST_TIMING_TIMEOUT_S') or '2.5') or 2.5, + run_timeout_s = tonumber(os.getenv('SWITCH_TEST_RUN_TIMEOUT_S') or '60') or 60, + iterations = tonumber(os.getenv('SWITCH_TEST_TIMING_ITERATIONS') or '3') or 3, + require_all = os.getenv('SWITCH_TEST_TIMING_REQUIRE_ALL') == '1', + groups = groups, + openssl_bin = os.getenv('SWITCH_TEST_OPENSSL') or os.getenv('SWITCH_OPENSSL') or 'openssl', + } +end + +local function fixed_concurrent_timing_env() + if os.getenv('SWITCH_TEST_FIXED_SWITCH_CONCURRENT_TIMING') ~= '1' then + return nil, 'set SWITCH_TEST_FIXED_SWITCH_CONCURRENT_TIMING=1 to run the fixed 192.168.1.1 concurrent command timing test' + end + local groups = split_csv(os.getenv('SWITCH_TEST_CONCURRENT_GROUPS')) + if #groups == 0 then groups = DEFAULT_CONCURRENT_TIMING_GROUPS end + return { + base_url = 'http://192.168.1.1/', + username = 'admin', + password = 'admin', + timeout_s = tonumber(os.getenv('SWITCH_TEST_CONCURRENT_TIMEOUT_S') or '2.5') or 2.5, + run_timeout_s = tonumber(os.getenv('SWITCH_TEST_RUN_TIMEOUT_S') or '60') or 60, + iterations = tonumber(os.getenv('SWITCH_TEST_CONCURRENT_ITERATIONS') or '3') or 
3, + require_all = os.getenv('SWITCH_TEST_CONCURRENT_REQUIRE_ALL') == '1', + budget_s = tonumber(os.getenv('SWITCH_TEST_CONCURRENT_BUDGET_S') or ''), + groups = groups, + openssl_bin = os.getenv('SWITCH_TEST_OPENSSL') or os.getenv('SWITCH_OPENSSL') or 'openssl', + } +end + local function wait_http_available(bus) local reader = bus:connect({ origin_base = { kind = 'local', component = 'test-http-reader' } }) probe.wait_retained_payload(reader, { 'cap', 'http', 'main', 'status' }, { @@ -113,6 +220,86 @@ local function count_poe_surfaces(surfaces) return n end +local function count_link_states(surfaces) + local up, down = 0, 0 + for _, surface in pairs(surfaces or {}) do + local state = surface.link and surface.link.state + if state == 'up' then up = up + 1 + elseif state == 'down' then down = down + 1 end + end + return up, down +end + +local function count_table_keys(t) + local n = 0 + for _ in pairs(t or {}) do n = n + 1 end + return n +end + +local function command_result_raw_count(result) + return count_table_keys(result and result.raw or nil) +end + + +local function summarise_command_result(result) + if result and result.ok == true then + return true, command_result_raw_count(result), nil + end + local status = result and result.status or {} + return false, 0, tostring(status.err or (result and result.err) or 'unknown error') +end + +local function run_command_groups_sequential(provider, groups) + local results = {} + local started = runtime.now() + for i, group in ipairs(groups) do + local group_started = runtime.now() + local result = fibers.perform(provider:command_group_op({ group = group })) + local ok, raw_count, err = summarise_command_result(result) + results[i] = { group = group, ok = ok, raw_count = raw_count, err = err, elapsed_s = runtime.now() - group_started } + end + return runtime.now() - started, results +end + +local function run_command_groups_concurrent(scope, provider, groups) + local results = {} + local done = pulse.new() + local seen = 
done:version() + local remaining = #groups + local started = runtime.now() + + if remaining == 0 then return 0, results end + + for i, group in ipairs(groups) do + local ok_spawn, spawn_err = scope:spawn(function () + local group_started = runtime.now() + local result = fibers.perform(provider:command_group_op({ group = group })) + local ok, raw_count, err = summarise_command_result(result) + results[i] = { group = group, ok = ok, raw_count = raw_count, err = err, elapsed_s = runtime.now() - group_started } + remaining = remaining - 1 + if remaining == 0 then done:signal() end + end) + if not ok_spawn then error(spawn_err, 2) end + end + + if remaining > 0 then fibers.perform(done:changed_op(seen)) end + return runtime.now() - started, results +end + +local function count_command_result_failures(results) + local ok_count, fail_count, raw_total, last_err = 0, 0, 0, nil + for _, result in ipairs(results or {}) do + if result.ok == true then + ok_count = ok_count + 1 + raw_total = raw_total + (result.raw_count or 0) + else + fail_count = fail_count + 1 + last_err = result.group .. ': ' .. tostring(result.err or 'failed') + end + end + return ok_count, fail_count, raw_total, last_err +end + local function has_known_vlan_mode(surface) local mode = surface and surface.attachment and surface.attachment.mode return mode == nil or mode == 'hybrid' or mode == 'access' or mode == 'trunk' or mode == 'tunnel' @@ -348,4 +535,247 @@ function T.rtl8380m_real_switch_raw_observations_project_to_state_wired() end, { timeout = env.run_timeout_s }) end + +function T.rtl8380m_fixed_switch_admin_stats_within_allotted_time() + local env, err = fixed_stats_env() + if not env then return skip(err) end + + runfibers.run(function () + local b = busmod.new() + local http = start_http_capability(b, env) + wait_http_available(b) + local provider = new_real_switch_provider(b, env) + + -- Warm the authenticated session once. 
The timing assertion below is for + -- the stats read using the retained session, not for RSA login. + local warm = require_successful_snapshot(provider) + assert_eq(warm.status.login, 'confirmed') + + local started = runtime.now() + local stats = fibers.perform(provider:stats_op({})) + local elapsed = runtime.now() - started + assert_not_nil(stats, 'stats_op should return a table') + if stats.ok ~= true then + local status = stats.status or {} + error('switch stats failed: ' .. tostring(status.err or stats.err or 'unknown error'), 2) + end + assert_true(elapsed <= env.stats_budget_s, ('stats elapsed %.3fs exceeded budget %.3fs'):format(elapsed, env.stats_budget_s)) + assert_eq(stats.status.login, 'confirmed') + assert_not_nil(stats.runtime, 'stats should include runtime') + assert_not_nil(stats.runtime.cpu, 'stats should include runtime.cpu') + assert_not_nil(stats.runtime.memory, 'stats should include runtime.memory') + assert_not_nil(stats.raw, 'include_raw=true should preserve stats payloads') + assert_not_nil(stats.raw.sys_cpumem, 'stats raw sys_cpumem should be captured') + assert_not_nil(stats.raw.rmon_statistics, 'stats raw rmon_statistics should be captured') + + provider:terminate('test complete') + http:terminate('test complete') + end, { timeout = env.run_timeout_s }) +end + +function T.rtl8380m_fixed_switch_admin_panel_timing() + local env, err = fixed_panel_env() + if not env then return skip(err) end + + runfibers.run(function () + local b = busmod.new() + local http = start_http_capability(b, env) + wait_http_available(b) + local provider = new_real_switch_provider(b, env) + + -- Warm login using the same cheap panel path. The timed loop below + -- measures retained-session panel reads only; it does not run a full + -- switch snapshot and therefore does not depend on sys_cpumem or RMON. 
+ local warm = fibers.perform(provider:panel_op({})) + assert_not_nil(warm, 'panel_op should return a table') + if warm.ok ~= true then + local status = warm.status or {} + error('switch panel warm-up failed: ' .. tostring(status.err or warm.err or 'unknown error'), 2) + end + assert_eq(warm.status.login, 'confirmed') + + local min_s, max_s, total_s = nil, 0, 0 + local last_panel + for _ = 1, env.panel_iterations do + local started = runtime.now() + local panel = fibers.perform(provider:panel_op({})) + local elapsed = runtime.now() - started + assert_not_nil(panel, 'panel_op should return a table') + if panel.ok ~= true then + local status = panel.status or {} + error('switch panel read failed: ' .. tostring(status.err or panel.err or 'unknown error'), 2) + end + last_panel = panel + if min_s == nil or elapsed < min_s then min_s = elapsed end + if elapsed > max_s then max_s = elapsed end + total_s = total_s + elapsed + assert_true(elapsed <= env.panel_budget_s, ('panel read elapsed %.3fs exceeded budget %.3fs'):format(elapsed, env.panel_budget_s)) + end + + local surface_count = count_surfaces_with_prefix(last_panel.surfaces, 'GE') + local up, down = count_link_states(last_panel.surfaces) + assert_true(surface_count >= 10, 'panel read should expose the ten GE switch surfaces') + assert_true((up + down) >= 10, 'panel read should expose link state for switch surfaces') + io.stderr:write(('rtl8380m panel timing: n=%d min=%.3fs avg=%.3fs max=%.3fs ge=%d link_up=%d link_down=%d\n'):format( + env.panel_iterations, + min_s or 0, + total_s / env.panel_iterations, + max_s, + surface_count, + up, + down + )) + + provider:terminate('test complete') + http:terminate('test complete') + end, { timeout = env.run_timeout_s }) +end + + +function T.rtl8380m_fixed_switch_admin_command_timing_sweep() + local env, err = fixed_timing_env() + if not env then return skip(err) end + + runfibers.run(function () + local b = busmod.new() + local http = start_http_capability(b, env) + 
wait_http_available(b) + local provider = new_real_switch_provider(b, env) + + local warm = fibers.perform(provider:panel_op({})) + assert_not_nil(warm, 'panel warm-up should return a table') + if warm.ok ~= true then + local status = warm.status or {} + error('switch panel warm-up failed: ' .. tostring(status.err or warm.err or 'unknown error'), 2) + end + + local any_ok = false + local failures = {} + for _, group in ipairs(env.groups) do + local min_s, max_s, total_s = nil, 0, 0 + local ok_count, fail_count, raw_count = 0, 0, 0 + local last_err + for _ = 1, env.iterations do + local started = runtime.now() + local result = fibers.perform(provider:command_group_op({ group = group })) + local elapsed = runtime.now() - started + if min_s == nil or elapsed < min_s then min_s = elapsed end + if elapsed > max_s then max_s = elapsed end + total_s = total_s + elapsed + if result and result.ok == true then + ok_count = ok_count + 1 + any_ok = true + raw_count = command_result_raw_count(result) + else + fail_count = fail_count + 1 + local status = result and result.status or {} + last_err = tostring(status.err or (result and result.err) or 'unknown error') + end + end + local avg_s = total_s / env.iterations + io.stderr:write(('rtl8380m command timing: group=%s n=%d ok=%d fail=%d min=%.3fs avg=%.3fs max=%.3fs raw_keys=%d%s\n'):format( + group, + env.iterations, + ok_count, + fail_count, + min_s or 0, + avg_s, + max_s, + raw_count, + last_err and (' last_err=' .. last_err) or '' + )) + if env.require_all and fail_count > 0 then failures[#failures + 1] = group .. ': ' .. tostring(last_err or 'failed') end + end + + assert_true(any_ok, 'at least one switch command timing group should succeed') + if env.require_all and #failures > 0 then error('switch timing failures: ' .. 
table.concat(failures, '; '), 2) end + + provider:terminate('test complete') + http:terminate('test complete') + end, { timeout = env.run_timeout_s }) +end + +function T.rtl8380m_fixed_switch_admin_concurrent_command_timing() + local env, err = fixed_concurrent_timing_env() + if not env then return skip(err) end + + runfibers.run(function (scope) + local b = busmod.new() + local http = start_http_capability(b, env) + wait_http_available(b) + local provider = new_real_switch_provider(b, env) + + local warm = fibers.perform(provider:panel_op({})) + assert_not_nil(warm, 'panel warm-up should return a table') + if warm.ok ~= true then + local status = warm.status or {} + error('switch panel warm-up failed: ' .. tostring(status.err or warm.err or 'unknown error'), 2) + end + + local seq_total, conc_total = 0, 0 + local best_seq, best_conc, worst_seq, worst_conc = nil, nil, 0, 0 + local any_ok = false + local failures = {} + local group_list = table.concat(env.groups, ',') + + for i = 1, env.iterations do + local seq_elapsed, seq_results = run_command_groups_sequential(provider, env.groups) + local seq_ok, seq_fail, seq_raw, seq_err = count_command_result_failures(seq_results) + + local conc_elapsed, conc_results = run_command_groups_concurrent(scope, provider, env.groups) + local conc_ok, conc_fail, conc_raw, conc_err = count_command_result_failures(conc_results) + + seq_total = seq_total + seq_elapsed + conc_total = conc_total + conc_elapsed + if best_seq == nil or seq_elapsed < best_seq then best_seq = seq_elapsed end + if best_conc == nil or conc_elapsed < best_conc then best_conc = conc_elapsed end + if seq_elapsed > worst_seq then worst_seq = seq_elapsed end + if conc_elapsed > worst_conc then worst_conc = conc_elapsed end + if seq_ok > 0 or conc_ok > 0 then any_ok = true end + + local speedup = conc_elapsed > 0 and (seq_elapsed / conc_elapsed) or 0 + io.stderr:write(('rtl8380m concurrent timing: iter=%d groups=%s sequential=%.3fs concurrent=%.3fs speedup=%.2fx 
seq_ok=%d seq_fail=%d conc_ok=%d conc_fail=%d raw_seq=%d raw_conc=%d%s%s\n'):format( + i, + group_list, + seq_elapsed, + conc_elapsed, + speedup, + seq_ok, + seq_fail, + conc_ok, + conc_fail, + seq_raw, + conc_raw, + seq_err and (' seq_err=' .. seq_err) or '', + conc_err and (' conc_err=' .. conc_err) or '' + )) + + if env.budget_s and conc_elapsed > env.budget_s then + failures[#failures + 1] = ('iteration %d concurrent %.3fs exceeded budget %.3fs'):format(i, conc_elapsed, env.budget_s) + end + if env.require_all and (seq_fail > 0 or conc_fail > 0) then + failures[#failures + 1] = ('iteration %d failures: seq=%s conc=%s'):format(i, tostring(seq_err), tostring(conc_err)) + end + end + + io.stderr:write(('rtl8380m concurrent timing summary: n=%d groups=%s seq_avg=%.3fs seq_min=%.3fs seq_max=%.3fs conc_avg=%.3fs conc_min=%.3fs conc_max=%.3fs speedup=%.2fx\n'):format( + env.iterations, + group_list, + seq_total / env.iterations, + best_seq or 0, + worst_seq, + conc_total / env.iterations, + best_conc or 0, + worst_conc, + conc_total > 0 and (seq_total / conc_total) or 0 + )) + + assert_true(any_ok, 'at least one sequential or concurrent command group read should succeed') + if #failures > 0 then error('switch concurrent timing failures: ' .. 
table.concat(failures, '; '), 2) end + + provider:terminate('test complete') + http:terminate('test complete') + end, { timeout = env.run_timeout_s }) +end + return T From 2d8af4baa395cafabda851e1fb4551b8b104dde1 Mon Sep 17 00:00:00 2001 From: Rich Thanki Date: Wed, 17 Jun 2026 23:43:40 +0000 Subject: [PATCH 3/6] cleanup --- docs/switch.md | 38 +- .../wired/providers/rtl8380m_http.lua | 191 +------- .../hal/backends/wired/providers/static.lua | 1 - src/services/hal/managers/wired.lua | 1 + .../devhost/rtl8380m_switch_spec.lua | 430 +----------------- 5 files changed, 24 insertions(+), 637 deletions(-) diff --git a/docs/switch.md b/docs/switch.md index e96d240b..e1db1de6 100644 --- a/docs/switch.md +++ b/docs/switch.md @@ -189,33 +189,31 @@ lldp_local lldp_neighbor ``` -The provider also has narrower read paths for timing and grouped polling. Surface-bearing groups include `home_main` so that rows can be attached to the canonical switch surface names (`GE1` ... `GE10`): +The provider has narrow read groups for grouped polling. Surface-bearing groups include `home_main` so that rows can be attached to the canonical switch surface names (`GE1` ... `GE10`): ```text -panel/link path: home_main, panel_info -identity path: sys_sysinfo -port path: home_main, port_port -vlan path: home_main, vlan_create, vlan_conf, vlan_port, vlan_membership -poe path: home_main, poe_poe -lldp path: lldp_local, lldp_neighbor -runtime path: sys_cpumem -counters path: home_main, rmon_statistics -stats path: sys_cpumem, rmon_statistics -full path: all read-side commands +panel path: home_main, panel_info +identity path: sys_sysinfo +vlan path: home_main, vlan_create, vlan_conf, vlan_port, vlan_membership +poe path: home_main, poe_poe +lldp path: lldp_local, lldp_neighbor +runtime path: sys_cpumem +counters path: home_main, rmon_statistics ``` -The panel/link path is the cheapest source of the switch front-panel state: which GE/SFP surfaces are present and whether they are connected. 
It deliberately avoids `sys_cpumem` and `rmon_statistics`, which can be slower on this switch. The devhost timing sweep can time each group separately against the fixed test switch: +The poll plan is based on timings measured against the fixed RTL8380M switch on 192.168.1.1 using a retained admin session: -```sh -cd tests -SWITCH_TEST_FIXED_SWITCH_TIMING=1 \ -SWITCH_TEST_TIMING_TIMEOUT_S=2.5 \ -SWITCH_TEST_TIMING_ITERATIONS=3 \ -TEST_FILTER=rtl8380m_fixed_switch_admin_command_timing_sweep \ -luajit run.lua +```text +panel avg 0.303 s, max 0.344 s +vlan avg 0.363 s, max 0.389 s +poe avg 0.077 s, max 0.084 s +lldp avg 0.160 s, max 0.183 s +counters avg 0.203 s, max 0.257 s +runtime avg 2.085 s, max 2.089 s +full read avg 3.522 s, max 3.554 s, with observed timeout ``` -Use `SWITCH_TEST_TIMING_GROUPS=panel,poe,runtime,counters` to restrict the sweep, and `SWITCH_TEST_TIMING_REQUIRE_ALL=1` when failures should fail the test rather than only being reported. +A concurrent probe over `panel,poe,counters,runtime` improved wall-clock time only modestly, from 2.688 s sequential average to 2.309 s concurrent average, so the production poller remains grouped and sequential. 
The driver captures the full snapshot into normalised provider observations: diff --git a/src/services/hal/backends/wired/providers/rtl8380m_http.lua b/src/services/hal/backends/wired/providers/rtl8380m_http.lua index a499b738..4031b6a3 100644 --- a/src/services/hal/backends/wired/providers/rtl8380m_http.lua +++ b/src/services/hal/backends/wired/providers/rtl8380m_http.lua @@ -39,56 +39,14 @@ local READ_COMMANDS = { 'rmon_statistics', } -local STATS_COMMANDS = { - 'sys_cpumem', - 'rmon_statistics', -} - -local PANEL_COMMANDS = { - 'home_main', - 'panel_info', -} - local COMMAND_GROUPS = { - home_main = { 'home_main' }, - panel_info = { 'panel_info' }, - panel = PANEL_COMMANDS, + panel = { 'home_main', 'panel_info' }, identity = { 'sys_sysinfo' }, - port = { 'home_main', 'port_port' }, - vlan_create = { 'home_main', 'vlan_create' }, - vlan_conf = { 'home_main', 'vlan_conf' }, - vlan_port = { 'home_main', 'vlan_port' }, - vlan_membership = { 'home_main', 'vlan_membership' }, vlan = { 'home_main', 'vlan_create', 'vlan_conf', 'vlan_port', 'vlan_membership' }, poe = { 'home_main', 'poe_poe' }, - lldp_local = { 'lldp_local' }, - lldp_neighbor = { 'lldp_neighbor' }, lldp = { 'lldp_local', 'lldp_neighbor' }, runtime = { 'sys_cpumem' }, counters = { 'home_main', 'rmon_statistics' }, - stats = STATS_COMMANDS, - full = READ_COMMANDS, -} - -local COMMAND_GROUP_ORDER = { - 'home_main', - 'panel_info', - 'panel', - 'identity', - 'port', - 'vlan_create', - 'vlan_conf', - 'vlan_port', - 'vlan_membership', - 'vlan', - 'poe', - 'lldp_local', - 'lldp_neighbor', - 'lldp', - 'runtime', - 'counters', - 'stats', - 'full', } local VLAN_MODE = { @@ -695,19 +653,8 @@ local function build_snapshot(self, data) } end -local function build_panel(self, data) - return { - ok = true, - provider_id = self.id, - status = base_status(self), - surfaces = build_surfaces(data), - raw = self.include_raw and data or nil, - } -end - local function build_group_observation(self, group, data) group = 
tostring(group or '') - if group == 'full' then return build_snapshot(self, data) end local out = { ok = true, provider_id = self.id, @@ -722,22 +669,15 @@ local function build_group_observation(self, group, data) elseif group == 'poe' then out.power = parse_power(data.poe_poe) out.surfaces = build_surfaces(data) - elseif group == 'lldp' or group == 'lldp_local' or group == 'lldp_neighbor' then + elseif group == 'lldp' then out.topology = { lldp_local = data.lldp_local, lldp_neighbor = data.lldp_neighbor, } - elseif group == 'panel' or group == 'home_main' or group == 'panel_info' or group == 'port' - or group == 'vlan' or group == 'vlan_create' or group == 'vlan_conf' - or group == 'vlan_port' or group == 'vlan_membership' - or group == 'counters' - then - out.surfaces = build_surfaces(data) - elseif group == 'stats' then - out.runtime = parse_runtime(data.sys_cpumem) + elseif group == 'panel' or group == 'vlan' or group == 'counters' then out.surfaces = build_surfaces(data) else - out.raw = self.include_raw and data or nil + return { ok = false, provider_id = self.id, group = group, status = { state = 'unavailable', available = false, driver = DRIVER, err = 'unknown command group: ' .. 
group } } end if self.include_raw then out.raw = data end @@ -762,7 +702,6 @@ local CONFIG_FIELDS = { username = true, password = true, timeout_s = true, - poll_interval_s = true, http = true, openssl_bin = true, disable_login = true, @@ -889,66 +828,6 @@ function Provider:fetch_command_group(group_name) } end -function Provider:fetch_panel() - if self.disable_login then - local data, err = read_commands(self, PANEL_COMMANDS) - if not data then return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'disabled', err = err or 'panel read failed' } } end - return build_panel(self, data) - end - - local ok_login, lerr = login(self) - if not ok_login then - return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = lerr or 'login failed' } } - end - - local data, err, code = read_commands(self, PANEL_COMMANDS) - if code == 'auth_invalid' then - reset_session(self) - local ok_relogin, relogin_err = login(self, { force = true }) - if not ok_relogin then - return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = relogin_err or 're-login failed' } } - end - data, err, code = read_commands(self, PANEL_COMMANDS) - end - if not data then - return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = self.logged_in and 'confirmed' or 'failed', err = err or 'switch panel read failed' } } - end - return build_panel(self, data) -end - -function Provider:fetch_stats() - if self.client and type(self.client.stats) == 'function' then - local data, err = self.client:stats(self) - if not data then return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, err = err } } end - return { ok = true, provider_id = self.id, runtime = 
parse_runtime(data.sys_cpumem), raw = self.include_raw and data or nil } - end - - local ok_login, lerr = login(self) - if not ok_login then - return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = lerr or 'login failed' } } - end - - local data, err, code = read_commands(self, STATS_COMMANDS) - if code == 'auth_invalid' then - reset_session(self) - local ok_relogin, relogin_err = login(self, { force = true }) - if not ok_relogin then - return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = relogin_err or 're-login failed' } } - end - data, err, code = read_commands(self, STATS_COMMANDS) - end - if not data then - return { ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, login = self.logged_in and 'confirmed' or 'failed', err = err or 'switch stats read failed' } } - end - return { - ok = true, - provider_id = self.id, - status = { state = 'available', available = true, driver = DRIVER, login = self.logged_in and 'confirmed' or 'disabled' }, - runtime = parse_runtime(data.sys_cpumem), - raw = self.include_raw and data or nil, - } -end - function Provider:fetch_snapshot_op(_req) return op.guard(function () return fibers.run_scope_op(function () @@ -971,27 +850,6 @@ end function Provider:snapshot_op(req) return self:fetch_snapshot_op(req) end function Provider:watch_op(req) return self:fetch_snapshot_op(req) end -function Provider:command_group_op(req) - req = req or {} - return op.guard(function () - return fibers.run_scope_op(function () - return self:fetch_command_group(req.group) - end):wrap(function (status, _report, result_or_primary, err) - if status == 'ok' then return result_or_primary, err end - return { - ok = false, - provider_id = self.id, - group = req.group, - status = { - state = 'unavailable', - available = false, - driver = DRIVER, 
- err = tostring(result_or_primary or status or 'command group failed'), - }, - }, nil - end) - end) -end function Provider:group_observation_op(req) req = req or {} @@ -1018,44 +876,6 @@ function Provider:group_observation_op(req) end) end -function Provider:panel_op(_req) - return op.guard(function () - return fibers.run_scope_op(function () - return self:fetch_panel() - end):wrap(function (status, _report, result_or_primary, err) - if status == 'ok' then return result_or_primary, err end - return { - ok = false, - provider_id = self.id, - status = { - state = 'unavailable', - available = false, - driver = DRIVER, - err = tostring(result_or_primary or status or 'panel read failed'), - }, - }, nil - end) - end) -end -function Provider:stats_op(_req) - return op.guard(function () - return fibers.run_scope_op(function () - return self:fetch_stats() - end):wrap(function (status, _report, result_or_primary, err) - if status == 'ok' then return result_or_primary, err end - return { - ok = false, - provider_id = self.id, - status = { - state = 'unavailable', - available = false, - driver = DRIVER, - err = tostring(result_or_primary or status or 'stats failed'), - }, - }, nil - end) - end) -end function Provider:apply_attachments_op(_req) return op.always(contract.read_only('apply_attachments')) end function Provider:set_poe_op(_req) return op.always(contract.read_only('set_poe')) end function Provider:bounce_op(_req) return op.always(contract.read_only('bounce')) end @@ -1073,10 +893,7 @@ M._test = { parse_surface_counters = parse_surface_counters, build_group_observation = function(provider_like, group, data) return build_group_observation(provider_like or { id = 'switch-main', mode = 'read_only' }, group, data or {}) end, auth_invalid_body = auth_invalid_body, - STATS_COMMANDS = STATS_COMMANDS, - PANEL_COMMANDS = PANEL_COMMANDS, COMMAND_GROUPS = COMMAND_GROUPS, - COMMAND_GROUP_ORDER = COMMAND_GROUP_ORDER, } return M diff --git 
a/src/services/hal/backends/wired/providers/static.lua b/src/services/hal/backends/wired/providers/static.lua index 4fe3d36f..ad1a47d1 100644 --- a/src/services/hal/backends/wired/providers/static.lua +++ b/src/services/hal/backends/wired/providers/static.lua @@ -17,7 +17,6 @@ local CONFIG_FIELDS = { surfaces = true, topology = true, meta = true, - poll_interval_s = true, } local function check_allowed_config(config) diff --git a/src/services/hal/managers/wired.lua b/src/services/hal/managers/wired.lua index 6c3cf844..4ec6ab8d 100644 --- a/src/services/hal/managers/wired.lua +++ b/src/services/hal/managers/wired.lua @@ -506,6 +506,7 @@ function M.apply_config_op(config) local poll_plan, poll_err = provider_poll_plan(driver_config) if not poll_plan then return false, poll_err end driver_config.poll = nil + driver_config.poll_interval_s = nil local driver, err = driver_mod.new(driver_config, driver_opts) if not driver then return false, ('wired provider %s create failed: %s'):format(id, tostring(err)) end driver.provider = driver_config.provider diff --git a/tests/integration/devhost/rtl8380m_switch_spec.lua b/tests/integration/devhost/rtl8380m_switch_spec.lua index 8bc1503f..4012abe2 100644 --- a/tests/integration/devhost/rtl8380m_switch_spec.lua +++ b/tests/integration/devhost/rtl8380m_switch_spec.lua @@ -12,13 +12,10 @@ -- lua tests/run.lua -- -- The tests exercise the production provider through cap/http/main. They do --- not submit VLAN, PoE, save, reboot or other configuration writes. The fixed timing tests use http://192.168.1.1/ with admin/admin when their --- SWITCH_TEST_FIXED_SWITCH_* flag is set. +-- not submit VLAN, PoE, save, reboot or other configuration writes. 
local busmod = require 'bus' local fibers = require 'fibers' -local pulse = require 'fibers.pulse' -local runtime = require 'fibers.runtime' local runfibers = require 'tests.support.run_fibers' local probe = require 'tests.support.bus_probe' @@ -32,34 +29,6 @@ local hal_deps = require 'services.hal.dependencies' local T = {} -local DEFAULT_TIMING_GROUPS = { - 'home_main', - 'panel_info', - 'panel', - 'identity', - 'port', - 'vlan_create', - 'vlan_conf', - 'vlan_port', - 'vlan_membership', - 'vlan', - 'poe', - 'lldp_local', - 'lldp_neighbor', - 'lldp', - 'runtime', - 'counters', - 'stats', - 'full', -} - -local DEFAULT_CONCURRENT_TIMING_GROUPS = { - 'panel', - 'poe', - 'counters', - 'runtime', -} - local function skip(reason) return { skip = true, reason = reason } end @@ -98,82 +67,6 @@ local function required_env() } end -local function fixed_stats_env() - if os.getenv('SWITCH_TEST_FIXED_SWITCH_STATS') ~= '1' then - return nil, 'set SWITCH_TEST_FIXED_SWITCH_STATS=1 to run the fixed 192.168.1.1 stats timing test' - end - return { - base_url = 'http://192.168.1.1/', - username = 'admin', - password = 'admin', - timeout_s = tonumber(os.getenv('SWITCH_TEST_STATS_TIMEOUT_S') or '1.2') or 1.2, - run_timeout_s = tonumber(os.getenv('SWITCH_TEST_RUN_TIMEOUT_S') or '30') or 30, - stats_budget_s = tonumber(os.getenv('SWITCH_TEST_STATS_BUDGET_S') or '2.5') or 2.5, - openssl_bin = os.getenv('SWITCH_TEST_OPENSSL') or os.getenv('SWITCH_OPENSSL') or 'openssl', - } -end - -local function fixed_panel_env() - if os.getenv('SWITCH_TEST_FIXED_SWITCH_PANEL_TIMING') ~= '1' then - return nil, 'set SWITCH_TEST_FIXED_SWITCH_PANEL_TIMING=1 to run the fixed 192.168.1.1 panel timing test' - end - return { - base_url = 'http://192.168.1.1/', - username = 'admin', - password = 'admin', - timeout_s = tonumber(os.getenv('SWITCH_TEST_PANEL_TIMEOUT_S') or '0.75') or 0.75, - run_timeout_s = tonumber(os.getenv('SWITCH_TEST_RUN_TIMEOUT_S') or '30') or 30, - panel_budget_s = 
tonumber(os.getenv('SWITCH_TEST_PANEL_BUDGET_S') or '1.0') or 1.0, - panel_iterations = tonumber(os.getenv('SWITCH_TEST_PANEL_ITERATIONS') or '10') or 10, - openssl_bin = os.getenv('SWITCH_TEST_OPENSSL') or os.getenv('SWITCH_OPENSSL') or 'openssl', - } -end - -local function split_csv(s) - local out = {} - for token in tostring(s or ''):gmatch('[^,%s]+') do out[#out + 1] = token end - return out -end - -local function fixed_timing_env() - if os.getenv('SWITCH_TEST_FIXED_SWITCH_TIMING') ~= '1' then - return nil, 'set SWITCH_TEST_FIXED_SWITCH_TIMING=1 to run the fixed 192.168.1.1 command timing sweep' - end - local groups = split_csv(os.getenv('SWITCH_TEST_TIMING_GROUPS')) - if #groups == 0 then groups = DEFAULT_TIMING_GROUPS end - return { - base_url = 'http://192.168.1.1/', - username = 'admin', - password = 'admin', - timeout_s = tonumber(os.getenv('SWITCH_TEST_TIMING_TIMEOUT_S') or '2.5') or 2.5, - run_timeout_s = tonumber(os.getenv('SWITCH_TEST_RUN_TIMEOUT_S') or '60') or 60, - iterations = tonumber(os.getenv('SWITCH_TEST_TIMING_ITERATIONS') or '3') or 3, - require_all = os.getenv('SWITCH_TEST_TIMING_REQUIRE_ALL') == '1', - groups = groups, - openssl_bin = os.getenv('SWITCH_TEST_OPENSSL') or os.getenv('SWITCH_OPENSSL') or 'openssl', - } -end - -local function fixed_concurrent_timing_env() - if os.getenv('SWITCH_TEST_FIXED_SWITCH_CONCURRENT_TIMING') ~= '1' then - return nil, 'set SWITCH_TEST_FIXED_SWITCH_CONCURRENT_TIMING=1 to run the fixed 192.168.1.1 concurrent command timing test' - end - local groups = split_csv(os.getenv('SWITCH_TEST_CONCURRENT_GROUPS')) - if #groups == 0 then groups = DEFAULT_CONCURRENT_TIMING_GROUPS end - return { - base_url = 'http://192.168.1.1/', - username = 'admin', - password = 'admin', - timeout_s = tonumber(os.getenv('SWITCH_TEST_CONCURRENT_TIMEOUT_S') or '2.5') or 2.5, - run_timeout_s = tonumber(os.getenv('SWITCH_TEST_RUN_TIMEOUT_S') or '60') or 60, - iterations = tonumber(os.getenv('SWITCH_TEST_CONCURRENT_ITERATIONS') or '3') or 
3, - require_all = os.getenv('SWITCH_TEST_CONCURRENT_REQUIRE_ALL') == '1', - budget_s = tonumber(os.getenv('SWITCH_TEST_CONCURRENT_BUDGET_S') or ''), - groups = groups, - openssl_bin = os.getenv('SWITCH_TEST_OPENSSL') or os.getenv('SWITCH_OPENSSL') or 'openssl', - } -end - local function wait_http_available(bus) local reader = bus:connect({ origin_base = { kind = 'local', component = 'test-http-reader' } }) probe.wait_retained_payload(reader, { 'cap', 'http', 'main', 'status' }, { @@ -220,86 +113,6 @@ local function count_poe_surfaces(surfaces) return n end -local function count_link_states(surfaces) - local up, down = 0, 0 - for _, surface in pairs(surfaces or {}) do - local state = surface.link and surface.link.state - if state == 'up' then up = up + 1 - elseif state == 'down' then down = down + 1 end - end - return up, down -end - -local function count_table_keys(t) - local n = 0 - for _ in pairs(t or {}) do n = n + 1 end - return n -end - -local function command_result_raw_count(result) - return count_table_keys(result and result.raw or nil) -end - - -local function summarise_command_result(result) - if result and result.ok == true then - return true, command_result_raw_count(result), nil - end - local status = result and result.status or {} - return false, 0, tostring(status.err or (result and result.err) or 'unknown error') -end - -local function run_command_groups_sequential(provider, groups) - local results = {} - local started = runtime.now() - for i, group in ipairs(groups) do - local group_started = runtime.now() - local result = fibers.perform(provider:command_group_op({ group = group })) - local ok, raw_count, err = summarise_command_result(result) - results[i] = { group = group, ok = ok, raw_count = raw_count, err = err, elapsed_s = runtime.now() - group_started } - end - return runtime.now() - started, results -end - -local function run_command_groups_concurrent(scope, provider, groups) - local results = {} - local done = pulse.new() - local seen = 
done:version() - local remaining = #groups - local started = runtime.now() - - if remaining == 0 then return 0, results end - - for i, group in ipairs(groups) do - local ok_spawn, spawn_err = scope:spawn(function () - local group_started = runtime.now() - local result = fibers.perform(provider:command_group_op({ group = group })) - local ok, raw_count, err = summarise_command_result(result) - results[i] = { group = group, ok = ok, raw_count = raw_count, err = err, elapsed_s = runtime.now() - group_started } - remaining = remaining - 1 - if remaining == 0 then done:signal() end - end) - if not ok_spawn then error(spawn_err, 2) end - end - - if remaining > 0 then fibers.perform(done:changed_op(seen)) end - return runtime.now() - started, results -end - -local function count_command_result_failures(results) - local ok_count, fail_count, raw_total, last_err = 0, 0, 0, nil - for _, result in ipairs(results or {}) do - if result.ok == true then - ok_count = ok_count + 1 - raw_total = raw_total + (result.raw_count or 0) - else - fail_count = fail_count + 1 - last_err = result.group .. ': ' .. tostring(result.err or 'failed') - end - end - return ok_count, fail_count, raw_total, last_err -end - local function has_known_vlan_mode(surface) local mode = surface and surface.attachment and surface.attachment.mode return mode == nil or mode == 'hybrid' or mode == 'access' or mode == 'trunk' or mode == 'tunnel' @@ -536,246 +349,5 @@ function T.rtl8380m_real_switch_raw_observations_project_to_state_wired() end -function T.rtl8380m_fixed_switch_admin_stats_within_allotted_time() - local env, err = fixed_stats_env() - if not env then return skip(err) end - - runfibers.run(function () - local b = busmod.new() - local http = start_http_capability(b, env) - wait_http_available(b) - local provider = new_real_switch_provider(b, env) - - -- Warm the authenticated session once. The timing assertion below is for - -- the stats read using the retained session, not for RSA login. 
- local warm = require_successful_snapshot(provider) - assert_eq(warm.status.login, 'confirmed') - - local started = runtime.now() - local stats = fibers.perform(provider:stats_op({})) - local elapsed = runtime.now() - started - assert_not_nil(stats, 'stats_op should return a table') - if stats.ok ~= true then - local status = stats.status or {} - error('switch stats failed: ' .. tostring(status.err or stats.err or 'unknown error'), 2) - end - assert_true(elapsed <= env.stats_budget_s, ('stats elapsed %.3fs exceeded budget %.3fs'):format(elapsed, env.stats_budget_s)) - assert_eq(stats.status.login, 'confirmed') - assert_not_nil(stats.runtime, 'stats should include runtime') - assert_not_nil(stats.runtime.cpu, 'stats should include runtime.cpu') - assert_not_nil(stats.runtime.memory, 'stats should include runtime.memory') - assert_not_nil(stats.raw, 'include_raw=true should preserve stats payloads') - assert_not_nil(stats.raw.sys_cpumem, 'stats raw sys_cpumem should be captured') - assert_not_nil(stats.raw.rmon_statistics, 'stats raw rmon_statistics should be captured') - - provider:terminate('test complete') - http:terminate('test complete') - end, { timeout = env.run_timeout_s }) -end - -function T.rtl8380m_fixed_switch_admin_panel_timing() - local env, err = fixed_panel_env() - if not env then return skip(err) end - - runfibers.run(function () - local b = busmod.new() - local http = start_http_capability(b, env) - wait_http_available(b) - local provider = new_real_switch_provider(b, env) - - -- Warm login using the same cheap panel path. The timed loop below - -- measures retained-session panel reads only; it does not run a full - -- switch snapshot and therefore does not depend on sys_cpumem or RMON. - local warm = fibers.perform(provider:panel_op({})) - assert_not_nil(warm, 'panel_op should return a table') - if warm.ok ~= true then - local status = warm.status or {} - error('switch panel warm-up failed: ' .. 
tostring(status.err or warm.err or 'unknown error'), 2) - end - assert_eq(warm.status.login, 'confirmed') - - local min_s, max_s, total_s = nil, 0, 0 - local last_panel - for _ = 1, env.panel_iterations do - local started = runtime.now() - local panel = fibers.perform(provider:panel_op({})) - local elapsed = runtime.now() - started - assert_not_nil(panel, 'panel_op should return a table') - if panel.ok ~= true then - local status = panel.status or {} - error('switch panel read failed: ' .. tostring(status.err or panel.err or 'unknown error'), 2) - end - last_panel = panel - if min_s == nil or elapsed < min_s then min_s = elapsed end - if elapsed > max_s then max_s = elapsed end - total_s = total_s + elapsed - assert_true(elapsed <= env.panel_budget_s, ('panel read elapsed %.3fs exceeded budget %.3fs'):format(elapsed, env.panel_budget_s)) - end - - local surface_count = count_surfaces_with_prefix(last_panel.surfaces, 'GE') - local up, down = count_link_states(last_panel.surfaces) - assert_true(surface_count >= 10, 'panel read should expose the ten GE switch surfaces') - assert_true((up + down) >= 10, 'panel read should expose link state for switch surfaces') - io.stderr:write(('rtl8380m panel timing: n=%d min=%.3fs avg=%.3fs max=%.3fs ge=%d link_up=%d link_down=%d\n'):format( - env.panel_iterations, - min_s or 0, - total_s / env.panel_iterations, - max_s, - surface_count, - up, - down - )) - - provider:terminate('test complete') - http:terminate('test complete') - end, { timeout = env.run_timeout_s }) -end - - -function T.rtl8380m_fixed_switch_admin_command_timing_sweep() - local env, err = fixed_timing_env() - if not env then return skip(err) end - - runfibers.run(function () - local b = busmod.new() - local http = start_http_capability(b, env) - wait_http_available(b) - local provider = new_real_switch_provider(b, env) - - local warm = fibers.perform(provider:panel_op({})) - assert_not_nil(warm, 'panel warm-up should return a table') - if warm.ok ~= true then - 
local status = warm.status or {} - error('switch panel warm-up failed: ' .. tostring(status.err or warm.err or 'unknown error'), 2) - end - - local any_ok = false - local failures = {} - for _, group in ipairs(env.groups) do - local min_s, max_s, total_s = nil, 0, 0 - local ok_count, fail_count, raw_count = 0, 0, 0 - local last_err - for _ = 1, env.iterations do - local started = runtime.now() - local result = fibers.perform(provider:command_group_op({ group = group })) - local elapsed = runtime.now() - started - if min_s == nil or elapsed < min_s then min_s = elapsed end - if elapsed > max_s then max_s = elapsed end - total_s = total_s + elapsed - if result and result.ok == true then - ok_count = ok_count + 1 - any_ok = true - raw_count = command_result_raw_count(result) - else - fail_count = fail_count + 1 - local status = result and result.status or {} - last_err = tostring(status.err or (result and result.err) or 'unknown error') - end - end - local avg_s = total_s / env.iterations - io.stderr:write(('rtl8380m command timing: group=%s n=%d ok=%d fail=%d min=%.3fs avg=%.3fs max=%.3fs raw_keys=%d%s\n'):format( - group, - env.iterations, - ok_count, - fail_count, - min_s or 0, - avg_s, - max_s, - raw_count, - last_err and (' last_err=' .. last_err) or '' - )) - if env.require_all and fail_count > 0 then failures[#failures + 1] = group .. ': ' .. tostring(last_err or 'failed') end - end - - assert_true(any_ok, 'at least one switch command timing group should succeed') - if env.require_all and #failures > 0 then error('switch timing failures: ' .. 
table.concat(failures, '; '), 2) end - - provider:terminate('test complete') - http:terminate('test complete') - end, { timeout = env.run_timeout_s }) -end - -function T.rtl8380m_fixed_switch_admin_concurrent_command_timing() - local env, err = fixed_concurrent_timing_env() - if not env then return skip(err) end - - runfibers.run(function (scope) - local b = busmod.new() - local http = start_http_capability(b, env) - wait_http_available(b) - local provider = new_real_switch_provider(b, env) - - local warm = fibers.perform(provider:panel_op({})) - assert_not_nil(warm, 'panel warm-up should return a table') - if warm.ok ~= true then - local status = warm.status or {} - error('switch panel warm-up failed: ' .. tostring(status.err or warm.err or 'unknown error'), 2) - end - - local seq_total, conc_total = 0, 0 - local best_seq, best_conc, worst_seq, worst_conc = nil, nil, 0, 0 - local any_ok = false - local failures = {} - local group_list = table.concat(env.groups, ',') - - for i = 1, env.iterations do - local seq_elapsed, seq_results = run_command_groups_sequential(provider, env.groups) - local seq_ok, seq_fail, seq_raw, seq_err = count_command_result_failures(seq_results) - - local conc_elapsed, conc_results = run_command_groups_concurrent(scope, provider, env.groups) - local conc_ok, conc_fail, conc_raw, conc_err = count_command_result_failures(conc_results) - - seq_total = seq_total + seq_elapsed - conc_total = conc_total + conc_elapsed - if best_seq == nil or seq_elapsed < best_seq then best_seq = seq_elapsed end - if best_conc == nil or conc_elapsed < best_conc then best_conc = conc_elapsed end - if seq_elapsed > worst_seq then worst_seq = seq_elapsed end - if conc_elapsed > worst_conc then worst_conc = conc_elapsed end - if seq_ok > 0 or conc_ok > 0 then any_ok = true end - - local speedup = conc_elapsed > 0 and (seq_elapsed / conc_elapsed) or 0 - io.stderr:write(('rtl8380m concurrent timing: iter=%d groups=%s sequential=%.3fs concurrent=%.3fs speedup=%.2fx 
seq_ok=%d seq_fail=%d conc_ok=%d conc_fail=%d raw_seq=%d raw_conc=%d%s%s\n'):format( - i, - group_list, - seq_elapsed, - conc_elapsed, - speedup, - seq_ok, - seq_fail, - conc_ok, - conc_fail, - seq_raw, - conc_raw, - seq_err and (' seq_err=' .. seq_err) or '', - conc_err and (' conc_err=' .. conc_err) or '' - )) - - if env.budget_s and conc_elapsed > env.budget_s then - failures[#failures + 1] = ('iteration %d concurrent %.3fs exceeded budget %.3fs'):format(i, conc_elapsed, env.budget_s) - end - if env.require_all and (seq_fail > 0 or conc_fail > 0) then - failures[#failures + 1] = ('iteration %d failures: seq=%s conc=%s'):format(i, tostring(seq_err), tostring(conc_err)) - end - end - - io.stderr:write(('rtl8380m concurrent timing summary: n=%d groups=%s seq_avg=%.3fs seq_min=%.3fs seq_max=%.3fs conc_avg=%.3fs conc_min=%.3fs conc_max=%.3fs speedup=%.2fx\n'):format( - env.iterations, - group_list, - seq_total / env.iterations, - best_seq or 0, - worst_seq, - conc_total / env.iterations, - best_conc or 0, - worst_conc, - conc_total > 0 and (seq_total / conc_total) or 0 - )) - - assert_true(any_ok, 'at least one sequential or concurrent command group read should succeed') - if #failures > 0 then error('switch concurrent timing failures: ' .. 
table.concat(failures, '; '), 2) end - - provider:terminate('test complete') - http:terminate('test complete') - end, { timeout = env.run_timeout_s }) -end return T From c6e0028ffc87684c82244fdf5cecbfa33df4ce58 Mon Sep 17 00:00:00 2001 From: Rich Thanki Date: Thu, 18 Jun 2026 00:06:27 +0000 Subject: [PATCH 4/6] adopts stronger ordering and removes the poll_interval_s compatibility path --- src/configs/bigbox-v1-cm-2.json | 8 ++ src/services/hal/managers/wired.lua | 134 +++++++++++++++---------- tests/unit/hal/wired_provider_spec.lua | 27 +++-- 3 files changed, 108 insertions(+), 61 deletions(-) diff --git a/src/configs/bigbox-v1-cm-2.json b/src/configs/bigbox-v1-cm-2.json index 23d7dfa5..53e1a591 100644 --- a/src/configs/bigbox-v1-cm-2.json +++ b/src/configs/bigbox-v1-cm-2.json @@ -31,6 +31,14 @@ "cm5-local-wired": { "provider": "static", "mode": "read_only", + "poll": { + "static": { + "interval_s": 30.0, + "groups": [ + "snapshot" + ] + } + }, "surfaces": { "eth0": { "provider_surface_id": "eth0", diff --git a/src/services/hal/managers/wired.lua b/src/services/hal/managers/wired.lua index 4ec6ab8d..cfacfa4f 100644 --- a/src/services/hal/managers/wired.lua +++ b/src/services/hal/managers/wired.lua @@ -146,13 +146,6 @@ local function emit_snapshot_now(provider_id, snapshot) return true, nil end -local function provider_poll_interval_s(config) - local n = tonumber(config and config.poll_interval_s) - if n == nil then return 1.0 end - if n <= 0 then return nil, 'poll_interval_s must be a positive number' end - return n, nil -end - local function normalise_groups(groups, path) if type(groups) ~= 'table' then return nil, path .. 
'.groups must be a non-empty array' end local out = {} @@ -167,27 +160,22 @@ end local function provider_poll_plan(config) config = config or {} - if config.poll ~= nil then - if config.poll_interval_s ~= nil then return nil, 'use poll, not poll_interval_s, for grouped wired polling' end - if type(config.poll) ~= 'table' then return nil, 'poll must be a table' end - local out = {} - for _, name in ipairs(sorted_keys(config.poll)) do - local rec = config.poll[name] - local path = 'poll.' .. tostring(name) - if type(rec) ~= 'table' then return nil, path .. ' must be a table' end - local interval_s = tonumber(rec.interval_s) - if interval_s == nil or interval_s <= 0 then return nil, path .. '.interval_s must be a positive number' end - local groups, gerr = normalise_groups(rec.groups, path) - if not groups then return nil, gerr end - out[#out + 1] = { name = tostring(name), interval_s = interval_s, groups = groups } - end - if #out == 0 then return nil, 'poll must contain at least one poll group' end - return out, nil + if config.poll_interval_s ~= nil then return nil, 'use poll, not poll_interval_s, for grouped wired polling' end + if config.poll == nil then return nil, 'poll is required' end + if type(config.poll) ~= 'table' then return nil, 'poll must be a table' end + local out = {} + for _, name in ipairs(sorted_keys(config.poll)) do + local rec = config.poll[name] + local path = 'poll.' .. tostring(name) + if type(rec) ~= 'table' then return nil, path .. ' must be a table' end + local interval_s = tonumber(rec.interval_s) + if interval_s == nil or interval_s <= 0 then return nil, path .. 
'.interval_s must be a positive number' end + local groups, gerr = normalise_groups(rec.groups, path) + if not groups then return nil, gerr end + out[#out + 1] = { name = tostring(name), interval_s = interval_s, groups = groups } end - - local interval_s, err = provider_poll_interval_s(config) - if not interval_s then return nil, err end - return { { name = 'snapshot', interval_s = interval_s, method = 'snapshot' } }, nil + if #out == 0 then return nil, 'poll must contain at least one poll group' end + return out, nil end local function perform_driver_method(driver, method, opts) @@ -237,7 +225,7 @@ local function failure_status_for_plan(plan, result) local unavailable = false if plan and plan.groups then for _, group in ipairs(plan.groups) do - if group == 'panel' then unavailable = true end + if group == 'panel' or group == 'snapshot' then unavailable = true end end else unavailable = true @@ -260,7 +248,12 @@ local function perform_poll_plan(provider_id, driver, plan) for _, group in ipairs(plan.groups or {}) do if not poller_is_current(provider_id, driver) then return true, nil end - local result = perform_driver_method(driver, 'group_observation', { group = group }) + local result + if group == 'snapshot' then + result = perform_driver_method(driver, 'snapshot', {}) + else + result = perform_driver_method(driver, 'group_observation', { group = group }) + end if not poller_is_current(provider_id, driver) then return true, nil end if result and result.ok == true then local ok, err = publish_observation(provider_id, result) @@ -480,6 +473,46 @@ function M.start_op(logger, dev_ev_ch, cap_emit_ch, opts) end) end + +local function terminate_prepared(prepared, reason) + for _, rec in pairs(prepared or {}) do + local driver = rec and rec.driver + if driver and type(driver.terminate) == 'function' then driver:terminate(reason or 'discarded') end + end +end + +local function prepare_providers(config, provider_ids) + local prepared = {} + for i = 1, #provider_ids do + 
local id = provider_ids[i] + local pcfg = configured_provider(config or {}, id) + if not pcfg then + terminate_prepared(prepared, 'prepare failed') + return nil, ('wired provider %s missing configuration'):format(id) + end + + local driver_config = {} + for k, v in pairs(pcfg) do driver_config[k] = v end + local poll_plan, poll_err = provider_poll_plan(driver_config) + if not poll_plan then + terminate_prepared(prepared, 'prepare failed') + return nil, ('wired provider %s poll config failed: %s'):format(id, tostring(poll_err)) + end + driver_config.poll = nil + + local driver_opts = { logger = state.logger, cap_emit_ch = state.cap_emit_ch, provider_id = id } + if driver_config.provider == 'rtl8380m_http' then driver_opts.http_client_for = state.http_client_for end + local driver, err = driver_mod.new(driver_config, driver_opts) + if not driver then + terminate_prepared(prepared, 'prepare failed') + return nil, ('wired provider %s create failed: %s'):format(id, tostring(err)) + end + driver.provider = driver_config.provider + prepared[id] = { driver = driver, poll_plan = poll_plan } + end + return prepared, nil +end + function M.apply_config_op(config) return op.guard(function () if not state.started then return op.always(false, 'wired manager not started') end @@ -487,36 +520,32 @@ function M.apply_config_op(config) local provider_ids, perr = normalise_provider_ids(config or {}) if not provider_ids then return false, perr end + local prepared, prep_err = prepare_providers(config or {}, provider_ids) + if not prepared then return false, prep_err end + cancel_pollers('reconfigured') stop_drivers('reconfigured') state.observations = {} local ok, cerr, caps_ready_cond = reconcile_device_caps(provider_ids) - if ok ~= true then return false, cerr end + if ok ~= true then + terminate_prepared(prepared, 'capability reconcile failed') + return false, cerr + end + + state.drivers = {} + for i = 1, #provider_ids do + local id = provider_ids[i] + state.drivers[id] = 
prepared[id].driver + end for i = 1, #provider_ids do local id = provider_ids[i] - local pcfg = configured_provider(config or {}, id) - if not pcfg then - return false, ('wired provider %s missing configuration'):format(id) - else - local driver_config = {} - for k, v in pairs(pcfg) do driver_config[k] = v end - local driver_opts = { logger = state.logger, cap_emit_ch = state.cap_emit_ch, provider_id = id } - if driver_config.provider == 'rtl8380m_http' then driver_opts.http_client_for = state.http_client_for end - local poll_plan, poll_err = provider_poll_plan(driver_config) - if not poll_plan then return false, poll_err end - driver_config.poll = nil - driver_config.poll_interval_s = nil - local driver, err = driver_mod.new(driver_config, driver_opts) - if not driver then return false, ('wired provider %s create failed: %s'):format(id, tostring(err)) end - driver.provider = driver_config.provider - state.drivers[id] = driver - local spawned, spawn_err = spawn_provider_poller(id, driver, poll_plan, caps_ready_cond) - if spawned ~= true then - state.drivers[id] = nil - if driver and type(driver.terminate) == 'function' then driver:terminate('poller spawn failed') end - return false, ('wired provider %s poller failed: %s'):format(id, tostring(spawn_err)) - end + local rec = prepared[id] + local spawned, spawn_err = spawn_provider_poller(id, rec.driver, rec.poll_plan, caps_ready_cond) + if spawned ~= true then + cancel_pollers('poller spawn failed') + stop_drivers('poller spawn failed') + return false, ('wired provider %s poller failed: %s'):format(id, tostring(spawn_err)) end end log('info', { what = 'wired_manager_configured', providers = provider_ids }) @@ -558,7 +587,6 @@ end M._test = { normalise_provider_ids = normalise_provider_ids, - provider_poll_interval_s = provider_poll_interval_s, provider_poll_plan = provider_poll_plan, } diff --git a/tests/unit/hal/wired_provider_spec.lua b/tests/unit/hal/wired_provider_spec.lua index 8bf37a7d..58f50225 100644 --- 
a/tests/unit/hal/wired_provider_spec.lua +++ b/tests/unit/hal/wired_provider_spec.lua @@ -214,15 +214,26 @@ function tests.test_rtl8380m_http_accepts_narrow_http_client_factory() end -function tests.test_wired_manager_poll_interval_is_positive_and_defaults_to_one_second() +function tests.test_wired_manager_requires_canonical_poll_table() local manager = require 'services.hal.managers.wired' - local n, err = manager._test.provider_poll_interval_s({}) - assert_eq(n, 1.0, err) - n, err = manager._test.provider_poll_interval_s({ poll_interval_s = 0.5 }) - assert_eq(n, 0.5, err) - n, err = manager._test.provider_poll_interval_s({ poll_interval_s = 0 }) - assert_eq(n, nil) - assert_true(type(err) == 'string' and err:find('positive', 1, true) ~= nil, tostring(err)) + local plan, err = manager._test.provider_poll_plan({}) + assert_eq(plan, nil) + assert_true(type(err) == 'string' and err:find('poll is required', 1, true) ~= nil, tostring(err)) + + plan, err = manager._test.provider_poll_plan({ poll_interval_s = 0.5 }) + assert_eq(plan, nil) + assert_true(type(err) == 'string' and err:find('poll_interval_s', 1, true) ~= nil, tostring(err)) + + plan, err = manager._test.provider_poll_plan({ + poll = { + static = { interval_s = 30.0, groups = { 'snapshot' } }, + }, + }) + assert_not_nil(plan, err) + assert_eq(#plan, 1) + assert_eq(plan[1].name, 'static') + assert_eq(plan[1].interval_s, 30.0) + assert_eq(plan[1].groups[1], 'snapshot') end return tests From b3a5020cf1ae8fffdb0aedf3c7a22ad5a2a2ffa2 Mon Sep 17 00:00:00 2001 From: Rich Thanki Date: Thu, 18 Jun 2026 00:23:38 +0000 Subject: [PATCH 5/6] better obs --- docs/switch.md | 2 + src/services/hal/managers/wired.lua | 93 ++++++++++++++++++++++++----- 2 files changed, 80 insertions(+), 15 deletions(-) diff --git a/docs/switch.md b/docs/switch.md index e1db1de6..331e0efb 100644 --- a/docs/switch.md +++ b/docs/switch.md @@ -238,6 +238,8 @@ slow, 30 s: identity, runtime Each poll loop is non-overlapping. 
A slow runtime read therefore cannot queue behind, block, or mark the fast link-state path unavailable. Successful groups merge into the retained raw observation cache, so `state/surfaces` carries last-known link, PoE, counter and VLAN facts together. Group failures update provider status but leave the last good identity/runtime/power/surfaces/topology retained facts in place. +The HAL wired manager emits raw provider facts on a changed-retained basis. A successful `panel` group can update `state/surfaces` without re-emitting unchanged identity/runtime/power/topology facts, and repeated identical provider statuses are suppressed. This keeps switch visibility in the provider status and semantic `state/wired/...` surfaces rather than turning the monitor into a per-request trace. + Canonical observation names are deliberately strict. CPU and memory are published as `runtime.cpu` and `runtime.memory`; PoE device-level power and temperature are published as `power.poe`; port counters are published under each surface as `counters`. The switch path must not publish `telemetry.cpu`, `telemetry.mem`, `telemetry.poe`, or any compatibility topic for `state/telemetry`. ## Snapshot shape diff --git a/src/services/hal/managers/wired.lua b/src/services/hal/managers/wired.lua index cfacfa4f..cb5e845a 100644 --- a/src/services/hal/managers/wired.lua +++ b/src/services/hal/managers/wired.lua @@ -32,6 +32,7 @@ local state = { provider_ids = {}, pollers = {}, observations = {}, + emitted = {}, device_registered = false, } @@ -63,6 +64,37 @@ end local function copy(v) return tablex.deep_copy(v) end +local function stable_signature(v) + local tv = type(v) + if tv == 'nil' or tv == 'boolean' or tv == 'number' or tv == 'string' then + return tv .. ':' .. tostring(v) + end + if tv ~= 'table' then return tv .. ':' .. 
tostring(v) end + local keys = {} + for k in pairs(v) do keys[#keys + 1] = k end + table.sort(keys, function(a, b) return tostring(a) < tostring(b) end) + local out = { 'table{' } + for i = 1, #keys do + local k = keys[i] + out[#out + 1] = stable_signature(k) + out[#out + 1] = '=' + out[#out + 1] = stable_signature(v[k]) + out[#out + 1] = ';' + end + out[#out + 1] = '}' + return table.concat(out) +end + +local function emitted_cache(provider_id) + state.emitted = state.emitted or {} + local rec = state.emitted[provider_id] + if rec == nil then + rec = {} + state.emitted[provider_id] = rec + end + return rec +end + local function merge_table(dst, src) dst = dst or {} if type(src) ~= 'table' then return dst end @@ -124,25 +156,53 @@ local function emit_state(class, id, key, payload) return state.cap_emit_ch:put_op(ev):wrap(function () return true, nil end) end +local function emit_state_changed(provider_id, key, payload) + local cache = emitted_cache(provider_id) + local sig = stable_signature(payload or {}) + if cache[key] == sig then return true, nil, false end + cache[key] = sig + local ok, err = fibers.perform(emit_state('wired-provider', provider_id, key, payload or {})) + if ok == false or ok == nil then + cache[key] = nil + return nil, err, false + end + return true, nil, true +end + local function emit_status_now(provider_id, status) - local ok, err = fibers.perform(emit_state('wired-provider', provider_id, 'status', status or { state = 'available', available = true })) + local ok, err = emit_state_changed(provider_id, 'status', status or { state = 'available', available = true }) if ok == false or ok == nil then return nil, err end return true, nil end -local function emit_snapshot_now(provider_id, snapshot) - local ok, err = emit_status_now(provider_id, snapshot.status or { state = 'available', available = snapshot.ok == true }) - if ok ~= true then return nil, err end - ok, err = fibers.perform(emit_state('wired-provider', provider_id, 'identity', 
snapshot.identity or {})) - if ok == false or ok == nil then return nil, err end - ok, err = fibers.perform(emit_state('wired-provider', provider_id, 'runtime', snapshot.runtime or {})) - if ok == false or ok == nil then return nil, err end - ok, err = fibers.perform(emit_state('wired-provider', provider_id, 'power', snapshot.power or {})) - if ok == false or ok == nil then return nil, err end - ok, err = fibers.perform(emit_state('wired-provider', provider_id, 'surfaces', { surfaces = snapshot.surfaces or {} })) - if ok == false or ok == nil then return nil, err end - ok, err = fibers.perform(emit_state('wired-provider', provider_id, 'topology', snapshot.topology or {})) - if ok == false or ok == nil then return nil, err end +local function emit_snapshot_now(provider_id, snapshot, present) + snapshot = snapshot or {} + present = present or snapshot + local ok, err + if present.status ~= nil then + ok, err = emit_status_now(provider_id, snapshot.status or { state = 'available', available = snapshot.ok == true }) + if ok ~= true then return nil, err end + end + if present.identity ~= nil then + ok, err = emit_state_changed(provider_id, 'identity', snapshot.identity or {}) + if ok == false or ok == nil then return nil, err end + end + if present.runtime ~= nil then + ok, err = emit_state_changed(provider_id, 'runtime', snapshot.runtime or {}) + if ok == false or ok == nil then return nil, err end + end + if present.power ~= nil then + ok, err = emit_state_changed(provider_id, 'power', snapshot.power or {}) + if ok == false or ok == nil then return nil, err end + end + if present.surfaces ~= nil then + ok, err = emit_state_changed(provider_id, 'surfaces', { surfaces = snapshot.surfaces or {} }) + if ok == false or ok == nil then return nil, err end + end + if present.topology ~= nil then + ok, err = emit_state_changed(provider_id, 'topology', snapshot.topology or {}) + if ok == false or ok == nil then return nil, err end + end return true, nil end @@ -217,7 +277,7 @@ 
end local function publish_observation(provider_id, snapshot) local cache = merge_observation(provider_id, snapshot) - return emit_snapshot_now(provider_id, cache) + return emit_snapshot_now(provider_id, cache, snapshot or {}) end local function failure_status_for_plan(plan, result) @@ -464,6 +524,7 @@ function M.start_op(logger, dev_ev_ch, cap_emit_ch, opts) state.provider_ids = {} state.pollers = {} state.observations = {} + state.emitted = {} state.device_registered = false child:finally(function (_, status, primary) M.terminate(primary or status or 'wired manager closed') end) @@ -526,6 +587,7 @@ function M.apply_config_op(config) cancel_pollers('reconfigured') stop_drivers('reconfigured') state.observations = {} + state.emitted = {} local ok, cerr, caps_ready_cond = reconcile_device_caps(provider_ids) if ok ~= true then terminate_prepared(prepared, 'capability reconcile failed') @@ -571,6 +633,7 @@ function M.terminate(reason) state.provider_ids = {} state.pollers = {} state.observations = {} + state.emitted = {} state.device_registered = false if state.scope then local scope = state.scope; state.scope = nil; scope:cancel(reason or 'terminated') end state.started = false From cc6bbb70cadec488aae62fd9032299928d1200cd Mon Sep 17 00:00:00 2001 From: Rich Thanki Date: Fri, 19 Jun 2026 01:04:31 +0000 Subject: [PATCH 6/6] significant cleanup, structural serialisation and testing --- docs/switch.md | 8 +- src/services/hal/backends/wired/provider.lua | 4 +- .../wired/providers/rtl8380m_http.lua | 103 ++++- .../hal/backends/wired/providers/static.lua | 13 + src/services/hal/drivers/wired.lua | 47 +- src/services/hal/managers/wired.lua | 423 ++++-------------- .../hal/managers/wired/provider_runner.lua | 359 +++++++++++++++ .../devhost/rtl8380m_switch_spec.lua | 177 ++++++++ tests/unit/hal/wired_provider_spec.lua | 50 +++ 9 files changed, 829 insertions(+), 355 deletions(-) create mode 100644 src/services/hal/managers/wired/provider_runner.lua diff --git 
a/docs/switch.md b/docs/switch.md index 331e0efb..d4a34a93 100644 --- a/docs/switch.md +++ b/docs/switch.md @@ -228,7 +228,7 @@ raw/host/wired/provider/switch-main/state/topology If `include_raw = true` is set in a test, the snapshot also keeps the source command payloads for parser debugging. Full raw CGI bodies should not be promoted to public retained state by default. -The HAL wired manager owns scheduling. For the RTL8380M provider, manager apply admits the provider and starts owned poller work; switch observation is not part of configuration admission. The Big Box poll plan is grouped and sequential: +The HAL wired manager owns scheduling. For the RTL8380M provider, manager apply admits the provider and starts one owned provider runner; switch observation is not part of configuration admission. The Big Box poll plan is grouped: ```text fast, 1 Hz: panel, poe, counters @@ -236,7 +236,11 @@ medium, 5 s: vlan, lldp slow, 30 s: identity, runtime ``` -Each poll loop is non-overlapping. A slow runtime read therefore cannot queue behind, block, or mark the fast link-state path unavailable. Successful groups merge into the retained raw observation cache, so `state/surfaces` carries last-known link, PoE, counter and VLAN facts together. Group failures update provider status but leave the last good identity/runtime/power/surfaces/topology retained facts in place. +There is one runner per provider, not one fibre per poll group. The runner lives in `services/hal/managers/wired/provider_runner.lua` and owns the backend object, request mailbox, switch session, observation cache and due-time schedule. Capability snapshot/control requests are sent to the runner mailbox, so the RTL8380M backend is touched only by the runner fibre. This gives serialisation by ownership rather than a lock or semaphore. 
+ +Each runner cycle coalesces all due poll groups, calls the mandatory backend `observe_groups_op` once, and lets the backend de-duplicate shared CGI commands such as `home_main`. A saturated cycle schedules the next attempt from the finish time rather than trying to catch up, and applies a short minimum idle interval before another due cycle. Slow runtime reads can therefore degrade runtime status without creating overlapping switch sessions or a busy catch-up loop. + +Successful groups merge into the retained raw observation cache, so `state/surfaces` carries last-known link, PoE, counter and VLAN facts together. Group failures update provider status but leave the last good identity/runtime/power/surfaces/topology retained facts in place. The HAL wired manager emits raw provider facts on a changed-retained basis. A successful `panel` group can update `state/surfaces` without re-emitting unchanged identity/runtime/power/topology facts, and repeated identical provider statuses are suppressed. This keeps switch visibility in the provider status and semantic `state/wired/...` surfaces rather than turning the monitor into a per-request trace. 
diff --git a/src/services/hal/backends/wired/provider.lua b/src/services/hal/backends/wired/provider.lua index bf9b9a39..4990f469 100644 --- a/src/services/hal/backends/wired/provider.lua +++ b/src/services/hal/backends/wired/provider.lua @@ -17,7 +17,9 @@ function M.new(config, opts) local ok, mod = pcall(require, modname) if not ok then return nil, ('wired provider %s not available: %s'):format(name, tostring(mod)) end if type(mod) ~= 'table' or type(mod.new) ~= 'function' then return nil, 'wired provider module must export new(config, opts)' end - return mod.new(config, opts or {}) + local backend, err = mod.new(config, opts or {}) + if not backend then return nil, err end + return backend, nil, name end return M diff --git a/src/services/hal/backends/wired/providers/rtl8380m_http.lua b/src/services/hal/backends/wired/providers/rtl8380m_http.lua index 4031b6a3..cef1b53c 100644 --- a/src/services/hal/backends/wired/providers/rtl8380m_http.lua +++ b/src/services/hal/backends/wired/providers/rtl8380m_http.lua @@ -70,6 +70,39 @@ local VLAN_MEMBERSHIP = { local function copy(v) return tablex.deep_copy(v) end +local function merge_table(dst, src) + dst = dst or {} + if type(src) ~= 'table' then return dst end + for k, v in pairs(src) do + if v ~= nil then + if type(v) == 'table' and type(dst[k]) == 'table' then + merge_table(dst[k], v) + else + dst[k] = copy(v) + end + end + end + return dst +end + +local function append_unique(out, seen, value) + value = tostring(value or '') + if value ~= '' and not seen[value] then + seen[value] = true + out[#out + 1] = value + end +end + +local function commands_for_groups(groups) + local out, seen = {}, {} + for _, group in ipairs(groups or {}) do + local commands = COMMAND_GROUPS[tostring(group or '')] + if not commands then return nil, 'unknown command group: ' .. 
tostring(group) end + for _, cmd in ipairs(commands) do append_unique(out, seen, cmd) end + end + return out, nil +end + local function trim(s) return (tostring(s or ''):gsub('^%s+', ''):gsub('%s+$', '')) end @@ -684,6 +717,30 @@ local function build_group_observation(self, group, data) return out end +local function build_groups_observation(self, groups, data) + local out = { + ok = true, + provider_id = self.id, + groups = copy(groups or {}), + status = base_status(self), + } + for _, group in ipairs(groups or {}) do + local partial = build_group_observation(self, group, data) + if not partial or partial.ok ~= true then return partial end + for _, key in ipairs({ 'identity', 'runtime', 'power', 'topology' }) do + if type(partial[key]) == 'table' then out[key] = merge_table(out[key] or {}, partial[key]) end + end + if type(partial.surfaces) == 'table' then + out.surfaces = out.surfaces or {} + for surface_id, surface in pairs(partial.surfaces) do + out.surfaces[surface_id] = merge_table(out.surfaces[surface_id] or {}, surface) + end + end + end + if self.include_raw then out.raw = data end + return out +end + local function require_http_config(config) local http = config and config.http or nil if type(http) ~= 'table' then return nil, 'http table is required' end @@ -828,6 +885,37 @@ function Provider:fetch_command_group(group_name) } end +function Provider:fetch_command_groups(groups) + groups = groups or {} + local commands, cerr = commands_for_groups(groups) + if not commands then + return { ok = false, provider_id = self.id, groups = copy(groups), status = { state = 'unavailable', available = false, driver = DRIVER, err = cerr } } + end + + local ok_login, lerr = login(self) + if not ok_login then + return { ok = false, provider_id = self.id, groups = copy(groups), commands = commands, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = lerr or 'login failed' } } + end + + local data, err, code = read_commands(self, 
commands) + if code == 'auth_invalid' then + reset_session(self) + local ok_relogin, relogin_err = login(self, { force = true }) + if not ok_relogin then + return { ok = false, provider_id = self.id, groups = copy(groups), commands = commands, status = { state = 'unavailable', available = false, driver = DRIVER, login = 'failed', err = relogin_err or 're-login failed' } } + end + data, err, code = read_commands(self, commands) + end + + if not data then + return { ok = false, provider_id = self.id, groups = copy(groups), commands = commands, status = { state = 'unavailable', available = false, driver = DRIVER, login = self.logged_in and 'confirmed' or 'failed', err = err or 'switch command groups failed' } } + end + + local out = build_groups_observation(self, groups, data) + if out and out.ok == true then out.commands = commands end + return out +end + function Provider:fetch_snapshot_op(_req) return op.guard(function () return fibers.run_scope_op(function () @@ -851,20 +939,21 @@ end function Provider:snapshot_op(req) return self:fetch_snapshot_op(req) end function Provider:watch_op(req) return self:fetch_snapshot_op(req) end -function Provider:group_observation_op(req) +function Provider:observe_groups_op(req) req = req or {} + local groups = req.groups or {} + if type(groups) ~= 'table' then + return op.always({ ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, driver = DRIVER, err = 'groups must be an array' } }) + end return op.guard(function () return fibers.run_scope_op(function () - local group = tostring(req.group or '') - local result = self:fetch_command_group(group) - if result.ok ~= true then return result end - return build_group_observation(self, group, result.raw or {}) + return self:fetch_command_groups(groups) end):wrap(function (status, _report, result_or_primary, err) if status == 'ok' then return result_or_primary, err end return { ok = false, provider_id = self.id, - group = req.group, + groups = 
copy(groups), status = { state = 'unavailable', available = false, @@ -892,6 +981,8 @@ M._test = { parse_power = parse_power, parse_surface_counters = parse_surface_counters, build_group_observation = function(provider_like, group, data) return build_group_observation(provider_like or { id = 'switch-main', mode = 'read_only' }, group, data or {}) end, + build_groups_observation = function(provider_like, groups, data) return build_groups_observation(provider_like or { id = 'switch-main', mode = 'read_only' }, groups or {}, data or {}) end, + commands_for_groups = commands_for_groups, auth_invalid_body = auth_invalid_body, COMMAND_GROUPS = COMMAND_GROUPS, } diff --git a/src/services/hal/backends/wired/providers/static.lua b/src/services/hal/backends/wired/providers/static.lua index ad1a47d1..975e1682 100644 --- a/src/services/hal/backends/wired/providers/static.lua +++ b/src/services/hal/backends/wired/providers/static.lua @@ -56,6 +56,19 @@ function Provider:snapshot_op(_req) end function Provider:watch_op(req) return self:snapshot_op(req) end + +function Provider:observe_groups_op(req) + req = req or {} + local groups = req.groups or {} + if type(groups) ~= 'table' then return op.always({ ok = false, provider_id = self.id, status = { state = 'unavailable', available = false, err = 'groups must be an array' } }) end + for i = 1, #groups do + if groups[i] ~= 'snapshot' then + return op.always({ ok = false, provider_id = self.id, group = groups[i], status = { state = 'unavailable', available = false, err = 'unsupported static poll group: ' .. 
tostring(groups[i]) } }) + end + end + return self:snapshot_op(req) +end + function Provider:apply_attachments_op(_req) return op.always(contract.read_only('apply_attachments')) end function Provider:set_poe_op(_req) return op.always(contract.read_only('set_poe')) end function Provider:bounce_op(_req) return op.always(contract.read_only('bounce')) end diff --git a/src/services/hal/drivers/wired.lua b/src/services/hal/drivers/wired.lua index 41c5e4df..4e2b3faf 100644 --- a/src/services/hal/drivers/wired.lua +++ b/src/services/hal/drivers/wired.lua @@ -7,20 +7,49 @@ local M = {} local Driver = {} Driver.__index = Driver +local REQUIRED_BACKEND_OPS = { + 'snapshot_op', + 'watch_op', + 'observe_groups_op', + 'apply_attachments_op', + 'set_poe_op', + 'bounce_op', +} + +local function validate_backend(backend, provider_name) + for _, opname in ipairs(REQUIRED_BACKEND_OPS) do + if type(backend[opname]) ~= 'function' then + return nil, ('wired provider %s missing required %s'):format(tostring(provider_name), opname) + end + end + return true, nil +end + function M.new(config, opts) - local provider, err = provider_loader.new(config or {}, opts or {}) - if not provider then return nil, err end - return setmetatable({ provider = provider }, Driver), nil + opts = opts or {} + local backend, err, provider_name = provider_loader.new(config or {}, opts) + if not backend then return nil, err end + local ok, verr = validate_backend(backend, provider_name) + if not ok then + if type(backend.terminate) == 'function' then backend:terminate('invalid backend') end + return nil, verr + end + return setmetatable({ + backend = backend, + provider_name = provider_name, + provider_id = opts.provider_id, + }, Driver), nil end -function Driver:snapshot_op(req) return self.provider:snapshot_op(req) end -function Driver:watch_op(req) return self.provider:watch_op(req) end -function Driver:apply_attachments_op(req) return self.provider:apply_attachments_op(req) end -function 
Driver:set_poe_op(req) return self.provider:set_poe_op(req) end -function Driver:bounce_op(req) return self.provider:bounce_op(req) end +function Driver:snapshot_op(req) return self.backend:snapshot_op(req) end +function Driver:watch_op(req) return self.backend:watch_op(req) end +function Driver:observe_groups_op(req) return self.backend:observe_groups_op(req) end +function Driver:apply_attachments_op(req) return self.backend:apply_attachments_op(req) end +function Driver:set_poe_op(req) return self.backend:set_poe_op(req) end +function Driver:bounce_op(req) return self.backend:bounce_op(req) end function Driver:terminate(reason) - if self.provider and type(self.provider.terminate) == 'function' then return self.provider:terminate(reason) end + if self.backend and type(self.backend.terminate) == 'function' then return self.backend:terminate(reason) end return true, nil end diff --git a/src/services/hal/managers/wired.lua b/src/services/hal/managers/wired.lua index cb5e845a..7dd6d12c 100644 --- a/src/services/hal/managers/wired.lua +++ b/src/services/hal/managers/wired.lua @@ -6,17 +6,16 @@ -- Wired service combines them with Device assembly into public state/wired/... surfaces. 
local fibers = require 'fibers' +local safe = require 'coxpcall' local op = require 'fibers.op' local channel = require 'fibers.channel' local cond = require 'fibers.cond' -local sleep = require 'fibers.sleep' -local runtime = require 'fibers.runtime' -local tablex = require 'shared.table' local strict = require 'services.hal.support.strict_manager' local hal_types = require 'services.hal.types.core' local cap_types = require 'services.hal.types.capabilities' -local driver_mod = require 'services.hal.drivers.wired' +local backend_mod = require 'services.hal.drivers.wired' +local provider_runner = require 'services.hal.managers.wired.provider_runner' local M = strict.api_table() @@ -27,12 +26,9 @@ local state = { dev_ev_ch = nil, cap_emit_ch = nil, http_client_for = nil, - drivers = {}, + runners = {}, -- provider_id -> provider runner handle controls = {}, provider_ids = {}, - pollers = {}, - observations = {}, - emitted = {}, device_registered = false, } @@ -62,150 +58,10 @@ local function shallow_copy(t) return out end -local function copy(v) return tablex.deep_copy(v) end - -local function stable_signature(v) - local tv = type(v) - if tv == 'nil' or tv == 'boolean' or tv == 'number' or tv == 'string' then - return tv .. ':' .. tostring(v) - end - if tv ~= 'table' then return tv .. ':' .. 
tostring(v) end - local keys = {} - for k in pairs(v) do keys[#keys + 1] = k end - table.sort(keys, function(a, b) return tostring(a) < tostring(b) end) - local out = { 'table{' } - for i = 1, #keys do - local k = keys[i] - out[#out + 1] = stable_signature(k) - out[#out + 1] = '=' - out[#out + 1] = stable_signature(v[k]) - out[#out + 1] = ';' - end - out[#out + 1] = '}' - return table.concat(out) -end - -local function emitted_cache(provider_id) - state.emitted = state.emitted or {} - local rec = state.emitted[provider_id] - if rec == nil then - rec = {} - state.emitted[provider_id] = rec - end - return rec -end - -local function merge_table(dst, src) - dst = dst or {} - if type(src) ~= 'table' then return dst end - for k, v in pairs(src) do - if v ~= nil then - if type(v) == 'table' and type(dst[k]) == 'table' then - merge_table(dst[k], v) - else - dst[k] = copy(v) - end - end - end - return dst -end - -local function observation_cache(provider_id) - state.observations = state.observations or {} - local cache = state.observations[provider_id] - if cache == nil then - cache = { - status = {}, - identity = {}, - runtime = {}, - power = {}, - surfaces = {}, - topology = {}, - } - state.observations[provider_id] = cache - end - return cache -end - -local function merge_observation(provider_id, snapshot) - local cache = observation_cache(provider_id) - snapshot = snapshot or {} - if type(snapshot.status) == 'table' then merge_table(cache.status, snapshot.status) end - for _, key in ipairs({ 'identity', 'runtime', 'power', 'topology' }) do - if type(snapshot[key]) == 'table' then merge_table(cache[key], snapshot[key]) end - end - if type(snapshot.surfaces) == 'table' then - for surface_id, surface in pairs(snapshot.surfaces) do - local id = tostring(surface_id or '') - if id ~= '' and type(surface) == 'table' then - cache.surfaces[id] = merge_table(cache.surfaces[id] or {}, surface) - end - end - end - return cache -end - -local function max(a, b) if a > b then return a 
end return b end - local function list_signature(list) return table.concat(list or {}, '\0') end -local function emit_state(class, id, key, payload) - local ev = assert(hal_types.new.Emit(class, id, 'state', key, payload)) - return state.cap_emit_ch:put_op(ev):wrap(function () return true, nil end) -end - -local function emit_state_changed(provider_id, key, payload) - local cache = emitted_cache(provider_id) - local sig = stable_signature(payload or {}) - if cache[key] == sig then return true, nil, false end - cache[key] = sig - local ok, err = fibers.perform(emit_state('wired-provider', provider_id, key, payload or {})) - if ok == false or ok == nil then - cache[key] = nil - return nil, err, false - end - return true, nil, true -end - -local function emit_status_now(provider_id, status) - local ok, err = emit_state_changed(provider_id, 'status', status or { state = 'available', available = true }) - if ok == false or ok == nil then return nil, err end - return true, nil -end - -local function emit_snapshot_now(provider_id, snapshot, present) - snapshot = snapshot or {} - present = present or snapshot - local ok, err - if present.status ~= nil then - ok, err = emit_status_now(provider_id, snapshot.status or { state = 'available', available = snapshot.ok == true }) - if ok ~= true then return nil, err end - end - if present.identity ~= nil then - ok, err = emit_state_changed(provider_id, 'identity', snapshot.identity or {}) - if ok == false or ok == nil then return nil, err end - end - if present.runtime ~= nil then - ok, err = emit_state_changed(provider_id, 'runtime', snapshot.runtime or {}) - if ok == false or ok == nil then return nil, err end - end - if present.power ~= nil then - ok, err = emit_state_changed(provider_id, 'power', snapshot.power or {}) - if ok == false or ok == nil then return nil, err end - end - if present.surfaces ~= nil then - ok, err = emit_state_changed(provider_id, 'surfaces', { surfaces = snapshot.surfaces or {} }) - if ok == false or 
ok == nil then return nil, err end - end - if present.topology ~= nil then - ok, err = emit_state_changed(provider_id, 'topology', snapshot.topology or {}) - if ok == false or ok == nil then return nil, err end - end - return true, nil -end - local function normalise_groups(groups, path) if type(groups) ~= 'table' then return nil, path .. '.groups must be a non-empty array' end local out = {} @@ -238,154 +94,30 @@ local function provider_poll_plan(config) return out, nil end -local function perform_driver_method(driver, method, opts) - local opname = tostring(method) .. '_op' - local fn = driver and driver[opname] - if type(fn) ~= 'function' then return { ok = false, err = 'wired driver missing ' .. opname } end - local ok, driver_op = pcall(function () return fn(driver, opts or {}) end) - if not ok then return { ok = false, err = tostring(driver_op) } end - if type(driver_op) ~= 'table' then return { ok = false, err = opname .. ' did not return an Op' } end - local ok2, result = pcall(function () return fibers.perform(driver_op) end) - if not ok2 then return { ok = false, err = tostring(result) } end - if type(result) == 'table' then return result end - return { ok = result == true, result = result } -end - -local function driver_result(provider_id, method, opts) - local driver = state.drivers[provider_id] - if not driver then return { ok = false, err = 'wired provider not configured', code = 'not_configured' } end - return perform_driver_method(driver, method, opts) -end - -local function poller_is_current(provider_id, driver) - local rec = state.pollers and state.pollers[provider_id] or nil - return rec ~= nil and rec.driver == driver and state.drivers[provider_id] == driver -end - -local function emit_observing_once(provider_id, driver) - local rec = state.pollers and state.pollers[provider_id] or nil - if not rec or rec.observing_emitted then return true, nil end - local ok, err = emit_status_now(provider_id, { - state = 'observing', - available = false, - 
driver = driver.provider or driver.driver or 'wired-provider', - polling = true, - }) - if ok == true then rec.observing_emitted = true end - return ok, err -end - -local function publish_observation(provider_id, snapshot) - local cache = merge_observation(provider_id, snapshot) - return emit_snapshot_now(provider_id, cache, snapshot or {}) -end - -local function failure_status_for_plan(plan, result) - local err = result and result.err or (result and result.status and result.status.err) or 'wired provider observation failed' - local unavailable = false - if plan and plan.groups then - for _, group in ipairs(plan.groups) do - if group == 'panel' or group == 'snapshot' then unavailable = true end - end - else - unavailable = true - end - return { - state = unavailable and 'unavailable' or 'degraded', - available = not unavailable, - err = err, - poll = plan and plan.name or nil, - polling = true, - } +local function emit_state(class, id, key, payload) + local ev = assert(hal_types.new.Emit(class, id, 'state', key, payload)) + return state.cap_emit_ch:put_op(ev):wrap(function () return true, nil end) end -local function perform_poll_plan(provider_id, driver, plan) - if plan.method == 'snapshot' then - local result = perform_driver_method(driver, 'snapshot', {}) - if result and result.ok == true then return publish_observation(provider_id, result) end - return emit_status_now(provider_id, failure_status_for_plan(plan, result)) - end - - for _, group in ipairs(plan.groups or {}) do - if not poller_is_current(provider_id, driver) then return true, nil end - local result - if group == 'snapshot' then - result = perform_driver_method(driver, 'snapshot', {}) - else - result = perform_driver_method(driver, 'group_observation', { group = group }) - end - if not poller_is_current(provider_id, driver) then return true, nil end - if result and result.ok == true then - local ok, err = publish_observation(provider_id, result) - if ok ~= true then return nil, err end - else - local 
ok, err = emit_status_now(provider_id, failure_status_for_plan({ name = plan.name, groups = { group } }, result)) - if ok ~= true then return nil, err end - end - end +local function emit_provider_state(provider_id, key, payload) + local ok, err = fibers.perform(emit_state('wired-provider', provider_id, key, payload or {})) + if ok == false or ok == nil then return nil, err end return true, nil end -local function poll_loop(provider_id, driver, plan, ready_cond) - if ready_cond ~= nil then - fibers.perform(ready_cond:wait_op()) - if not poller_is_current(provider_id, driver) then return end - end - - local ok, err = emit_observing_once(provider_id, driver) - if ok ~= true then log('error', { what = 'wired_provider_initial_status_emit_failed', provider = provider_id, err = err }) end - - while poller_is_current(provider_id, driver) do - local started = runtime.now() - local ok, err = perform_poll_plan(provider_id, driver, plan) - if ok ~= true then log('error', { what = 'wired_provider_poll_emit_failed', provider = provider_id, poll = plan.name, err = err }) end - local elapsed = runtime.now() - started - fibers.perform(sleep.sleep_op(max(0, plan.interval_s - elapsed))) - end -end - -local function cancel_pollers(reason) - local pollers = state.pollers or {} - state.pollers = {} - for _, rec in pairs(pollers) do - if rec and rec.scope then rec.scope:cancel(reason or 'wired provider poller cancelled') end - end -end - -local function cancel_provider_poller(provider_id, reason) - local rec = state.pollers and state.pollers[provider_id] or nil - if not rec then return end - state.pollers[provider_id] = nil - if rec.scope then rec.scope:cancel(reason or 'wired provider poller cancelled') end -end - -local function spawn_provider_poller(provider_id, driver, poll_plan, ready_cond) - if not state.scope then return nil, 'wired manager scope not started' end - cancel_provider_poller(provider_id, 'wired provider poller replaced') - local poll_scope, scope_err = 
state.scope:child() - if not poll_scope then return nil, scope_err or 'wired provider poller scope create failed' end - - local rec = { - scope = poll_scope, - driver = driver, - poll_plan = poll_plan, - ready_cond = ready_cond, - observing_emitted = false, - } - state.pollers[provider_id] = rec - poll_scope:finally(function () - if state.pollers and state.pollers[provider_id] == rec then state.pollers[provider_id] = nil end - end) - - for _, plan in ipairs(poll_plan or {}) do - local ok, err = poll_scope:spawn(function () poll_loop(provider_id, driver, plan, ready_cond) end) - if not ok then - if state.pollers[provider_id] == rec then state.pollers[provider_id] = nil end - poll_scope:cancel(tostring(err or 'wired provider poller spawn failed')) - return nil, err or 'wired provider poller spawn failed' - end - end - return true, nil +local function runner_result(provider_id, method, opts) + local runner = state.runners[provider_id] + if not runner then return { ok = false, err = 'wired provider not configured', code = 'not_configured' } end + local opname = tostring(method) .. '_op' + local fn = runner[opname] + if type(fn) ~= 'function' then return { ok = false, err = 'wired runner missing ' .. opname } end + local ok, runner_op = safe.pcall(function () return fn(runner, opts or {}) end) + if not ok then return { ok = false, err = tostring(runner_op) } end + if type(runner_op) ~= 'table' then return { ok = false, err = opname .. 
' did not return an Op' } end + local ok2, result = safe.pcall(function () return fibers.perform(runner_op) end) + if not ok2 then return { ok = false, err = tostring(result) } end + if type(result) == 'table' then return result end + return { ok = result == true, result = result } end local function handle_request(provider_id, req) @@ -393,13 +125,10 @@ local function handle_request(provider_id, req) local opts = req and req.opts or {} local result if verb == 'snapshot' or verb == 'watch' or verb == 'apply_attachments' or verb == 'set_poe' or verb == 'bounce' then - result = driver_result(provider_id, verb, opts) + result = runner_result(provider_id, verb, opts) else result = { ok = false, err = 'unsupported wired-provider verb: ' .. tostring(verb) } end - if result and result.ok == true and (verb == 'snapshot' or verb == 'watch') then - emit_snapshot_now(provider_id, result) - end reply(req, result and result.ok == true, result) end @@ -437,11 +166,11 @@ local function close_control_channels() state.controls = {} end -local function stop_drivers(reason) - for _, driver in pairs(state.drivers or {}) do - if driver and type(driver.terminate) == 'function' then driver:terminate(reason or 'reconfigured') end +local function stop_runners(reason) + for _, runner in pairs(state.runners or {}) do + if runner and type(runner.terminate) == 'function' then runner:terminate(reason or 'reconfigured') end end - state.drivers = {} + state.runners = {} end local function spawn_control_loops(provider_ids) @@ -477,10 +206,10 @@ local function configured_provider(config, provider_id) return shallow_copy(rec) end -local function reconcile_device_caps(provider_ids) +local function reconcile_device_caps(provider_ids, ready_cond) local new_sig = list_signature(provider_ids) local old_sig = list_signature(state.provider_ids) - if new_sig == old_sig then return true, nil, nil end + if new_sig == old_sig then return true, nil, ready_cond, true end if state.device_registered then local ok, 
err = fibers.perform(device_event_op('removed', {})) @@ -491,11 +220,10 @@ local function reconcile_device_caps(provider_ids) close_control_channels() state.provider_ids = {} - if #provider_ids == 0 then return true, nil, nil end + if #provider_ids == 0 then return true, nil, ready_cond, true end local caps = make_caps(provider_ids) spawn_control_loops(provider_ids) - local ready_cond = cond.new() local ok, err = fibers.perform(device_event_op('added', caps, ready_cond)) if ok == false or ok == nil then close_control_channels() @@ -504,7 +232,7 @@ local function reconcile_device_caps(provider_ids) state.provider_ids = provider_ids state.device_registered = true - return true, nil, ready_cond + return true, nil, ready_cond, false end function M.start_op(logger, dev_ev_ch, cap_emit_ch, opts) @@ -520,11 +248,8 @@ function M.start_op(logger, dev_ev_ch, cap_emit_ch, opts) state.cap_emit_ch = cap_emit_ch state.http_client_for = opts and opts.http_client_for or nil state.controls = {} - state.drivers = {} + state.runners = {} state.provider_ids = {} - state.pollers = {} - state.observations = {} - state.emitted = {} state.device_registered = false child:finally(function (_, status, primary) M.terminate(primary or status or 'wired manager closed') end) @@ -534,11 +259,18 @@ function M.start_op(logger, dev_ev_ch, cap_emit_ch, opts) end) end - local function terminate_prepared(prepared, reason) for _, rec in pairs(prepared or {}) do - local driver = rec and rec.driver - if driver and type(driver.terminate) == 'function' then driver:terminate(reason or 'discarded') end + local backend_handle = rec and rec.backend_handle + if backend_handle and not rec.owned_by_runner and type(backend_handle.terminate) == 'function' then + backend_handle:terminate(reason or 'discarded') + end + end +end + +local function terminate_runners(runners, reason) + for _, runner in pairs(runners or {}) do + if runner and type(runner.terminate) == 'function' then runner:terminate(reason or 'discarded') 
end end end @@ -563,13 +295,12 @@ local function prepare_providers(config, provider_ids) local driver_opts = { logger = state.logger, cap_emit_ch = state.cap_emit_ch, provider_id = id } if driver_config.provider == 'rtl8380m_http' then driver_opts.http_client_for = state.http_client_for end - local driver, err = driver_mod.new(driver_config, driver_opts) - if not driver then + local backend_handle, err = backend_mod.new(driver_config, driver_opts) + if not backend_handle then terminate_prepared(prepared, 'prepare failed') return nil, ('wired provider %s create failed: %s'):format(id, tostring(err)) end - driver.provider = driver_config.provider - prepared[id] = { driver = driver, poll_plan = poll_plan } + prepared[id] = { backend_handle = backend_handle, provider_name = backend_handle.provider_name, poll_plan = poll_plan } end return prepared, nil end @@ -584,32 +315,53 @@ function M.apply_config_op(config) local prepared, prep_err = prepare_providers(config or {}, provider_ids) if not prepared then return false, prep_err end - cancel_pollers('reconfigured') - stop_drivers('reconfigured') - state.observations = {} - state.emitted = {} - local ok, cerr, caps_ready_cond = reconcile_device_caps(provider_ids) - if ok ~= true then - terminate_prepared(prepared, 'capability reconcile failed') - return false, cerr - end - - state.drivers = {} + local caps_ready_cond = cond.new() + local runners = {} for i = 1, #provider_ids do local id = provider_ids[i] - state.drivers[id] = prepared[id].driver + local rec = prepared[id] + local runner, rerr = provider_runner.new({ + provider_id = id, + provider_name = rec.provider_name, + backend = rec.backend_handle, + poll_plan = rec.poll_plan, + ready_cond = caps_ready_cond, + parent_scope = state.scope, + emit_state = emit_provider_state, + log = log, + }) + if not runner then + terminate_runners(runners, 'runner create failed') + terminate_prepared(prepared, 'runner create failed') + return false, ('wired provider %s runner failed: 
%s'):format(id, tostring(rerr)) + end + rec.owned_by_runner = true + runners[id] = runner end for i = 1, #provider_ids do local id = provider_ids[i] - local rec = prepared[id] - local spawned, spawn_err = spawn_provider_poller(id, rec.driver, rec.poll_plan, caps_ready_cond) + local spawned, spawn_err = runners[id]:start() if spawned ~= true then - cancel_pollers('poller spawn failed') - stop_drivers('poller spawn failed') - return false, ('wired provider %s poller failed: %s'):format(id, tostring(spawn_err)) + terminate_runners(runners, 'runner spawn failed') + terminate_prepared(prepared, 'runner spawn failed') + return false, ('wired provider %s runner failed: %s'):format(id, tostring(spawn_err)) end end + + stop_runners('reconfigured') + local ok, cerr, _, ready_now = reconcile_device_caps(provider_ids, caps_ready_cond) + if ok ~= true then + terminate_runners(runners, 'capability reconcile failed') + return false, cerr + end + + state.runners = {} + for i = 1, #provider_ids do + local id = provider_ids[i] + state.runners[id] = runners[id] + end + if ready_now and caps_ready_cond then caps_ready_cond:signal() end log('info', { what = 'wired_manager_configured', providers = provider_ids }) return true, nil end):wrap(function (status, report, ok_or_primary, err) @@ -627,13 +379,9 @@ function M.shutdown_op(_timeout_s) end function M.terminate(reason) - cancel_pollers(reason or 'terminated') - stop_drivers(reason or 'terminated') + stop_runners(reason or 'terminated') close_control_channels() state.provider_ids = {} - state.pollers = {} - state.observations = {} - state.emitted = {} state.device_registered = false if state.scope then local scope = state.scope; state.scope = nil; scope:cancel(reason or 'terminated') end state.started = false @@ -651,6 +399,7 @@ end M._test = { normalise_provider_ids = normalise_provider_ids, provider_poll_plan = provider_poll_plan, + groups_for_plans = provider_runner._test.groups_for_plans, } return M diff --git 
a/src/services/hal/managers/wired/provider_runner.lua b/src/services/hal/managers/wired/provider_runner.lua new file mode 100644 index 00000000..87015ada --- /dev/null +++ b/src/services/hal/managers/wired/provider_runner.lua @@ -0,0 +1,359 @@ +-- services/hal/managers/wired/provider_runner.lua +-- +-- Owned runner for one HAL wired-provider backend. The runner is the sole +-- owner of the backend/session object: polling, snapshots and future controls +-- all pass through this mailbox, giving CML-style serialisation without locks. + +local fibers = require 'fibers' +local safe = require 'coxpcall' +local op = require 'fibers.op' +local channel = require 'fibers.channel' +local sleep = require 'fibers.sleep' +local runtime = require 'fibers.runtime' +local tablex = require 'shared.table' + +local M = {} +local Runner = {} +Runner.__index = Runner + +local function max(a, b) if a > b then return a end return b end + +local function copy(v) return tablex.deep_copy(v) end + +local function stable_signature(v) + local tv = type(v) + if tv == 'nil' or tv == 'boolean' or tv == 'number' or tv == 'string' then + return tv .. ':' .. tostring(v) + end + if tv ~= 'table' then return tv .. ':' .. 
tostring(v) end + local keys = {} + for k in pairs(v) do keys[#keys + 1] = k end + table.sort(keys, function(a, b) return tostring(a) < tostring(b) end) + local out = { 'table{' } + for i = 1, #keys do + local k = keys[i] + out[#out + 1] = stable_signature(k) + out[#out + 1] = '=' + out[#out + 1] = stable_signature(v[k]) + out[#out + 1] = ';' + end + out[#out + 1] = '}' + return table.concat(out) +end + +local function merge_table(dst, src) + dst = dst or {} + if type(src) ~= 'table' then return dst end + for k, v in pairs(src) do + if v ~= nil then + if type(v) == 'table' and type(dst[k]) == 'table' then + merge_table(dst[k], v) + else + dst[k] = copy(v) + end + end + end + return dst +end + +local function merge_observation(cache, snapshot) + cache = cache or { + status = {}, + identity = {}, + runtime = {}, + power = {}, + surfaces = {}, + topology = {}, + } + snapshot = snapshot or {} + if type(snapshot.status) == 'table' then merge_table(cache.status, snapshot.status) end + for _, key in ipairs({ 'identity', 'runtime', 'power', 'topology' }) do + if type(snapshot[key]) == 'table' then merge_table(cache[key], snapshot[key]) end + end + if type(snapshot.surfaces) == 'table' then + for surface_id, surface in pairs(snapshot.surfaces) do + local id = tostring(surface_id or '') + if id ~= '' and type(surface) == 'table' then + cache.surfaces[id] = merge_table(cache.surfaces[id] or {}, surface) + end + end + end + return cache +end + +local function emit_state_changed(self, key, payload) + local sig = stable_signature(payload or {}) + if self.emitted[key] == sig then return true, nil, false end + self.emitted[key] = sig + local ok, err = self.emit_state(self.provider_id, key, payload or {}) + if ok == false or ok == nil then + self.emitted[key] = nil + return nil, err, false + end + return true, nil, true +end + +local function emit_snapshot(self, snapshot, present) + snapshot = snapshot or {} + present = present or snapshot + local ok, err + if present.status ~= nil 
then + ok, err = emit_state_changed(self, 'status', snapshot.status or { state = 'available', available = snapshot.ok == true }) + if ok ~= true then return nil, err end + end + if present.identity ~= nil then + ok, err = emit_state_changed(self, 'identity', snapshot.identity or {}) + if ok == false or ok == nil then return nil, err end + end + if present.runtime ~= nil then + ok, err = emit_state_changed(self, 'runtime', snapshot.runtime or {}) + if ok == false or ok == nil then return nil, err end + end + if present.power ~= nil then + ok, err = emit_state_changed(self, 'power', snapshot.power or {}) + if ok == false or ok == nil then return nil, err end + end + if present.surfaces ~= nil then + ok, err = emit_state_changed(self, 'surfaces', { surfaces = snapshot.surfaces or {} }) + if ok == false or ok == nil then return nil, err end + end + if present.topology ~= nil then + ok, err = emit_state_changed(self, 'topology', snapshot.topology or {}) + if ok == false or ok == nil then return nil, err end + end + return true, nil +end + +local function publish_observation(self, snapshot) + self.observation = merge_observation(self.observation, snapshot) + return emit_snapshot(self, self.observation, snapshot or {}) +end + +local function emit_status(self, status) + local ok, err = emit_state_changed(self, 'status', status or { state = 'available', available = true }) + if ok == false or ok == nil then return nil, err end + return true, nil +end + +local function copy_list(list) + local out = {} + for i = 1, #(list or {}) do out[i] = list[i] end + return out +end + +local function append_unique(out, seen, value) + value = tostring(value or '') + if value ~= '' and not seen[value] then + seen[value] = true + out[#out + 1] = value + end +end + +local function perform_backend_method(backend, method, opts) + local opname = tostring(method) .. 
'_op' + local fn = backend and backend[opname] + if type(fn) ~= 'function' then return { ok = false, err = 'wired backend missing ' .. opname } end + local ok, backend_op = safe.pcall(function () return fn(backend, opts or {}) end) + if not ok then return { ok = false, err = tostring(backend_op) } end + if type(backend_op) ~= 'table' then return { ok = false, err = opname .. ' did not return an Op' } end + local ok2, result = safe.pcall(function () return fibers.perform(backend_op) end) + if not ok2 then return { ok = false, err = tostring(result) } end + if type(result) == 'table' then return result end + return { ok = result == true, result = result } +end + +local function failure_status(provider_id, groups, result) + local gs = groups or {} + local err = result and result.err or (result and result.status and result.status.err) or 'wired provider observation failed' + local unavailable = false + for _, group in ipairs(gs) do + if group == 'panel' or group == 'snapshot' then unavailable = true end + end + if #gs == 0 then unavailable = true end + return { + state = unavailable and 'unavailable' or 'degraded', + available = not unavailable, + err = err, + groups = copy_list(gs), + provider_id = provider_id, + polling = true, + } +end + +local function groups_for_plans(plans) + local groups, seen = {}, {} + for _, plan in ipairs(plans or {}) do + for _, group in ipairs(plan.groups or {}) do append_unique(groups, seen, group) end + end + return groups +end + +local function initialise_due_times(self) + local now = runtime.now() + for i, plan in ipairs(self.poll_plan or {}) do + self.next_due[plan.name] = now + math.min(1.0, (i - 1) * 0.25) + end +end + +local function next_due_time(self) + local due = nil + for _, plan in ipairs(self.poll_plan or {}) do + local t = self.next_due[plan.name] + if t ~= nil and (due == nil or t < due) then due = t end + end + if due == nil then due = runtime.now() + 3600 end + return max(due, self.idle_until or 0) +end + +local function 
due_plans(self, now) + local plans = {} + for _, plan in ipairs(self.poll_plan or {}) do + local t = self.next_due[plan.name] + if t ~= nil and t <= now then plans[#plans + 1] = plan end + end + return plans +end + +local function mark_plans_attempted(self, plans) + local now = runtime.now() + for _, plan in ipairs(plans or {}) do self.next_due[plan.name] = now + plan.interval_s end + self.idle_until = now + self.min_idle_s +end + +local function runner_request_op(self, verb, opts) + return op.guard(function () + if self.closed then return op.always({ ok = false, err = 'wired provider runner closed', code = 'closed' }) end + local reply_ch = channel.new(1) + local msg = { kind = 'request', verb = verb, opts = opts or {}, reply_ch = reply_ch } + return fibers.run_scope_op(function () + fibers.perform(self.request_ch:put_op(msg)) + return fibers.perform(reply_ch:get_op()) + end):wrap(function (status, _report, result_or_primary, err) + if status == 'ok' then return result_or_primary, err end + return { ok = false, err = tostring(result_or_primary or status or 'wired provider request failed') }, nil + end) + end) +end + +function Runner:snapshot_op(req) return runner_request_op(self, 'snapshot', req) end +function Runner:watch_op(req) return runner_request_op(self, 'snapshot', req) end +function Runner:observe_groups_op(req) return runner_request_op(self, 'observe_groups', req) end +function Runner:apply_attachments_op(req) return runner_request_op(self, 'apply_attachments', req) end +function Runner:set_poe_op(req) return runner_request_op(self, 'set_poe', req) end +function Runner:bounce_op(req) return runner_request_op(self, 'bounce', req) end + +function Runner:terminate(reason) + self.closed = true + if self.scope then local scope = self.scope; self.scope = nil; scope:cancel(reason or 'wired provider runner terminated') end + if self.backend and type(self.backend.terminate) == 'function' then self.backend:terminate(reason or 'wired provider runner terminated') 
end + return true, nil +end + +function Runner:emit_observing() + if self.observing_emitted then return true, nil end + local ok, err = emit_status(self, { + state = 'observing', + available = false, + driver = self.provider_name, + polling = true, + }) + if ok == true then self.observing_emitted = true end + return ok, err +end + +function Runner:poll_due() + local now = runtime.now() + local plans = due_plans(self, now) + if #plans == 0 then return true, nil end + local groups = groups_for_plans(plans) + local result = perform_backend_method(self.backend, 'observe_groups', { groups = groups }) + mark_plans_attempted(self, plans) + if result and result.ok == true then return publish_observation(self, result) end + return emit_status(self, failure_status(self.provider_id, groups, result)) +end + +function Runner:handle_request(msg) + if type(msg) ~= 'table' then return end + local verb = msg.verb + local opts = msg.opts or {} + local result + if verb == 'snapshot' or verb == 'watch' then + result = perform_backend_method(self.backend, 'snapshot', opts) + if result and result.ok == true then publish_observation(self, result) end + elseif verb == 'observe_groups' then + result = perform_backend_method(self.backend, 'observe_groups', opts) + if result and result.ok == true then publish_observation(self, result) end + elseif verb == 'apply_attachments' or verb == 'set_poe' or verb == 'bounce' then + result = perform_backend_method(self.backend, verb, opts) + else + result = { ok = false, err = 'unsupported wired-provider verb: ' .. 
tostring(verb) } + end + if msg.reply_ch then fibers.perform(msg.reply_ch:put_op(result)) end +end + +function Runner:run() + if self.ready_cond ~= nil then fibers.perform(self.ready_cond:wait_op()) end + if self.closed then return end + local ok, err = self:emit_observing() + if ok ~= true then self.log('error', { what = 'wired_provider_initial_status_emit_failed', provider = self.provider_id, err = err }) end + initialise_due_times(self) + while not self.closed do + local which, msg = fibers.perform(op.named_choice({ + request = self.request_ch:get_op(), + due = sleep.sleep_until_op(next_due_time(self)), + })) + if which == 'request' then + self:handle_request(msg) + elseif which == 'due' then + local pok, perr = self:poll_due() + if pok ~= true then self.log('error', { what = 'wired_provider_poll_emit_failed', provider = self.provider_id, err = perr }) end + end + end +end + +function Runner:start() + local ok, err = self.scope:spawn(function () self:run() end) + if not ok then + self:terminate(tostring(err or 'wired provider runner spawn failed')) + return nil, err or 'wired provider runner spawn failed' + end + return true, nil +end + +function M.new(opts) + opts = opts or {} + if type(opts.provider_id) ~= 'string' or opts.provider_id == '' then return nil, 'provider_id is required' end + if type(opts.provider_name) ~= 'string' or opts.provider_name == '' then return nil, 'provider_name is required' end + if type(opts.backend) ~= 'table' then return nil, 'backend is required' end + if type(opts.poll_plan) ~= 'table' then return nil, 'poll_plan is required' end + if type(opts.parent_scope) ~= 'table' then return nil, 'parent_scope is required' end + if type(opts.emit_state) ~= 'function' then return nil, 'emit_state callback is required' end + + local child, err = opts.parent_scope:child() + if not child then return nil, err or 'wired provider runner scope create failed' end + local self = setmetatable({ + provider_id = opts.provider_id, + provider_name = 
opts.provider_name, + backend = opts.backend, + poll_plan = opts.poll_plan, + ready_cond = opts.ready_cond, + emit_state = opts.emit_state, + observation = nil, + emitted = {}, + log = opts.log or function () end, + request_ch = channel.new(opts.request_queue_size or 32), + scope = child, + closed = false, + next_due = {}, + idle_until = 0, + min_idle_s = tonumber(opts.min_idle_s) or 0.05, + observing_emitted = false, + }, Runner) + return self, nil +end + +M._test = { + groups_for_plans = groups_for_plans, +} + +return M diff --git a/tests/integration/devhost/rtl8380m_switch_spec.lua b/tests/integration/devhost/rtl8380m_switch_spec.lua index 4012abe2..d20e5766 100644 --- a/tests/integration/devhost/rtl8380m_switch_spec.lua +++ b/tests/integration/devhost/rtl8380m_switch_spec.lua @@ -16,11 +16,13 @@ local busmod = require 'bus' local fibers = require 'fibers' +local channel = require 'fibers.channel' local runfibers = require 'tests.support.run_fibers' local probe = require 'tests.support.bus_probe' local http_service = require 'services.http.service' +local wired_manager = require 'services.hal.managers.wired' local wired_service = require 'services.wired.service' local wired_config = require 'services.wired.config' local wired_topics = require 'services.wired.topics' @@ -170,6 +172,22 @@ local function require_successful_snapshot(provider) return snap end +local function require_successful_observe_groups(provider, groups) + local result = fibers.perform(provider:observe_groups_op({ groups = groups })) + assert_not_nil(result, 'observe_groups should return a table') + if result.ok ~= true then + local status = result.status or {} + error('switch observe_groups failed: ' .. tostring(status.err or result.err or 'unknown error'), 2) + end + return result +end + +local function assert_command_once(commands, command) + local n = 0 + for _, cmd in ipairs(commands or {}) do if cmd == command then n = n + 1 end end + assert_eq(n, 1, 'expected command ' .. 
tostring(command) .. ' exactly once') +end + local function retain_switch_raw(conn, snap) conn:retain({ 'raw', 'host', 'wired', 'provider', 'switch-main', 'status' }, snap.status or {}) conn:retain({ 'raw', 'host', 'wired', 'provider', 'switch-main', 'state', 'identity' }, snap.identity or {}) @@ -227,6 +245,165 @@ local function retain_wired_config(conn) }) end +local function raw_provider_topic(id, suffix) + local topic = { 'raw', 'host', 'wired', 'provider', id } + for i = 1, #(suffix or {}) do topic[#topic + 1] = suffix[i] end + return topic +end + +local function start_wired_manager_hal_harness(scope, bus, dev_ev_ch, cap_emit_ch) + local child = assert(scope:child()) + local writer = bus:connect({ origin_base = { kind = 'local', component = 'wired-manager-hal-harness' } }) + + assert(child:spawn(function () + while true do + local ev = fibers.perform(dev_ev_ch:get_op()) + if ev == nil then return end + if ev.class == 'wired' and ev.id == 'main' then + if ev.event_type == 'added' then + for _, cap in ipairs(ev.capabilities or {}) do + if cap.class == 'wired-provider' then + writer:retain(raw_provider_topic(cap.id, { 'status' }), { + state = 'available', + available = true, + source_kind = 'host', + source = 'wired', + }) + writer:retain(raw_provider_topic(cap.id, { 'meta' }), { + offerings = cap.offerings or {}, + source_kind = 'host', + source = 'wired', + }) + end + end + if ev.ready_cond then ev.ready_cond:signal() end + elseif ev.event_type == 'removed' then + for _, cap in ipairs(ev.capabilities or {}) do + if cap.class == 'wired-provider' then writer:unretain(raw_provider_topic(cap.id, { 'status' })) end + end + if ev.ready_cond then ev.ready_cond:signal() end + end + end + end + end)) + + assert(child:spawn(function () + while true do + local emit = fibers.perform(cap_emit_ch:get_op()) + if emit == nil then return end + if emit.class == 'wired-provider' and emit.mode == 'state' then + if emit.key == 'status' then + 
writer:retain(raw_provider_topic(emit.id, { 'status' }), emit.data or {}) + else + writer:retain(raw_provider_topic(emit.id, { 'state', emit.key }), emit.data or {}) + end + end + end + end)) + + return child +end + +local function switch_manager_config(env) + return { + providers = { + ['switch-main'] = { + provider = 'rtl8380m_http', + base_url = env.base_url, + username = env.username, + password = env.password, + timeout_s = env.timeout_s, + openssl_bin = env.openssl_bin, + include_raw = true, + http = { capability = 'main', response_parser = 'legacy-http1-close' }, + poll = { + fast = { interval_s = 1.0, groups = { 'panel', 'poe', 'counters' } }, + }, + }, + }, + } +end + +function T.rtl8380m_real_switch_observe_groups_via_http_capability() + local env, err = required_env() + if not env then return skip(err) end + + runfibers.run(function () + local b = busmod.new() + local http = start_http_capability(b, env) + wait_http_available(b) + local provider = new_real_switch_provider(b, env) + local obs = require_successful_observe_groups(provider, { 'panel', 'poe', 'counters' }) + + assert_eq(obs.provider_id, 'switch-main') + assert_eq(obs.status.driver, 'rtl8380m_http') + assert_true(obs.status.available, 'observe_groups status should be available') + assert_not_nil(obs.surfaces, 'observe_groups should include surfaces') + assert_not_nil(obs.surfaces.GE8, 'observe_groups should include GE8') + assert_not_nil(obs.surfaces.GE9, 'observe_groups should include GE9') + assert_eq(obs.surfaces.GE9.link.media, 'fiber') + assert_not_nil(obs.power, 'poe group should include power') + assert_not_nil(obs.power.poe, 'poe group should include power.poe') + assert_not_nil(obs.raw, 'include_raw=true should preserve grouped source payloads') + assert_not_nil(obs.raw.home_main, 'grouped observation should capture home_main') + assert_not_nil(obs.raw.panel_info, 'grouped observation should capture panel_info') + assert_not_nil(obs.raw.poe_poe, 'grouped observation should capture 
poe_poe') + assert_not_nil(obs.raw.rmon_statistics, 'grouped observation should capture rmon_statistics') + assert_command_once(obs.commands, 'home_main') + assert_command_once(obs.commands, 'panel_info') + assert_command_once(obs.commands, 'poe_poe') + assert_command_once(obs.commands, 'rmon_statistics') + + provider:terminate('test complete') + http:terminate('test complete') + end, { timeout = env.run_timeout_s }) +end + +function T.rtl8380m_real_switch_runner_publishes_raw_observations() + local env, err = required_env() + if not env then return skip(err) end + + runfibers.run(function (scope) + local b = busmod.new() + local http = start_http_capability(b, env) + wait_http_available(b) + + local manager_conn = b:connect({ origin_base = { kind = 'local', component = 'wired-manager-real-switch-test' } }) + local resolver = assert(hal_deps.resolver(manager_conn)) + local dev_ev_ch = channel.new(16) + local cap_emit_ch = channel.new(32) + local harness = start_wired_manager_hal_harness(scope, b, dev_ev_ch, cap_emit_ch) + local reader = b:connect({ origin_base = { kind = 'local', component = 'wired-runner-test-reader' } }) + + wired_manager.terminate('test reset') + local ok_start, start_err = fibers.perform(wired_manager.start_op(nil, dev_ev_ch, cap_emit_ch, { + http_client_for = resolver:factory('http_client'), + })) + assert_true(ok_start, tostring(start_err)) + + local ok_apply, apply_err = fibers.perform(wired_manager.apply_config_op(switch_manager_config(env))) + assert_true(ok_apply, tostring(apply_err)) + + local status = probe.wait_retained_payload(reader, raw_provider_topic('switch-main', { 'status' }), { timeout = 3.0 }) + assert_not_nil(status, 'runner should publish raw provider status') + assert_true(status.available == true or status.state == 'observing', 'raw provider status should be observing or available') + + local surfaces_payload = probe.wait_retained_payload(reader, raw_provider_topic('switch-main', { 'state', 'surfaces' }), { timeout = 
env.run_timeout_s }) + local surfaces = assert_not_nil(surfaces_payload.surfaces, 'runner should publish raw surfaces') + assert_not_nil(surfaces.GE8, 'runner surfaces should include GE8') + assert_not_nil(surfaces.GE9, 'runner surfaces should include GE9') + assert_eq(surfaces.GE9.link.media, 'fiber') + + local power = probe.wait_retained_payload(reader, raw_provider_topic('switch-main', { 'state', 'power' }), { timeout = env.run_timeout_s }) + assert_not_nil(power.poe, 'runner should publish PoE power') + + wired_manager.terminate('test complete') + harness:cancel('test complete') + fibers.perform(harness:join_op()) + http:terminate('test complete') + end, { timeout = env.run_timeout_s + 5 }) +end + function T.rtl8380m_real_switch_snapshot_via_http_capability() local env, err = required_env() if not env then return skip(err) end diff --git a/tests/unit/hal/wired_provider_spec.lua b/tests/unit/hal/wired_provider_spec.lua index 58f50225..45ae4115 100644 --- a/tests/unit/hal/wired_provider_spec.lua +++ b/tests/unit/hal/wired_provider_spec.lua @@ -214,6 +214,56 @@ function tests.test_rtl8380m_http_accepts_narrow_http_client_factory() end + +function tests.test_wired_driver_separates_backend_provider_name_and_provider_id() + local driver_mod = require 'services.hal.drivers.wired' + local d, err = driver_mod.new({ + provider = 'static', + mode = 'read_only', + surfaces = { eth0 = { provider_surface_id = 'eth0' } }, + }, { provider_id = 'cm5-local-wired' }) + assert_not_nil(d, err) + assert_not_nil(d.backend) + assert_eq(d.provider, nil) + assert_eq(d.provider_name, 'static') + assert_eq(d.provider_id, 'cm5-local-wired') + assert_not_nil(d.snapshot_op) +end + +function tests.test_wired_driver_requires_observe_groups_backend_contract() + package.loaded['services.hal.backends.wired.providers.invalid_missing_observe_groups'] = { + new = function () + return { + snapshot_op = function () end, + watch_op = function () end, + apply_attachments_op = function () end, + 
set_poe_op = function () end, + bounce_op = function () end, + terminate = function () end, + } + end, + } + local driver_mod = require 'services.hal.drivers.wired' + local d, err = driver_mod.new({ provider = 'invalid_missing_observe_groups' }, { provider_id = 'bad' }) + assert_eq(d, nil) + assert_true(type(err) == 'string' and err:find('observe_groups_op', 1, true) ~= nil, tostring(err)) + package.loaded['services.hal.backends.wired.providers.invalid_missing_observe_groups'] = nil +end + +function tests.test_rtl8380m_observe_groups_deduplicates_shared_commands() + local commands, err = provider._test.commands_for_groups({ 'panel', 'poe', 'counters' }) + assert_not_nil(commands, err) + local seen = {} + for _, cmd in ipairs(commands) do + assert_eq(seen[cmd], nil, 'duplicate command ' .. tostring(cmd)) + seen[cmd] = true + end + assert_true(seen.home_main == true, 'home_main should be included once') + assert_true(seen.panel_info == true, 'panel_info should be included') + assert_true(seen.poe_poe == true, 'poe_poe should be included') + assert_true(seen.rmon_statistics == true, 'rmon_statistics should be included') +end + function tests.test_wired_manager_requires_canonical_poll_table() local manager = require 'services.hal.managers.wired' local plan, err = manager._test.provider_poll_plan({})