diff --git a/android/jni/mob_nif.zig b/android/jni/mob_nif.zig index 39d0e7d..127efbb 100644 --- a/android/jni/mob_nif.zig +++ b/android/jni/mob_nif.zig @@ -99,6 +99,95 @@ export fn nif_log2( return erts.ok(env); } +// ── NIF: resolve_ipv4/1 ────────────────────────────────────────────────── +// +// In-process IPv4 DNS via Bionic's getaddrinfo. Exists because BEAM's +// default DNS path — forking `inet_gethost` (a port program) — returns +// `:nxdomain` on physical Android devices we've tested, even though the +// app's own HTTP stack resolves the same hostnames fine. Suspected cause: +// Bionic's netd-routed resolver doesn't carry across to execve'd children +// of the app process the way it does to in-process calls. The emulator +// happens not to hit this, which is why it wasn't caught earlier. +// +// This NIF runs getaddrinfo in-process (same address space, same uid as +// the app), so it follows whatever DNS path the JVM and the app's libraries +// use. Mirrors iOS's `nif_resolve_ipv4` in `ios/mob_nif.m` and uses the +// same atom/error vocabulary so `Mob.DNS.resolve/1` is platform-agnostic. +// +// Dirty-scheduled because getaddrinfo can block on the resolver for the +// full timeout (seconds). Keep it off the regular schedulers. 
+//
+// Returns:
+//   {:ok, {a, b, c, d}}
+//   {:error, :badarg}      — host wasn't a Latin-1 charlist, or didn't fit in 255 bytes
+//   {:error, :nxdomain}    — no such hostname
+//   {:error, :timeout}     — TRY_AGAIN
+//   {:error, :no_address}  — got a result but no usable IPv4 entry in the chain
+//   {:error, {:gai, code}} — anything else; raw EAI_* int
+export fn nif_resolve_ipv4(
+    env: ?*erts.ErlNifEnv,
+    argc: c_int,
+    argv: [*]const erts.ERL_NIF_TERM,
+) callconv(.c) erts.ERL_NIF_TERM {
+    _ = argc;
+    var host: [256]u8 = undefined;
+    const got = erts.enif_get_string(env, argv[0], &host, host.len, erts.ERL_NIF_LATIN1);
+    if (got <= 0) return erts.errorTuple(env, erts.atom(env, "badarg")); // 0: not a charlist, <0: truncated
+
+    const hints: jni.AddrInfo = .{
+        .ai_flags = 0,
+        .ai_family = jni.AF_INET,
+        .ai_socktype = jni.SOCK_STREAM,
+        .ai_protocol = 0,
+        .ai_addrlen = 0,
+        .ai_canonname = null,
+        .ai_addr = null,
+        .ai_next = null,
+    };
+
+    var result: ?*jni.AddrInfo = null;
+    const err = jni.getaddrinfo(@ptrCast(&host), null, &hints, &result);
+    if (err != 0) {
+        // `erts.atom` requires a comptime-known name, so each EAI_* maps to a
+        // literal string in its own switch arm rather than a runtime-selected
+        // pointer.
+        return switch (err) {
+            jni.EAI_NONAME, jni.EAI_NODATA => erts.errorTuple(env, erts.atom(env, "nxdomain")),
+            jni.EAI_AGAIN => erts.errorTuple(env, erts.atom(env, "timeout")),
+            else => blk: {
+                // Surface raw EAI_* so callers can log/branch on it.
+                const gai = erts.makeTuple(env, .{ erts.atom(env, "gai"), erts.enif_make_int(env, err) });
+                break :blk erts.errorTuple(env, gai);
+            },
+        };
+    }
+    defer jni.freeaddrinfo(result);
+
+    // Walk the chain for the first usable AF_INET sockaddr. Entries of another
+    // family, or with a null ai_addr, are skipped instead of being trusted.
+    var ai: ?*jni.AddrInfo = result;
+    var found: ?u32 = null;
+    while (ai) |entry| : (ai = entry.ai_next) {
+        if (entry.ai_family != jni.AF_INET) continue;
+        const sin: *jni.SockAddrIn = @ptrCast(@alignCast(entry.ai_addr orelse continue));
+        // sin_addr is network byte order; bigToNative is the ntohl-equivalent,
+        // so the shifts below see the first octet in the high byte.
+        found = std.mem.bigToNative(u32, sin.sin_addr);
+        break;
+    }
+
+    if (found) |addr| {
+        const ip_tuple = erts.makeTuple(env, .{
+            erts.enif_make_int(env, @intCast((addr >> 24) & 0xFF)),
+            erts.enif_make_int(env, @intCast((addr >> 16) & 0xFF)),
+            erts.enif_make_int(env, @intCast((addr >> 8) & 0xFF)),
+            erts.enif_make_int(env, @intCast(addr & 0xFF)),
+        });
+        return erts.makeTuple(env, .{ erts.ok(env), ip_tuple });
+    }
+    return erts.errorTuple(env, erts.atom(env, "no_address"));
+}
+
 // ── Helpers ──────────────────────────────────────────────────────────────
 
 /// Pull a binary or charlist into a NUL-terminated buffer. Returns false if
@@ -4925,6 +5014,8 @@ const nif_funcs = [_]erts.ErlNifFunc{
     .{ .name = "bt_spp_write", .arity = 2, .fptr = nif_bt_spp_write, .flags = erts.ERL_NIF_DIRTY_JOB_IO_BOUND },
     .{ .name = "bt_hid_connect", .arity = 1, .fptr = nif_bt_hid_connect, .flags = 0 },
     .{ .name = "bt_hid_subscribe_raw", .arity = 1, .fptr = nif_bt_hid_subscribe_raw, .flags = 0 },
+    // ── Mob.DNS (in-process IPv4 resolver via Bionic getaddrinfo) ────────
+    .{ .name = "resolve_ipv4", .arity = 1, .fptr = nif_resolve_ipv4, .flags = erts.ERL_NIF_DIRTY_JOB_IO_BOUND },
 };
 
 var mob_nif_entry: erts.ErlNifEntry = .{
diff --git a/android/jni/mob_zig.zig b/android/jni/mob_zig.zig
index 5db1c04..39de4c6 100644
--- a/android/jni/mob_zig.zig
+++ b/android/jni/mob_zig.zig
@@ -163,6 +163,66 @@ pub const RTLD_GLOBAL: c_int = 0x00100;
 pub extern fn dlopen(filename: [*:0]const u8, flags: c_int) ?*anyopaque;
 pub extern fn dlerror() ?[*:0]const u8;
 
+// ── netdb (in-process DNS) ─────────────────────────────────────────────────
+// Bindings for
Bionic's getaddrinfo so a NIF can resolve hostnames inside
+// the BEAM's process. BEAM's default DNS path forks the `inet_gethost` port
+// program and that path returns NXDOMAIN on physical Android devices for
+// reasons we haven't fully pinned down (libnetd_client routing through netd
+// behaves differently for execve'd children of the app — works on emulator,
+// fails on phones we've tested). Calling getaddrinfo in-process from the
+// app's UID and address space sidesteps the issue: it's the same code path
+// the app's own HTTP stack uses when it succeeds.
+//
+// Layout mirrors Bionic's `bionic/libc/include/netdb.h` exactly. Note that
+// Bionic's `struct addrinfo` orders `ai_canonname` *before* `ai_addr`, which
+// is the historical BSD layout — glibc swaps them. Verified against AOSP
+// `bionic/libc/include/netdb.h` (NDK r25+).
+
+pub const AF_INET: c_int = 2;
+pub const SOCK_STREAM: c_int = 1;
+
+/// `getaddrinfo` EAI_* error codes. Bionic values, which happen to match
+/// Darwin BSD for the ones we care about — but we declare them here for
+/// clarity at the Zig call site.
+pub const EAI_AGAIN: c_int = 2;
+pub const EAI_NODATA: c_int = 7;
+pub const EAI_NONAME: c_int = 8;
+
+pub const AddrInfo = extern struct {
+    ai_flags: c_int,
+    ai_family: c_int,
+    ai_socktype: c_int,
+    ai_protocol: c_int,
+    ai_addrlen: u32,
+    ai_canonname: ?[*:0]u8, // precedes ai_addr in Bionic's layout (see note above)
+    ai_addr: ?*SockAddr, // nullable; reinterpret by family (SockAddrIn for AF_INET)
+    ai_next: ?*AddrInfo, // next node of the result chain; null ends it
+};
+
+/// Generic 16-byte sockaddr that getaddrinfo's result chain points at.
+pub const SockAddr = extern struct {
+    sa_family: u16,
+    _padding: [14]u8, // family-specific payload, opaque at this level
+};
+
+/// IPv4 sockaddr layout (sa_family=AF_INET); same 16-byte size as SockAddr.
+pub const SockAddrIn = extern struct {
+    sin_family: u16,
+    sin_port: u16,
+    /// IPv4 address, network byte order.
+    sin_addr: u32,
+    sin_zero: [8]u8, // filler that brings the struct up to SockAddr's size
+};
+
+pub extern fn getaddrinfo(
+    node: [*:0]const u8, // NUL-terminated host name to resolve
+    service: ?[*:0]const u8, // optional service name; null when only the address is wanted
+    hints: ?*const AddrInfo, // optional filter (family, socktype, ...) for the lookup
+    res: *?*AddrInfo, // out: head of the result chain; release it with freeaddrinfo
+) c_int; // 0 on success, otherwise an EAI_* code
+
+pub extern fn freeaddrinfo(res: ?*AddrInfo) void; // frees a chain handed back via getaddrinfo's `res`
+
 // ── JNI ────────────────────────────────────────────────────────────────────
 // AOSP source: frameworks/native/include/jni.h. We only declare the vtable
 // entries we actually call; future iters can add more as needed.
diff --git a/common_fixes.md b/common_fixes.md
index 0111fe7..03e0df6 100644
--- a/common_fixes.md
+++ b/common_fixes.md
@@ -990,3 +990,55 @@ places, lock-step:
    test enforces equality).
 
 3. `mob_dev/scripts/release/openssl/_lib.sh` — `NDK_VERSION` default.
+
+## Android `:inet.getaddr/2` returns `:nxdomain` on physical devices (works on emulator)
+
+**Symptom** — On a deployed mob app on a physical Android device, the
+BEAM can't resolve hostnames:
+
+```elixir
+:inet.getaddr(~c"repo.hex.pm", :inet)
+#=> {:error, :nxdomain}
+```
+
+…but the SAME app can `:gen_tcp.connect/3` to a hardcoded IP fine, and
+`adb shell ping` from the device works. The Android emulator does NOT
+hit this — it works there — which is why this didn't show in early
+testing. Verified on Moto G Power 5G 2024 (Android 14).
+
+**Root cause** — BEAM's default DNS path forks `inet_gethost` (a port
+program) and reads what its `getaddrinfo` returns. On a physical
+Android device, Bionic's `getaddrinfo` *in the execve'd child* of the
+app process doesn't pick up the per-network DNS servers the way the
+app's own in-process HTTPS stack does. We suspect this is related to
+how `libnetd_client.so` routing into `netd` survives across execve,
+but we haven't pinned it down — happy to take a PR with the actual
+diagnosis.
+
+**Fix** — Resolve in-process via `Mob.DNS.resolve/1`, which calls
+Bionic's `getaddrinfo` from a NIF and seeds `:inet_db` with the
+result.
Subsequent `:inet.getaddr/2` lookups hit the seeded `:file` +entry and succeed: + +```elixir +def on_start do + # Preresolve the hosts your app/notebook needs at startup. Idempotent; + # cheap; works on iOS, Android-physical, and Android-emulator alike. + Mob.DNS.preresolve(["repo.hex.pm", "hex.pm", "api.example.com"]) + + # …rest of startup. Any subsequent Req/Finch/Mint/Mix.install call for + # these hosts will find the seeded entry. +end +``` + +For a host not known until request-time, call `Mob.DNS.resolve/1` +just before the request. See the `Mob.DNS` moduledoc for the +cellular caveat and the `configure_pure_beam/1` fallback. + +**Background-app caveat** — Android's App Standby / battery +optimizer blocks *all* outbound network from a backgrounded mob +app (TCP-by-IP too, not just DNS). The DNS fix above only matters +once the app is foregrounded or running under a foreground +service. Symptom: any socket attempt returns `:closed` / `:timeout` +immediately. Bring the app foreground or attach a foreground +service before triggering long-lived network work. diff --git a/lib/mob/dns.ex b/lib/mob/dns.ex index 74d703c..ad2f3c4 100644 --- a/lib/mob/dns.ex +++ b/lib/mob/dns.ex @@ -1,40 +1,47 @@ defmodule Mob.DNS do @moduledoc """ Hostname → IP resolution that works around BEAM's broken DNS path - on iOS. + on iOS and physical Android devices. ## Why this exists BEAM resolves hostnames by spawning an external helper called `inet_gethost` (a port program). On macOS, Linux, Windows that - works fine. On **iOS** it doesn't — the iOS app sandbox forbids - `execve` of any binary the app didn't get a special pass for, and - there's no equivalent of Android's `lib*.so` escape hatch. - Result: `:inet.getaddr/2` (and therefore Req, Finch, Mint, - ReqLLM, and basically every Elixir HTTP library) fails on iOS - the moment a request hits a hostname rather than a literal IP. 
-
-  This module side-steps the problem by calling Darwin's
-  `getaddrinfo` directly via a NIF, then seeding `:inet_db` with
-  the result so subsequent BEAM-level lookups for the same host
-  succeed from the in-process file table.
-
-  Android isn't affected — `mob_beam.zig` ships `inet_gethost` as
-  `libinet_gethost.so` in `jniLibs/`, which the SELinux policy
-  allows to `execve`. The NIF here would work on Android too but
-  isn't wired up by default; the BEAM path is already functional
-  there.
+  works fine. On mobile it doesn't, for two distinct reasons:
+
+  - **iOS** — the app sandbox forbids `execve` of any binary the app
+    didn't get a special pass for. `inet_gethost` never runs.
+  - **Physical Android** — `inet_gethost` *does* run (mob ships it as
+    `libinet_gethost.so`, allowed to `execve` by the `apk_data_file`
+    SELinux label), but Bionic's getaddrinfo from the execve'd child
+    process returns `:nxdomain` for hostnames the same app's in-process
+    HTTPS stack resolves fine. The Android emulator happens not to hit
+    this (its DNS proxy at `10.0.2.3` is reachable from any process), so
+    the fault doesn't show in the emulator. Confirmed on a Moto G
+    Power 5G 2024 (Android 14): app-uid TCP-by-IP succeeds, but BEAM's
+    `:inet.getaddr/2` fails with `:nxdomain`.
+
+  In either case `:inet.getaddr/2` (and therefore Req, Finch, Mint,
+  ReqLLM, and basically every Elixir HTTP library) fails the moment
+  a request hits a hostname rather than a literal IP.
+
+  This module side-steps the problem by calling the OS resolver
+  (`getaddrinfo`) **in-process via a NIF** — same address space, same
+  uid as the rest of the app — then seeding `:inet_db` with the result
+  so subsequent BEAM-level lookups for the same host succeed from the
+  in-process file table.
 
   ## How to use it
 
   ### Robust everywhere, incl.
cellular: `resolve/1` · `preresolve/1` - `resolve/1` calls Darwin's `getaddrinfo` via a NIF — iOS's *own* - resolver — then seeds `:inet_db`'s `:file` table, so subsequent - `:inet.getaddr/2` lookups (Req / Finch / Mint) find the result. - Because it uses the OS resolver, it works wherever the OS does — - **including cellular** (it resolves via the carrier's DNS). This is - the recommended path and is device-verified on cellular. + `resolve/1` calls the OS's `getaddrinfo` via a NIF — Darwin's on iOS, + Bionic's on Android — then seeds `:inet_db`'s `:file` table, so + subsequent `:inet.getaddr/2` lookups (Req / Finch / Mint) find the + result. Because it uses the OS resolver, it works wherever the OS + does — **including iOS cellular** (carrier's DNS) and **physical + Android devices** where the forked `inet_gethost` path is broken. + This is the recommended path on both platforms. Preresolve your known hosts at startup — that's all most apps need: @@ -58,10 +65,11 @@ defmodule Mob.DNS do Two caveats that make this the *fallback*, not the default: - * **Gate it to iOS.** On Android `:native` works (mob ships - `inet_gethost` as a `.so`); forcing pure-`:dns` there *breaks* - lookups. (And never reset the chain to include `:native` on iOS — - exec'ing `inet_gethost` there is *fatal*, it crashes the BEAM.) + * **Don't reset the chain to include `:native` on iOS** — + exec'ing `inet_gethost` there is *fatal*, it crashes the BEAM. + On physical Android `:native` is non-fatal but unreliable + (returns `:nxdomain` for hostnames that do resolve in-process); + `Mob.DNS.resolve/1` is the recommended path there too. * **It can't resolve on cellular by default.** Its default nameservers are public (Google / Cloudflare), which carriers **commonly block** → `:nxdomain`. 
iOS exposes no reliable API to @@ -87,11 +95,17 @@ defmodule Mob.DNS do - **Doesn't help raw NIF networking.** If a third-party NIF calls libc `getaddrinfo` itself, it never goes through BEAM's DNS layer and doesn't need (or benefit from) this fix — it already - works on iOS. Only `:inet`-mediated lookups (which covers - almost all Elixir HTTP libraries) need our help. - - **iOS only effectively.** On Android and host (dev, macOS, - Linux) the NIF works but is unnecessary; BEAM's built-in path - is fine there. + works. Only `:inet`-mediated lookups (which covers almost all + Elixir HTTP libraries) need our help. + - **Background-app network restrictions still apply.** Android's + App Standby / battery optimizer can block *all* outbound network + from a backgrounded app, including TCP-by-IP — resolving a name + won't help if the OS is silently dropping the connect(). Use a + foreground service or keep the app foregrounded for sustained + DNS / connectivity. + - **Host dev (Mac, Linux) doesn't need this.** The NIF isn't + loaded off-device; callers get `{:error, :nif_not_loaded}` and + should fall back to BEAM's normal path (which works on dev). ## Errors