Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions android/jni/mob_nif.zig
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,95 @@ export fn nif_log2(
return erts.ok(env);
}

// ── NIF: resolve_ipv4/1 ──────────────────────────────────────────────────
//
// In-process IPv4 DNS via Bionic's getaddrinfo. Exists because BEAM's
// default DNS path — forking `inet_gethost` (a port program) — returns
// `:nxdomain` on physical Android devices we've tested, even though the
// app's own HTTP stack resolves the same hostnames fine. Suspected cause:
// Bionic's netd-routed resolver doesn't carry across to execve'd children
// of the app process the way it does to in-process calls. The emulator
// happens not to hit this, which is why it wasn't caught earlier.
//
// This NIF runs getaddrinfo in-process (same address space, same uid as
// the app), so it follows whatever DNS path the JVM and the app's libraries
// use. Mirrors iOS's `nif_resolve_ipv4` in `ios/mob_nif.m` and uses the
// same atom/error vocabulary so `Mob.DNS.resolve/1` is platform-agnostic.
//
// Dirty-scheduled because getaddrinfo can block on the resolver for the
// full timeout (seconds). Keep it off the regular schedulers.
//
// Returns:
// {:ok, {a, b, c, d}}
// {:error, :badarg} — host arg wasn't a Latin-1 charlist, or didn't fit the 256-byte buffer
// {:error, :nxdomain} — no such hostname
// {:error, :timeout} — TRY_AGAIN
// {:error, :no_address} — got a result but no IPv4 in the chain
// {:error, {:gai, code}} — anything else; raw EAI_* int
/// NIF `resolve_ipv4/1`: resolve a charlist hostname to its first IPv4
/// address with an in-process `getaddrinfo` call.
///
/// Returns `{:ok, {a, b, c, d}}`, or `{:error, reason}` where `reason` is
/// `:badarg`, `:nxdomain`, `:timeout`, `:no_address` or `{:gai, code}`.
export fn nif_resolve_ipv4(
    env: ?*erts.ErlNifEnv,
    argc: c_int,
    argv: [*]const erts.ERL_NIF_TERM,
) callconv(.c) erts.ERL_NIF_TERM {
    _ = argc;

    // A non-positive return means the term was not a Latin-1 charlist or did
    // not fit in `host` (terminating NUL included); both are reported as
    // :badarg.
    var host: [256]u8 = undefined;
    const got = erts.enif_get_string(env, argv[0], &host, host.len, erts.ERL_NIF_LATIN1);
    if (got <= 0) return erts.errorTuple(env, erts.atom(env, "badarg"));

    // Restrict the lookup to IPv4; every other hint field is left zero/null.
    const hints: jni.AddrInfo = .{
        .ai_flags = 0,
        .ai_family = jni.AF_INET,
        .ai_socktype = jni.SOCK_STREAM,
        .ai_protocol = 0,
        .ai_addrlen = 0,
        .ai_canonname = null,
        .ai_addr = null,
        .ai_next = null,
    };

    var result: ?*jni.AddrInfo = null;
    const err = jni.getaddrinfo(@ptrCast(&host), null, &hints, &result);
    if (err != 0) {
        // `erts.atom` requires a comptime-known name, so each EAI_* maps to a
        // literal string in its own switch arm rather than a runtime-selected
        // pointer.
        return switch (err) {
            jni.EAI_NONAME, jni.EAI_NODATA => erts.errorTuple(env, erts.atom(env, "nxdomain")),
            jni.EAI_AGAIN => erts.errorTuple(env, erts.atom(env, "timeout")),
            else => blk: {
                // Surface raw EAI_* so callers can log/branch on it.
                const gai = erts.makeTuple(env, .{ erts.atom(env, "gai"), erts.enif_make_int(env, err) });
                break :blk erts.errorTuple(env, gai);
            },
        };
    }
    // The lookup succeeded, so the chain is ours to release on every exit
    // path below. The terms we return hold copied integers, never pointers
    // into the chain, so freeing after the return value is built is safe.
    defer if (result) |head| jni.freeaddrinfo(head);

    // Walk the chain for the first AF_INET sockaddr. getaddrinfo with
    // ai_family=AF_INET shouldn't return anything else, but be defensive.
    var ai: ?*jni.AddrInfo = result;
    var found: ?u32 = null;
    while (ai) |entry| : (ai = entry.ai_next) {
        if (entry.ai_family != jni.AF_INET) continue;
        // `ai_addr` is an optional pointer; casting a null straight to
        // `*SockAddrIn` would be illegal behaviour, so skip such entries.
        const sockaddr = entry.ai_addr orelse continue;
        const sin: *jni.SockAddrIn = @ptrCast(@alignCast(sockaddr));
        // sin_addr is network byte order; bigToNative is the ntohl
        // equivalent, so the shifts below yield a.b.c.d on any host
        // endianness.
        found = std.mem.bigToNative(u32, sin.sin_addr);
        break;
    }

    if (found) |addr| {
        const ip_tuple = erts.makeTuple(env, .{
            erts.enif_make_int(env, @intCast((addr >> 24) & 0xFF)),
            erts.enif_make_int(env, @intCast((addr >> 16) & 0xFF)),
            erts.enif_make_int(env, @intCast((addr >> 8) & 0xFF)),
            erts.enif_make_int(env, @intCast(addr & 0xFF)),
        });
        return erts.makeTuple(env, .{ erts.ok(env), ip_tuple });
    }
    return erts.errorTuple(env, erts.atom(env, "no_address"));
}

// ── Helpers ──────────────────────────────────────────────────────────────

/// Pull a binary or charlist into a NUL-terminated buffer. Returns false if
Expand Down Expand Up @@ -4925,6 +5014,8 @@ const nif_funcs = [_]erts.ErlNifFunc{
.{ .name = "bt_spp_write", .arity = 2, .fptr = nif_bt_spp_write, .flags = erts.ERL_NIF_DIRTY_JOB_IO_BOUND },
.{ .name = "bt_hid_connect", .arity = 1, .fptr = nif_bt_hid_connect, .flags = 0 },
.{ .name = "bt_hid_subscribe_raw", .arity = 1, .fptr = nif_bt_hid_subscribe_raw, .flags = 0 },
// ── Mob.DNS (in-process IPv4 resolver via Bionic getaddrinfo) ────────
.{ .name = "resolve_ipv4", .arity = 1, .fptr = nif_resolve_ipv4, .flags = erts.ERL_NIF_DIRTY_JOB_IO_BOUND },
};

var mob_nif_entry: erts.ErlNifEntry = .{
Expand Down
60 changes: 60 additions & 0 deletions android/jni/mob_zig.zig
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,66 @@ pub const RTLD_GLOBAL: c_int = 0x00100;
/// Binding for the C `dlopen` symbol. `filename` is a NUL-terminated path and
/// `flags` a C int (see the `RTLD_*` constants); the returned handle is
/// optional, so callers must handle null.
pub extern fn dlopen(filename: [*:0]const u8, flags: c_int) ?*anyopaque;
/// Binding for the C `dlerror` symbol. Yields an optional NUL-terminated
/// message string.
pub extern fn dlerror() ?[*:0]const u8;

// ── netdb (in-process DNS) ─────────────────────────────────────────────────
// Bindings for Bionic's getaddrinfo so a NIF can resolve hostnames inside
// the BEAM's process. BEAM's default DNS path forks the `inet_gethost` port
// program and that path returns NXDOMAIN on physical Android devices for
// reasons we haven't fully pinned down (libnetd_client routing through netd
// behaves differently for execve'd children of the app — works on emulator,
// fails on phones we've tested). Calling getaddrinfo in-process from the
// app's UID and address space sidesteps the issue: it's the same code path
// the app's own HTTP stack uses when it succeeds.
//
// Layout mirrors Bionic's `bionic/libc/include/netdb.h` exactly. Note that
// Bionic's `struct addrinfo` orders `ai_canonname` *before* `ai_addr`, which
// is the historical BSD layout — glibc swaps them. Verified against AOSP
// `bionic/libc/include/netdb.h` (NDK r25+).

/// IPv4 address family; used as `ai_family` in `getaddrinfo` hints and
/// compared against `ai_family` on result entries.
pub const AF_INET: c_int = 2;
/// Socket type passed as `ai_socktype` in `getaddrinfo` hints.
pub const SOCK_STREAM: c_int = 1;

/// `getaddrinfo` EAI_* error codes. Bionic values, which happen to match
/// Darwin BSD for the ones we care about — but we declare them here for
/// clarity at the Zig call site.
///
/// `EAI_AGAIN`: `nif_resolve_ipv4` reports this as `{:error, :timeout}`.
pub const EAI_AGAIN: c_int = 2;
/// `EAI_NODATA`: `nif_resolve_ipv4` reports this as `{:error, :nxdomain}`.
pub const EAI_NODATA: c_int = 7;
/// `EAI_NONAME`: `nif_resolve_ipv4` reports this as `{:error, :nxdomain}`.
pub const EAI_NONAME: c_int = 8;

/// Mirror of C `struct addrinfo` in Bionic's field order (`ai_canonname`
/// precedes `ai_addr`). Used both as the `hints` argument to `getaddrinfo`
/// and as a node of the result chain linked through `ai_next`.
/// Field order is ABI; do not reorder.
pub const AddrInfo = extern struct {
ai_flags: c_int,
/// Address family of this entry (e.g. `AF_INET`).
ai_family: c_int,
ai_socktype: c_int,
ai_protocol: c_int,
ai_addrlen: u32,
ai_canonname: ?[*:0]u8,
/// Socket address for this entry; optional, so check for null before use.
ai_addr: ?*SockAddr,
/// Next entry in the chain, or null at the end.
ai_next: ?*AddrInfo,
};

/// Generic sockaddr used by getaddrinfo's result chain.
/// Only `sa_family` is interpreted here; callers reinterpret the pointer as a
/// family-specific layout such as `SockAddrIn`.
pub const SockAddr = extern struct {
sa_family: u16,
/// Family-specific payload bytes, opaque at this level.
_padding: [14]u8,
};

/// IPv4 sockaddr layout (sa_family=AF_INET).
/// The fields total 16 bytes, the same size as `SockAddr`.
pub const SockAddrIn = extern struct {
sin_family: u16,
sin_port: u16,
/// IPv4 address, network byte order.
sin_addr: u32,
/// Trailing bytes that pad the struct out to 16 bytes.
sin_zero: [8]u8,
};

/// Binding for the C `getaddrinfo` symbol. `node` is the NUL-terminated host
/// name; `service` and `hints` may be null. Returns 0 on success, in which
/// case `res` receives the head of the result chain (release it with
/// `freeaddrinfo`); any other return value is an `EAI_*` code.
pub extern fn getaddrinfo(
node: [*:0]const u8,
service: ?[*:0]const u8,
hints: ?*const AddrInfo,
res: *?*AddrInfo,
) c_int;

/// Binding for the C `freeaddrinfo` symbol; releases a result chain obtained
/// from `getaddrinfo`.
pub extern fn freeaddrinfo(res: ?*AddrInfo) void;

// ── JNI ────────────────────────────────────────────────────────────────────
// AOSP source: frameworks/native/include/jni.h. We only declare the vtable
// entries we actually call; future iters can add more as needed.
Expand Down
52 changes: 52 additions & 0 deletions common_fixes.md
Original file line number Diff line number Diff line change
Expand Up @@ -990,3 +990,55 @@ places, lock-step:
test enforces equality).
3. `mob_dev/scripts/release/openssl/_lib.sh` — `NDK_VERSION` default.


## Android `:inet.getaddr/2` returns `:nxdomain` on physical devices (works on emulator)

**Symptom** — On a deployed mob app on a physical Android device, the
BEAM can't resolve hostnames:

```elixir
:inet.getaddr(~c"repo.hex.pm", :inet)
#=> {:error, :nxdomain}
```

…but the SAME app can `:gen_tcp.connect/3` to a hardcoded IP fine, and
`adb shell ping` from the device works. The Android emulator does NOT
hit this — it works there — which is why this didn't show in early
testing. Verified on Moto G Power 5G 2024 (Android 14).

**Root cause** — BEAM's default DNS path forks `inet_gethost` (a port
program) and reads what its `getaddrinfo` returns. On a physical
Android device, Bionic's `getaddrinfo` *in the execve'd child* of the
app process doesn't pick up the per-network DNS servers the way the
app's own in-process HTTPS stack does. We suspect this is related to
how `libnetd_client.so` routing into `netd` survives across execve,
but we haven't pinned it down — happy to take a PR with the actual
diagnosis.

**Fix** — Resolve in-process via `Mob.DNS.resolve/1`, which calls
Bionic's `getaddrinfo` from a NIF and seeds `:inet_db` with the
result. Subsequent `:inet.getaddr/2` lookups hit the seeded `:file`
entry and succeed:

```elixir
def on_start do
# Preresolve the hosts your app/notebook needs at startup. Idempotent;
# cheap; works on iOS, Android-physical, and Android-emulator alike.
Mob.DNS.preresolve(["repo.hex.pm", "hex.pm", "api.example.com"])

# …rest of startup. Any subsequent Req/Finch/Mint/Mix.install call for
# these hosts will find the seeded entry.
end
```

For a host not known until request-time, call `Mob.DNS.resolve/1`
just before the request. See the `Mob.DNS` moduledoc for the
cellular caveat and the `configure_pure_beam/1` fallback.

**Background-app caveat** — Android's App Standby / battery
optimizer blocks *all* outbound network from a backgrounded mob
app (TCP-by-IP too, not just DNS). The DNS fix above only matters
once the app is foregrounded or running under a foreground
service. Symptom: any socket attempt returns `:closed` / `:timeout`
immediately. Bring the app foreground or attach a foreground
service before triggering long-lived network work.
80 changes: 47 additions & 33 deletions lib/mob/dns.ex
Original file line number Diff line number Diff line change
@@ -1,40 +1,47 @@
defmodule Mob.DNS do
@moduledoc """
Hostname → IP resolution that works around BEAM's broken DNS path
on iOS.
on iOS and physical Android devices.

## Why this exists

BEAM resolves hostnames by spawning an external helper called
`inet_gethost` (a port program). On macOS, Linux, Windows that
works fine. On **iOS** it doesn't — the iOS app sandbox forbids
`execve` of any binary the app didn't get a special pass for, and
there's no equivalent of Android's `lib*.so` escape hatch.
Result: `:inet.getaddr/2` (and therefore Req, Finch, Mint,
ReqLLM, and basically every Elixir HTTP library) fails on iOS
the moment a request hits a hostname rather than a literal IP.

This module side-steps the problem by calling Darwin's
`getaddrinfo` directly via a NIF, then seeding `:inet_db` with
the result so subsequent BEAM-level lookups for the same host
succeed from the in-process file table.

Android isn't affected — `mob_beam.zig` ships `inet_gethost` as
`libinet_gethost.so` in `jniLibs/`, which the SELinux policy
allows to `execve`. The NIF here would work on Android too but
isn't wired up by default; the BEAM path is already functional
there.
works fine. On mobile it doesn't, for two distinct reasons:

- **iOS** — the app sandbox forbids `execve` of any binary the app
didn't get a special pass for. `inet_gethost` never runs.
- **Physical Android** — `inet_gethost` *does* run (mob ships it as
`libinet_gethost.so`, allowed to `execve` by the `apk_data_file`
SELinux label), but Bionic's getaddrinfo from the execve'd child
process returns `:nxdomain` for hostnames the same app's in-process
HTTPS stack resolves fine. The Android emulator happens not to hit
this (its DNS proxy at `10.0.2.3` is reachable to anything), so
the fault doesn't show on the emulator. Confirmed on a Moto G
Power 5G 2024 (Android 14): app-uid TCP-by-IP succeeds, but BEAM's
`:inet.getaddr/2` fails with `:nxdomain`.

In either case `:inet.getaddr/2` (and therefore Req, Finch, Mint,
ReqLLM, and basically every Elixir HTTP library) fails the moment
a request hits a hostname rather than a literal IP.

This module side-steps the problem by calling the OS resolver
(`getaddrinfo`) **in-process via a NIF** — same address space, same
uid as the rest of the app — then seeding `:inet_db` with the result
so subsequent BEAM-level lookups for the same host succeed from the
in-process file table.

## How to use it

### Robust everywhere, incl. cellular: `resolve/1` · `preresolve/1`

`resolve/1` calls Darwin's `getaddrinfo` via a NIF — iOS's *own*
resolver — then seeds `:inet_db`'s `:file` table, so subsequent
`:inet.getaddr/2` lookups (Req / Finch / Mint) find the result.
Because it uses the OS resolver, it works wherever the OS does —
**including cellular** (it resolves via the carrier's DNS). This is
the recommended path and is device-verified on cellular.
`resolve/1` calls the OS's `getaddrinfo` via a NIF — Darwin's on iOS,
Bionic's on Android — then seeds `:inet_db`'s `:file` table, so
subsequent `:inet.getaddr/2` lookups (Req / Finch / Mint) find the
result. Because it uses the OS resolver, it works wherever the OS
does — **including iOS cellular** (carrier's DNS) and **physical
Android devices** where the forked `inet_gethost` path is broken.
This is the recommended path on both platforms.

Preresolve your known hosts at startup — that's all most apps need:

Expand All @@ -58,10 +65,11 @@ defmodule Mob.DNS do

Two caveats that make this the *fallback*, not the default:

* **Gate it to iOS.** On Android `:native` works (mob ships
`inet_gethost` as a `.so`); forcing pure-`:dns` there *breaks*
lookups. (And never reset the chain to include `:native` on iOS —
exec'ing `inet_gethost` there is *fatal*, it crashes the BEAM.)
* **Don't reset the chain to include `:native` on iOS** —
exec'ing `inet_gethost` there is *fatal*, it crashes the BEAM.
On physical Android `:native` is non-fatal but unreliable
(returns `:nxdomain` for hostnames that do resolve in-process);
`Mob.DNS.resolve/1` is the recommended path there too.
* **It can't resolve on cellular by default.** Its default
nameservers are public (Google / Cloudflare), which carriers
**commonly block** → `:nxdomain`. iOS exposes no reliable API to
Expand All @@ -87,11 +95,17 @@ defmodule Mob.DNS do
- **Doesn't help raw NIF networking.** If a third-party NIF calls
libc `getaddrinfo` itself, it never goes through BEAM's DNS
layer and doesn't need (or benefit from) this fix — it already
works on iOS. Only `:inet`-mediated lookups (which covers
almost all Elixir HTTP libraries) need our help.
- **iOS only effectively.** On Android and host (dev, macOS,
Linux) the NIF works but is unnecessary; BEAM's built-in path
is fine there.
works. Only `:inet`-mediated lookups (which covers almost all
Elixir HTTP libraries) need our help.
- **Background-app network restrictions still apply.** Android's
App Standby / battery optimizer can block *all* outbound network
from a backgrounded app, including TCP-by-IP — resolving a name
won't help if the OS is silently dropping the connect(). Use a
foreground service or keep the app foregrounded for sustained
DNS / connectivity.
- **Host dev (Mac, Linux) doesn't need this.** The NIF isn't
loaded off-device; callers get `{:error, :nif_not_loaded}` and
should fall back to BEAM's normal path (which works on dev).

## Errors

Expand Down
Loading