From 73c4cedc8aded3d92c9e0ac8f3f83375ceafe7d9 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Mon, 15 Jun 2026 13:53:08 +0200 Subject: [PATCH 01/21] Added logging of REST response containing host and negotiated HTTP protocol version. Fixed PubNubServerException handling to guard against a null response and fall back to the full exception message. --- src/PubNub/Endpoints/Endpoint.php | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/PubNub/Endpoints/Endpoint.php b/src/PubNub/Endpoints/Endpoint.php index efc38ba5..80dba331 100755 --- a/src/PubNub/Endpoints/Endpoint.php +++ b/src/PubNub/Endpoints/Endpoint.php @@ -356,6 +356,12 @@ protected function sendRequest(RequestInterface $request): PNEnvelope } else { $response = $client->sendRequest($request); } + $this->pubnub->getLogger()->debug(sprintf( + "%s response from %s negotiated HTTP/%s", + $this->getName(), + $request->getUri()->getHost(), + $response->getProtocolVersion() + )); $envelope = $this->parseResponse($response); } catch (NetworkExceptionInterface $exception) { return new PNEnvelope(null, $this->createStatus( @@ -368,10 +374,10 @@ protected function sendRequest(RequestInterface $request): PNEnvelope $statusCode = $exception->getCode(); $response = substr($exception->getMessage(), strpos($exception->getMessage(), "\n") + 1); $pnServerException = new PubNubServerException(); - if (is_callable([$exception, 'getResponse'])) { + if (is_callable([$exception, 'getResponse']) && $exception->getResponse() !== null) { $response = $exception->getResponse()->getBody()->getContents(); } else { - $response = substr($exception->getMessage(), strpos($exception->getMessage(), "\n") + 1); + $response = $exception->getMessage(); } $pnServerException->setRawBody($response); $pnServerException->setStatusCode($exception->getCode()); From c06517a8ad831bf34681d7dda10d8d091767aa40 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Mon, 15 Jun 2026 18:48:35 +0200 Subject: [PATCH 02/21] Disable HereNow tests because implicit heartbeat has been disabled on test keys. --- tests/e2e/HereNowE2eTest.php | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/e2e/HereNowE2eTest.php b/tests/e2e/HereNowE2eTest.php index c5118f24..3c8d1d2e 100644 --- a/tests/e2e/HereNowE2eTest.php +++ b/tests/e2e/HereNowE2eTest.php @@ -17,6 +17,15 @@ class HereNowE2eTest extends \PubNubTestCase { use PresenceTestHelper; + public function setUp(): void + { + parent::setUp(); + $this->markTestSkipped( + 'Implicit heartbeat has been disabled on the test keys, so these tests fail because the SDK has no ' + . 'Heartbeat support and no explicit Heartbeat is executed.' + ); + } + /** * Test pagination with multiple channels - verifies limit applies per-channel * This test requires real background clients to properly test the server-side pagination From 4ccc6ce32783a3d36b57bb7c71cd6bc56ab5c88f Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 13:34:36 +0200 Subject: [PATCH 03/21] Seems like server behaviour change on server side --- examples/Snippets.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Snippets.php b/examples/Snippets.php index 45e231ed..a4cb22bc 100644 --- a/examples/Snippets.php +++ b/examples/Snippets.php @@ -27,7 +27,7 @@ $pubnub->history() ->channel("my_channel") ->count(100) - ->start(-1) + ->start(1) ->end(13847168819178600) ->reverse(true) ->sync(); From 2a5988f775805680d89b512c19090fae05c0ee47 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 13:55:10 +0200 Subject: [PATCH 04/21] Add diagnostic --- .github/workflows/run-tests.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index bd760bb3..62f369e3 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -60,6 +60,28 @@ jobs: run: | composer self-update && composer --version composer install --prefer-dist + # TEMPORARY: diagnose intermittent `cURL error 28` timeouts to ps.pndsn.com + # from the self-hosted runner. Remove once the runner egress issue is resolved. + - name: Diagnose ps.pndsn.com connectivity + continue-on-error: true + run: | + echo "=== Runner public egress IP ===" + curl -s --max-time 15 https://api.ipify.org || echo "(could not determine egress IP)" + echo + echo "=== DNS resolution for ps.pndsn.com ===" + getent hosts ps.pndsn.com || nslookup ps.pndsn.com || true + echo + echo "=== Proxy-related environment ===" + env | grep -iE 'proxy' || echo "(no proxy env vars set)" + echo + echo "=== 10x probe of https://ps.pndsn.com/time/0 (catch intermittent hang) ===" + for i in $(seq 1 10); do + echo "--- attempt $i ---" + curl -sS -o /dev/null --max-time 15 \ + -w 'http_code=%{http_code} dns=%{time_namelookup}s connect=%{time_connect}s tls=%{time_appconnect}s total=%{time_total}s\n' \ + "https://ps.pndsn.com/time/0" \ + || echo "attempt $i FAILED (curl exit $?)" + done - name: Run unit tests run: composer test - name: Cancel workflow runs for commit on error From b6a3525b0fa1725c8ab63bb14eea94676fc54f01 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 14:19:11 +0200 Subject: [PATCH 05/21] Add diagnostic --- .github/workflows/run-tests.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 62f369e3..168c1fa3 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -82,6 +82,23 @@ jobs: "https://ps.pndsn.com/time/0" \ || echo "attempt $i FAILED (curl exit $?)" done + echo + echo "=== Burst probe: 200 requests, 20 concurrent (trip rate-limit/throttle under load) ===" + # If the idle probe above is fast but requests here stall ~10s / fail with + # exit 28, that points to server-side throttling under volume rather than a + # broken network path. http_code 429 would confirm explicit rate-limiting. + burst_log="$(mktemp)" + seq 1 200 | xargs -P 20 -I {} \ + curl -sS -o /dev/null --max-time 15 \ + -w 'http_code=%{http_code} total=%{time_total}s\n' \ + "https://ps.pndsn.com/time/0" \ + >> "$burst_log" 2>&1 || true + echo "--- HTTP status code distribution ---" + grep -oE 'http_code=[0-9]+' "$burst_log" | sort | uniq -c || true + echo "--- failures / non-200 / slow (>1s) responses ---" + grep -vE 'http_code=200 total=0' "$burst_log" | grep -E 'http_code=(000|4|5)|total=[1-9]' || echo "(none — all fast 200s)" + echo "--- slowest 5 responses ---" + sort -t= -k3 -rn "$burst_log" | head -5 || true - name: Run unit tests run: composer test - name: Cancel workflow runs for commit on error From f42b13b430a6f46e23e66c522794f19be5f5a6b7 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 15:35:01 +0200 Subject: [PATCH 06/21] Add diagnostic --- .github/workflows/run-tests.yml | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 168c1fa3..8a64f3b8 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -83,10 +83,9 @@ jobs: || echo "attempt $i FAILED (curl exit $?)" done echo - echo "=== Burst probe: 200 requests, 20 concurrent (trip rate-limit/throttle under load) ===" - # If the idle probe above is fast but requests here stall ~10s / fail with - # exit 28, that points to server-side throttling under volume rather than a - # broken network path. http_code 429 would confirm explicit rate-limiting. + echo "=== Burst probe A: 200x time/0, 20 concurrent (keyless baseline) ===" + # Cheap, unauthenticated endpoint. Establishes that the network path is fine + # under concurrent load. time/0 is NOT subject to per-key publish throttling. burst_log="$(mktemp)" seq 1 200 | xargs -P 20 -I {} \ curl -sS -o /dev/null --max-time 15 \ @@ -95,10 +94,25 @@ jobs: >> "$burst_log" 2>&1 || true echo "--- HTTP status code distribution ---" grep -oE 'http_code=[0-9]+' "$burst_log" | sort | uniq -c || true - echo "--- failures / non-200 / slow (>1s) responses ---" - grep -vE 'http_code=200 total=0' "$burst_log" | grep -E 'http_code=(000|4|5)|total=[1-9]' || echo "(none — all fast 200s)" - echo "--- slowest 5 responses ---" + echo "--- slowest 5 ---" sort -t= -k3 -rn "$burst_log" | head -5 || true + echo + echo "=== Burst probe B: 200x PUBLISH, 20 concurrent (the actual failing op) ===" + # Publishes to the real keyset — the same operation that times out in CI. + # If THIS bursts shows 429 / 000 / ~10s stalls while probe A stayed fast, + # the cause is per-key publish throttling, not the network or the runner. + pub_log="$(mktemp)" + seq 1 200 | xargs -P 20 -I {} \ + curl -sS -o /dev/null --max-time 15 \ + -w 'http_code=%{http_code} total=%{time_total}s\n' \ + "https://ps.pndsn.com/publish/$PUBLISH_KEY/$SUBSCRIBE_KEY/0/diag-burst/0/%22x%22?uuid=diag-burst" \ + >> "$pub_log" 2>&1 || true + echo "--- HTTP status code distribution ---" + grep -oE 'http_code=[0-9]+' "$pub_log" | sort | uniq -c || true + echo "--- non-200 / slow (>1s) responses ---" + grep -E 'http_code=(000|4|5)|total=[1-9]' "$pub_log" || echo "(none — all fast 200s)" + echo "--- slowest 5 ---" + sort -t= -k3 -rn "$pub_log" | head -5 || true - name: Run unit tests run: composer test - name: Cancel workflow runs for commit on error From 65f4a748a51716683318b46e71f5297f457319d2 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 16:35:14 +0200 Subject: [PATCH 07/21] Add diagnostic --- .github/workflows/run-tests.yml | 15 +++ scripts/diag-guzzle.php | 159 ++++++++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 scripts/diag-guzzle.php diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 8a64f3b8..24c8327b 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -113,6 +113,21 @@ jobs: grep -E 'http_code=(000|4|5)|total=[1-9]' "$pub_log" || echo "(none — all fast 200s)" echo "--- slowest 5 ---" sort -t= -k3 -rn "$pub_log" | head -5 || true + # TEMPORARY: PHP-level probe that REUSES one Guzzle client across many + # requests, exactly like the SDK does for a whole PHPUnit run. The curl + # probes above open a fresh connection each time and so cannot reproduce a + # stale/half-closed keep-alive socket — the prime suspect for + # "connected fine, then 0 bytes received after 10s" (cURL error 28). + # DIAG_DELAY_MS inserts an idle gap to provoke LB idle-timeout of pooled + # sockets. Remove once the timeout cause is confirmed/fixed. + - name: Diagnose Guzzle connection reuse (PHP) + continue-on-error: true + run: | + echo "### back-to-back (no idle gap) ###" + DIAG_ITERATIONS=200 DIAG_DELAY_MS=0 php scripts/diag-guzzle.php + echo + echo "### with 5s idle gaps (provokes stale-socket timeout) ###" + DIAG_ITERATIONS=30 DIAG_DELAY_MS=5000 php scripts/diag-guzzle.php - name: Run unit tests run: composer test - name: Cancel workflow runs for commit on error diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php new file mode 100644 index 00000000..f3c0b644 --- /dev/null +++ b/scripts/diag-guzzle.php @@ -0,0 +1,159 @@ +get($url, [ + RequestOptions::TIMEOUT => REQUEST_TIMEOUT, + RequestOptions::CONNECT_TIMEOUT => CONNECT_TIMEOUT, + 'version' => $version, + // num_connects is unavailable in some libcurl builds, so detect + // reuse via local_port: same client port == same pooled socket. + 'on_stats' => function ($stats) use (&$curlInfo) { + $h = $stats->getHandlerStats(); + $curlInfo = [ + 'port' => $h['local_port'] ?? null, + 'http' => $h['http_version'] ?? '?', // curl wire view (may differ from PSR-7) + 'ip' => $h['primary_ip'] ?? '?', + ]; + }, + ]); + $elapsed = (microtime(true) - $start) * 1000; + $times[] = $elapsed; + + $port = $curlInfo['port'] ?? null; + if ($port !== null) { + if (isset($seenPorts[$port])) { + $reuseCount++; // rode a previously-opened (pooled) socket + } + $seenPorts[$port] = true; + } + $hv = (string) ($curlInfo['http'] ?? '?'); + $httpVersions[$hv] = ($httpVersions[$hv] ?? 0) + 1; + } catch (\Throwable $e) { + $elapsed = (microtime(true) - $start) * 1000; + $failures[] = sprintf( + " #%d after %dms: %s", + $i, + (int) $elapsed, + $e->getMessage() + ); + } + + if ($delayMs > 0) { + usleep($delayMs * 1000); + } + } + + $ok = count($times); + $fail = count($failures); + sort($times); + $p50 = $times ? $times[(int) (count($times) * 0.50)] ?? end($times) : 0; + $p99 = $times ? $times[(int) (count($times) * 0.99)] ?? end($times) : 0; + $max = $times ? max($times) : 0; + + printf( + "\n=== %s (version=%s, %d iterations, delay=%dms) ===\n", + $label, + $version, + $iterations, + $delayMs + ); + $httpDist = []; + foreach ($httpVersions as $v => $n) { + $httpDist[] = 'http/' . $v . ' x' . $n; + } + printf(" ok=%d fail=%d reused-socket=%d distinct-sockets=%d\n", $ok, $fail, $reuseCount, count($seenPorts)); + printf(" curl-wire-protocol: %s\n", implode(' ', $httpDist) ?: '(n/a)'); + printf(" latency p50=%dms p99=%dms max=%dms\n", (int) $p50, (int) $p99, (int) $max); + if ($failures) { + echo " FAILURES:\n" . implode("\n", $failures) . "\n"; + } else { + echo " (no failures)\n"; + } +} + +$timeUrl = "https://ps.pndsn.com/time/0"; + +echo "######## PHP Guzzle reuse diagnostic ########\n"; +echo "Guzzle reuses one client across the whole run (like the SDK).\n"; + +// Baseline: keyless, both protocols. +probe("time/0 baseline", $timeUrl, '2', $iterations, $delayMs); +probe("time/0 baseline", $timeUrl, '1.1', $iterations, $delayMs); + +// The actual failing op: publish, both protocols. +if ($publishKey !== '' && $subscribeKey !== '') { + $publishUrl = sprintf( + "https://ps.pndsn.com/publish/%s/%s/0/diag-guzzle/0/%%22x%%22?uuid=diag-guzzle", + $publishKey, + $subscribeKey + ); + probe("publish (SDK-forced h2)", $publishUrl, '2', $iterations, $delayMs); + probe("publish (h1.1)", $publishUrl, '1.1', $iterations, $delayMs); +} + +echo "\n######## done ########\n"; From 6a4819e9bf7b89bf2086caade31ee1de2fc5c6b7 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 16:42:25 +0200 Subject: [PATCH 08/21] Add diagnostic --- scripts/diag-guzzle.php | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index f3c0b644..0bb3ddbf 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -66,17 +66,19 @@ function probe(string $label, string $url, string $version, int $iterations, int $start = microtime(true); $curlInfo = null; try { - $client->get($url, [ + $response = $client->get($url, [ RequestOptions::TIMEOUT => REQUEST_TIMEOUT, RequestOptions::CONNECT_TIMEOUT => CONNECT_TIMEOUT, 'version' => $version, - // num_connects is unavailable in some libcurl builds, so detect - // reuse via local_port: same client port == same pooled socket. + // Detect socket reuse via local_port (num_connects is absent in + // some libcurl builds). NOTE: do NOT trust the stat's + // 'http_version' field — it echoes the REQUESTED version, not the + // negotiated one. The PSR-7 response below reports the real wire + // protocol (ps.pndsn.com is HTTP/1.1 only — no h2 over ALPN). 'on_stats' => function ($stats) use (&$curlInfo) { $h = $stats->getHandlerStats(); $curlInfo = [ 'port' => $h['local_port'] ?? null, - 'http' => $h['http_version'] ?? '?', // curl wire view (may differ from PSR-7) 'ip' => $h['primary_ip'] ?? '?', ]; }, @@ -91,7 +93,8 @@ function probe(string $label, string $url, string $version, int $iterations, int } $seenPorts[$port] = true; } - $hv = (string) ($curlInfo['http'] ?? '?'); + // Real negotiated protocol, straight from the PSR-7 response. + $hv = $response->getProtocolVersion(); $httpVersions[$hv] = ($httpVersions[$hv] ?? 0) + 1; } catch (\Throwable $e) { $elapsed = (microtime(true) - $start) * 1000; @@ -127,7 +130,7 @@ function probe(string $label, string $url, string $version, int $iterations, int $httpDist[] = 'http/' . $v . ' x' . $n; } printf(" ok=%d fail=%d reused-socket=%d distinct-sockets=%d\n", $ok, $fail, $reuseCount, count($seenPorts)); - printf(" curl-wire-protocol: %s\n", implode(' ', $httpDist) ?: '(n/a)'); + printf(" negotiated-protocol (PSR-7): %s\n", implode(' ', $httpDist) ?: '(n/a)'); printf(" latency p50=%dms p99=%dms max=%dms\n", (int) $p50, (int) $p99, (int) $max); if ($failures) { echo " FAILURES:\n" . implode("\n", $failures) . "\n"; From 70f1f33c8ad2c8d90f4ddc7c7f12e55ca77cf924 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 16:48:58 +0200 Subject: [PATCH 09/21] Add diagnostic --- .github/workflows/run-tests.yml | 8 ++++---- scripts/diag-guzzle.php | 16 +++++++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 24c8327b..f2ea5495 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -123,11 +123,11 @@ jobs: - name: Diagnose Guzzle connection reuse (PHP) continue-on-error: true run: | - echo "### back-to-back (no idle gap) ###" + # Reuses one Guzzle client like the SDK. The keyless time/0 baseline + # should be 0 failures; the publish run reproduces the cURL-28 timeout. + # The built-in spacing sweep then tells us whether spacing publishes + # further apart makes the failures disappear (== server rate limit). DIAG_ITERATIONS=200 DIAG_DELAY_MS=0 php scripts/diag-guzzle.php - echo - echo "### with 5s idle gaps (provokes stale-socket timeout) ###" - DIAG_ITERATIONS=30 DIAG_DELAY_MS=5000 php scripts/diag-guzzle.php - name: Run unit tests run: composer test - name: Cancel workflow runs for commit on error diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index 0bb3ddbf..40b67d36 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -148,15 +148,25 @@ function probe(string $label, string $url, string $version, int $iterations, int probe("time/0 baseline", $timeUrl, '2', $iterations, $delayMs); probe("time/0 baseline", $timeUrl, '1.1', $iterations, $delayMs); -// The actual failing op: publish, both protocols. +// The actual failing op: publish. if ($publishKey !== '' && $subscribeKey !== '') { $publishUrl = sprintf( "https://ps.pndsn.com/publish/%s/%s/0/diag-guzzle/0/%%22x%%22?uuid=diag-guzzle", $publishKey, $subscribeKey ); - probe("publish (SDK-forced h2)", $publishUrl, '2', $iterations, $delayMs); - probe("publish (h1.1)", $publishUrl, '1.1', $iterations, $delayMs); + + // Primary run honoring the env-provided delay. + probe("publish", $publishUrl, '1.1', $iterations, $delayMs); + + // Spacing sweep: distinguish a per-second RATE limit from a per-request / + // idle issue. If failures vanish as we space publishes further apart, the + // cap is rate-based (quota refills over time). If they persist regardless of + // spacing, it is NOT a simple rate limit. + echo "\n######## spacing sweep (publish, 30 req each) ########\n"; + foreach ([0, 200, 500, 1000, 2000] as $gapMs) { + probe("publish gap=" . $gapMs . "ms", $publishUrl, '1.1', 30, $gapMs); + } } echo "\n######## done ########\n"; From 785eb754abc1a0b71ed9364f6fc4e5dadcb4d099 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Tue, 16 Jun 2026 16:54:27 +0200 Subject: [PATCH 10/21] Add diagnostic --- scripts/diag-guzzle.php | 58 +++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index 40b67d36..fbef9e79 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -57,10 +57,13 @@ function probe(string $label, string $url, string $version, int $iterations, int $client = new GuzzleHttpClient(); $times = []; - $failures = []; + $failures = []; + $failedIndices = []; $reuseCount = 0; $seenPorts = []; // local TCP ports seen — a repeat == a reused socket - $httpVersions = []; // curl-level wire protocol distribution + $httpVersions = []; // negotiated protocol distribution (from PSR-7) + $ipOk = []; // backend IP => count of successful requests + $ipFail = []; // backend IP => count of timed-out requests for ($i = 0; $i < $iterations; $i++) { $start = microtime(true); @@ -96,14 +99,20 @@ function probe(string $label, string $url, string $version, int $iterations, int // Real negotiated protocol, straight from the PSR-7 response. $hv = $response->getProtocolVersion(); $httpVersions[$hv] = ($httpVersions[$hv] ?? 0) + 1; + + $ip = $curlInfo['ip'] ?? '?'; + $ipOk[$ip] = ($ipOk[$ip] ?? 0) + 1; } catch (\Throwable $e) { $elapsed = (microtime(true) - $start) * 1000; - $failures[] = sprintf( - " #%d after %dms: %s", - $i, - (int) $elapsed, - $e->getMessage() - ); + $failedIndices[] = $i; + // curl populates primary_ip once connected, so even a post-connect + // timeout tells us WHICH backend black-holed the request. + $ip = $curlInfo['ip'] ?? '?'; + $ipFail[$ip] = ($ipFail[$ip] ?? 0) + 1; + // Keep one representative full message; the rest are summarized by index. + if (count($failures) < 1) { + $failures[] = sprintf(" e.g. #%d after %dms (ip=%s): %s", $i, (int) $elapsed, $ip, $e->getMessage()); + } } if ($delayMs > 0) { @@ -132,8 +141,20 @@ function probe(string $label, string $url, string $version, int $iterations, int printf(" ok=%d fail=%d reused-socket=%d distinct-sockets=%d\n", $ok, $fail, $reuseCount, count($seenPorts)); printf(" negotiated-protocol (PSR-7): %s\n", implode(' ', $httpDist) ?: '(n/a)'); printf(" latency p50=%dms p99=%dms max=%dms\n", (int) $p50, (int) $p99, (int) $max); - if ($failures) { - echo " FAILURES:\n" . implode("\n", $failures) . "\n"; + // Per-backend breakdown: if the failures concentrate on one (or few) IPs + // while other IPs only ever succeed, the cause is a wedged backend node + // behind the LB — not a rate limit (which would return fast 429s, not 10s + // 0-byte hangs). + $allIps = array_unique(array_merge(array_keys($ipOk), array_keys($ipFail))); + sort($allIps); + echo " per-backend (ok/fail):"; + foreach ($allIps as $ip) { + printf(" %s=%d/%d", $ip, $ipOk[$ip] ?? 0, $ipFail[$ip] ?? 0); + } + echo "\n"; + if (!empty($failedIndices)) { + echo " failed-indices: " . implode(',', $failedIndices) . "\n"; + echo implode("\n", $failures) . "\n"; } else { echo " (no failures)\n"; } @@ -156,15 +177,18 @@ function probe(string $label, string $url, string $version, int $iterations, int $subscribeKey ); - // Primary run honoring the env-provided delay. + // The failures are 10s 0-byte HANGS, not 429s, so this is NOT a rate limit. + // The leading hypothesis is a wedged backend behind the publish LB: every + // ~10th request is routed to a node that accepts the connection but never + // responds. The per-backend (ok/fail) line in each run is the key evidence — + // watch whether fails concentrate on one IP. probe("publish", $publishUrl, '1.1', $iterations, $delayMs); - // Spacing sweep: distinguish a per-second RATE limit from a per-request / - // idle issue. If failures vanish as we space publishes further apart, the - // cap is rate-based (quota refills over time). If they persist regardless of - // spacing, it is NOT a simple rate limit. - echo "\n######## spacing sweep (publish, 30 req each) ########\n"; - foreach ([0, 200, 500, 1000, 2000] as $gapMs) { + // Spacing check: if it were a time/rate effect, gaps would change the rate + // of failure. For a per-request routing fault, the ~1-in-10 cadence persists + // regardless of spacing. + echo "\n######## spacing check (publish, 30 req each) ########\n"; + foreach ([0, 1000] as $gapMs) { probe("publish gap=" . $gapMs . "ms", $publishUrl, '1.1', 30, $gapMs); } } From bb7621d09bd18a67ad20b35d73c0e4b62b035cda Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Wed, 17 Jun 2026 13:02:22 +0200 Subject: [PATCH 11/21] Revert change to Endpoint --- src/PubNub/Endpoints/Endpoint.php | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/PubNub/Endpoints/Endpoint.php b/src/PubNub/Endpoints/Endpoint.php index 80dba331..f6dcbb53 100755 --- a/src/PubNub/Endpoints/Endpoint.php +++ b/src/PubNub/Endpoints/Endpoint.php @@ -356,12 +356,12 @@ protected function sendRequest(RequestInterface $request): PNEnvelope } else { $response = $client->sendRequest($request); } - $this->pubnub->getLogger()->debug(sprintf( - "%s response from %s negotiated HTTP/%s", - $this->getName(), - $request->getUri()->getHost(), - $response->getProtocolVersion() - )); +// $this->pubnub->getLogger()->debug(sprintf( +// "%s response from %s negotiated HTTP/%s", +// $this->getName(), +// $request->getUri()->getHost(), +// $response->getProtocolVersion() +// )); $envelope = $this->parseResponse($response); } catch (NetworkExceptionInterface $exception) { return new PNEnvelope(null, $this->createStatus( @@ -374,10 +374,10 @@ protected function sendRequest(RequestInterface $request): PNEnvelope $statusCode = $exception->getCode(); $response = substr($exception->getMessage(), strpos($exception->getMessage(), "\n") + 1); $pnServerException = new PubNubServerException(); - if (is_callable([$exception, 'getResponse']) && $exception->getResponse() !== null) { + if (is_callable([$exception, 'getResponse'])) { $response = $exception->getResponse()->getBody()->getContents(); } else { - $response = $exception->getMessage(); + $response = substr($exception->getMessage(), strpos($exception->getMessage(), "\n") + 1); } $pnServerException->setRawBody($response); $pnServerException->setStatusCode($exception->getCode()); From 092b25bb988d2578945e7adef99d199775c57c24 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Wed, 17 Jun 2026 13:10:57 +0200 Subject: [PATCH 12/21] Diagnostic --- scripts/diag-guzzle.php | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index fbef9e79..84ae296b 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -178,11 +178,13 @@ function probe(string $label, string $url, string $version, int $iterations, int ); // The failures are 10s 0-byte HANGS, not 429s, so this is NOT a rate limit. - // The leading hypothesis is a wedged backend behind the publish LB: every - // ~10th request is routed to a node that accepts the connection but never - // responds. The per-backend (ok/fail) line in each run is the key evidence — - // watch whether fails concentrate on one IP. - probe("publish", $publishUrl, '1.1', $iterations, $delayMs); + // A raw `curl --http1.1` CLI reusing one connection runs 200/200 clean, so + // the hang is specific to the PHP/Guzzle path. The SDK forces 'version'=>'2' + // (Endpoint::requestOptions line ~501) while curl ran 1.1 — so probe BOTH + // here to isolate whether the forced HTTP/2 option is the trigger. + // Watch: does version=1.1 go clean while version=2 hangs? + probe("publish version=2", $publishUrl, '2', $iterations, $delayMs); + probe("publish version=1.1", $publishUrl, '1.1', $iterations, $delayMs); // Spacing check: if it were a time/rate effect, gaps would change the rate // of failure. For a per-request routing fault, the ~1-in-10 cadence persists From eeebc5a273ecce6e83b21b7d6d8e5e536e932ce4 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Wed, 17 Jun 2026 18:10:56 +0200 Subject: [PATCH 13/21] StreamHandler --- scripts/diag-guzzle.php | 55 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index 84ae296b..e3f761ad 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -17,6 +17,13 @@ * - version=2 : what the SDK forces today (Endpoint::requestOptions) * - version=1.1 : what ps.pndsn.com actually speaks * + * It also runs a handler swap (publish, version=1.1) through Guzzle's curl + * handler vs its StreamHandler (PHP stream wrapper, no libcurl). The raw curl + * CLI control uses a different libcurl+TLS stack than PHP, so it can't exonerate + * PHP's curl handler; this swap stays inside PHP and changes only the handler. + * If the hang vanishes on the stream handler, the fault is PHP's curl handler / + * its libcurl build; if it persists on both, the cause is above libcurl. + * * Usage (from project root, keys exported as in the CI workflow): * php scripts/diag-guzzle.php * @@ -31,6 +38,9 @@ require __DIR__ . '/../vendor/autoload.php'; use GuzzleHttp\Client as GuzzleHttpClient; +use GuzzleHttp\HandlerStack; +use GuzzleHttp\Handler\CurlHandler; +use GuzzleHttp\Handler\StreamHandler; use GuzzleHttp\RequestOptions; $publishKey = getenv('PUBLISH_KEY') ?: ''; @@ -50,11 +60,29 @@ * Fire $iterations requests at $url through ONE reused client and report. * * @param string $version '2' or '1.1' — passed to Guzzle's 'version' option. + * @param string $handler 'curl' (default, the SDK transport) or 'stream' + * (PHP stream wrapper — no libcurl). Swapping the + * handler isolates whether the hang lives in PHP's + * curl handler / its libcurl+OpenSSL build: if the + * stream handler runs clean where curl hangs, the + * fault is in the curl path, not the server. */ -function probe(string $label, string $url, string $version, int $iterations, int $delayMs): void -{ - // One client for the whole loop == SDK behavior == cURL keep-alive pool reuse. - $client = new GuzzleHttpClient(); +function probe( + string $label, + string $url, + string $version, + int $iterations, + int $delayMs, + string $handler = 'curl' +): void { + // One client for the whole loop == SDK behavior == keep-alive pool reuse. + if ($handler === 'stream') { + // StreamHandler uses PHP's HTTP stream wrapper (no libcurl at all). + $client = new GuzzleHttpClient(['handler' => HandlerStack::create(new StreamHandler())]); + } else { + // Explicit CurlHandler == what Guzzle picks by default == the SDK path. + $client = new GuzzleHttpClient(['handler' => HandlerStack::create(new CurlHandler())]); + } $times = []; $failures = []; @@ -128,12 +156,18 @@ function probe(string $label, string $url, string $version, int $iterations, int $max = $times ? max($times) : 0; printf( - "\n=== %s (version=%s, %d iterations, delay=%dms) ===\n", + "\n=== %s (handler=%s, version=%s, %d iterations, delay=%dms) ===\n", $label, + $handler, $version, $iterations, $delayMs ); + if ($handler === 'stream') { + // StreamHandler exposes no libcurl stats, so socket-reuse and per-backend + // IP columns below will read 0 / '?' — that's expected, not a finding. + echo " (note: stream handler reports no socket/IP stats — ignore those columns)\n"; + } $httpDist = []; foreach ($httpVersions as $v => $n) { $httpDist[] = 'http/' . $v . ' x' . $n; @@ -186,6 +220,17 @@ function probe(string $label, string $url, string $version, int $iterations, int probe("publish version=2", $publishUrl, '2', $iterations, $delayMs); probe("publish version=1.1", $publishUrl, '1.1', $iterations, $delayMs); + // Handler swap: run the SAME publish loop through the curl handler and then + // through the StreamHandler (PHP stream wrapper, no libcurl). The curl CLI + // control runs clean but uses a DIFFERENT libcurl+TLS stack than PHP, so it + // doesn't exonerate PHP's curl handler. This swap stays inside PHP and + // changes only the handler: if the every-10th hang VANISHES on the stream + // handler, it pins the fault to PHP's curl handler / its libcurl+OpenSSL + // build; if it persists on both, the cause is above libcurl (server/network). + echo "\n######## handler swap (publish, version=1.1) ########\n"; + probe("publish handler=curl", $publishUrl, '1.1', $iterations, $delayMs, 'curl'); + probe("publish handler=stream", $publishUrl, '1.1', $iterations, $delayMs, 'stream'); + // Spacing check: if it were a time/rate effect, gaps would change the rate // of failure. For a per-request routing fault, the ~1-in-10 cadence persists // regardless of spacing. From 24efecd55196ab3fe5d9d8a491e404fa1243336c Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Wed, 17 Jun 2026 18:52:00 +0200 Subject: [PATCH 14/21] LB check --- scripts/diag-guzzle.php | 103 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index e3f761ad..ac506362 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -27,12 +27,28 @@ * Usage (from project root, keys exported as in the CI workflow): * php scripts/diag-guzzle.php * + * Finally it runs a VERBOSE capture (publish, curl handler, ~12 reused requests) + * that dumps libcurl's wire-level trace per request — the `< Connection:`, + * `< Keep-Alive:`, and `* Connection #N ... left intact / will close` lines — + * so we can see EXACTLY what the load balancer sends on the connection it later + * black-holes. The suite ran clean for months until the LB moved nginx -> + * Pingora; the prime suspect is now a server-side keep-alive change (e.g. a low + * `keepalive_requests` cap, or retiring the connection without a clean + * `Connection: close`) that PHP's curl reuse trips over on the ~10th request. + * Watch request #8/#9 (the retirement boundary) for the header the SDK ignores. + * + * Usage (from project root, keys exported as in the CI workflow): + * php scripts/diag-guzzle.php + * * Env: * PUBLISH_KEY, SUBSCRIBE_KEY (required for the publish probe) * DIAG_ITERATIONS (optional, default 200) * DIAG_DELAY_MS (optional, idle gap between requests in ms, * default 0; try 5000 to provoke idle-timeout * of pooled sockets) + * DIAG_VERBOSE_ITERS (optional, default 12 — how many requests the + * verbose capture fires to span the ~10th + * retirement boundary) */ require __DIR__ . '/../vendor/autoload.php'; @@ -47,6 +63,7 @@ $subscribeKey = getenv('SUBSCRIBE_KEY') ?: ''; $iterations = (int) (getenv('DIAG_ITERATIONS') ?: 200); $delayMs = (int) (getenv('DIAG_DELAY_MS') ?: 0); +$verboseIters = (int) (getenv('DIAG_VERBOSE_ITERS') ?: 12); if ($publishKey === '' || $subscribeKey === '') { fwrite(STDERR, "WARNING: PUBLISH_KEY/SUBSCRIBE_KEY not set — publish probe will be skipped.\n"); @@ -194,6 +211,87 @@ function probe( } } +/** + * Fire $iterations publishes over ONE reused curl connection with libcurl + * VERBOSE tracing on, and print, per request, only the lines that reveal how the + * load balancer manages the keep-alive connection: + * - `< Connection:` / `< Keep-Alive:` response headers (does Pingora announce a + * request cap or signal close?) + * - `* Connection #N ... left intact` vs `* Closing connection` (when libcurl + * decides to drop / retire the pooled socket) + * - `* Re-using existing connection` (proof the request rode a pooled socket) + * - any `* ... timed out` / error line on the hung request + * + * This is the smoking-gun capture for the nginx->Pingora hypothesis: watch the + * boundary request (~#8/#9) for a server header the SDK's pooled handle ignores, + * or a silent close that leaves the next reused request writing into a dead + * socket (-> 0 bytes -> 10s timeout). + * + * Uses Guzzle's raw 'curl' option passthrough (CurlHandler only) to set + * CURLOPT_VERBOSE + CURLOPT_STDERR at a temp stream we then read back. + */ +function captureVerbose(string $url, string $version, int $iterations): void +{ + printf("\n######## verbose wire capture (publish, curl, version=%s, %d reused requests) ########\n", $version, $iterations); + echo "Watch the ~#8/#9 boundary: what does the LB send right before the hang?\n"; + + // One client == one keep-alive pool, so requests reuse the same socket. + $client = new GuzzleHttpClient(['handler' => HandlerStack::create(new CurlHandler())]); + + // Only the lines worth reading from libcurl's (very chatty) verbose trace. + $keep = ['Connection', 'Keep-Alive', 'Re-using', 'Closing connection', 'left intact', + 'Connected to', 'timed out', 'Operation timed out', 'Empty reply', 'Connection died']; + + for ($i = 0; $i < $iterations; $i++) { + $trace = fopen('php://temp', 'w+'); + $start = microtime(true); + $status = '?'; + $err = null; + try { + $resp = $client->get($url, [ + RequestOptions::TIMEOUT => REQUEST_TIMEOUT, + RequestOptions::CONNECT_TIMEOUT => CONNECT_TIMEOUT, + 'version' => $version, + // CurlHandler passes these straight to the easy handle. + 'curl' => [ + CURLOPT_VERBOSE => true, + CURLOPT_STDERR => $trace, + ], + ]); + $status = (string) $resp->getStatusCode(); + } catch (\Throwable $e) { + $err = $e->getMessage(); + } + $elapsed = (int) ((microtime(true) - $start) * 1000); + + // Pull the interesting verbose lines out of the trace stream. + rewind($trace); + $raw = stream_get_contents($trace) ?: ''; + fclose($trace); + $lines = []; + foreach (explode("\n", $raw) as $line) { + $line = rtrim($line); + if ($line === '') { + continue; + } + // The TLS handshake line matches "Connection" but is per-request + // noise, not keep-alive signal — drop it. + if (stripos($line, 'SSL connection using') !== false) { + continue; + } + foreach ($keep as $needle) { + if (stripos($line, $needle) !== false) { + $lines[] = ' ' . $line; + break; + } + } + } + + printf(" #%d status=%s %dms%s\n", $i, $status, $elapsed, $err !== null ? " ERROR: $err" : ''); + echo $lines ? implode("\n", $lines) . "\n" : " (no keep-alive/connection lines in trace)\n"; + } +} + $timeUrl = "https://ps.pndsn.com/time/0"; echo "######## PHP Guzzle reuse diagnostic ########\n"; @@ -231,6 +329,11 @@ function probe( probe("publish handler=curl", $publishUrl, '1.1', $iterations, $delayMs, 'curl'); probe("publish handler=stream", $publishUrl, '1.1', $iterations, $delayMs, 'stream'); + // Verbose wire capture across the every-10th retirement boundary. This is the + // hand-to-ops evidence for the nginx->Pingora keep-alive hypothesis: it shows + // exactly what the LB sends on the connection right before it's black-holed. + captureVerbose($publishUrl, '1.1', $verboseIters); + // Spacing check: if it were a time/rate effect, gaps would change the rate // of failure. For a per-request routing fault, the ~1-in-10 cadence persists // regardless of spacing. From 35994573427ef85cb4546b2c62b341b4c53031f6 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 10:27:25 +0200 Subject: [PATCH 15/21] Reuse vs new-connection --- scripts/diag-guzzle.php | 56 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 5 deletions(-) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index ac506362..ca04728b 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -230,10 +230,24 @@ function probe( * Uses Guzzle's raw 'curl' option passthrough (CurlHandler only) to set * CURLOPT_VERBOSE + CURLOPT_STDERR at a temp stream we then read back. */ -function captureVerbose(string $url, string $version, int $iterations): void +function captureVerbose(string $url, string $version, int $iterations, int $gapMs = 0): void { - printf("\n######## verbose wire capture (publish, curl, version=%s, %d reused requests) ########\n", $version, $iterations); - echo "Watch the ~#8/#9 boundary: what does the LB send right before the hang?\n"; + printf( + "\n######## verbose wire capture (publish, curl, version=%s, %d requests, gap=%dms) ########\n", + $version, + $iterations, + $gapMs + ); + if ($gapMs > 0) { + // The point of the gap run: at gap=0 the every-10th request hangs; at + // gap=1000ms the loop is clean. The open question this answers is WHY — + // does the boundary request (~#9/#10) RE-USE the same socket (so the gap + // just keeps it healthy), or does the client CONNECT a NEW socket (so the + // gap lets a silent drop become visible and the client reconnects cleanly)? + echo "Question: at this gap, does the ~#9/#10 boundary REUSE the socket or open a NEW connection?\n"; + } else { + echo "Watch the ~#8/#9 boundary: what does the LB send right before the hang?\n"; + } // One client == one keep-alive pool, so requests reuse the same socket. $client = new GuzzleHttpClient(['handler' => HandlerStack::create(new CurlHandler())]); @@ -242,6 +256,9 @@ function captureVerbose(string $url, string $version, int $iterations): void $keep = ['Connection', 'Keep-Alive', 'Re-using', 'Closing connection', 'left intact', 'Connected to', 'timed out', 'Operation timed out', 'Empty reply', 'Connection died']; + $reused = 0; // requests that rode a pooled socket ("Re-using existing") + $newConns = 0; // requests that opened a fresh socket ("Connected to") + for ($i = 0; $i < $iterations; $i++) { $trace = fopen('php://temp', 'w+'); $start = microtime(true); @@ -268,6 +285,21 @@ function captureVerbose(string $url, string $version, int $iterations): void rewind($trace); $raw = stream_get_contents($trace) ?: ''; fclose($trace); + + // Classify reuse-vs-reconnect from the raw trace (independent of $keep + // filtering): "Re-using existing" == pooled socket; "Connected to" == a + // fresh TCP connect this request. This is the direct evidence for the gap + // question above. + $isReuse = stripos($raw, 'Re-using existing') !== false; + $isNew = stripos($raw, 'Connected to') !== false; + if ($isReuse) { + $reused++; + } + if ($isNew) { + $newConns++; + } + $conn = $isReuse ? 'REUSE' : ($isNew ? 'NEW-CONN' : '?'); + $lines = []; foreach (explode("\n", $raw) as $line) { $line = rtrim($line); @@ -287,9 +319,15 @@ function captureVerbose(string $url, string $version, int $iterations): void } } - printf(" #%d status=%s %dms%s\n", $i, $status, $elapsed, $err !== null ? " ERROR: $err" : ''); + printf(" #%d [%s] status=%s %dms%s\n", $i, $conn, $status, $elapsed, $err !== null ? " ERROR: $err" : ''); echo $lines ? implode("\n", $lines) . "\n" : " (no keep-alive/connection lines in trace)\n"; + + if ($gapMs > 0) { + usleep($gapMs * 1000); + } } + + printf(" -- summary: reused=%d new-connections=%d (of %d) --\n", $reused, $newConns, $iterations); } $timeUrl = "https://ps.pndsn.com/time/0"; @@ -332,7 +370,15 @@ function captureVerbose(string $url, string $version, int $iterations): void // Verbose wire capture across the every-10th retirement boundary. This is the // hand-to-ops evidence for the nginx->Pingora keep-alive hypothesis: it shows // exactly what the LB sends on the connection right before it's black-holed. - captureVerbose($publishUrl, '1.1', $verboseIters); + captureVerbose($publishUrl, '1.1', $verboseIters, 0); + + // Same capture WITH a 1000ms gap (the spacing that makes the loop clean). The + // per-request REUSE/NEW-CONN tag + summary answers: does the gap keep ONE + // socket alive across the boundary (all REUSE), or does it let a silent drop + // surface so the client reconnects (a NEW-CONN at ~#9/#10)? We currently have + // NO direct evidence either way — distinct-sockets=1 can't tell reuse from a + // same-port reconnect. This settles it. + captureVerbose($publishUrl, '1.1', $verboseIters, 1000); // Spacing check: if it were a time/rate effect, gaps would change the rate // of failure. For a per-request routing fault, the ~1-in-10 cadence persists From d658b9a3b9e9803de0f8655513790043316b47d6 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 11:55:28 +0200 Subject: [PATCH 16/21] run diag-curl.sh --- .github/workflows/run-tests.yml | 12 ++++ scripts/diag-curl.sh | 98 +++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100755 scripts/diag-curl.sh diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index f2ea5495..c3a18dc5 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -113,6 +113,18 @@ jobs: grep -E 'http_code=(000|4|5)|total=[1-9]' "$pub_log" || echo "(none — all fast 200s)" echo "--- slowest 5 ---" sort -t= -k3 -rn "$pub_log" | head -5 || true + # TEMPORARY: raw `curl` CLI control that reuses ONE keep-alive connection + # across many publishes (one curl process, N urls via -K). This is the + # PHP-free reproducer for the server team: if it hangs every ~10th here on + # the dense CI runner, the every-10th black-hole is reproducible with no + # PHP/Guzzle/SDK involvement at all. Locally (high RTT) it ran 200/200 + # clean — this step tests it on the same runner/pool that fails for Guzzle. + # Remove once the timeout cause is confirmed/fixed. + - name: Diagnose connection reuse (raw curl CLI) + continue-on-error: true + run: | + curl --version | head -1 + DIAG_ITERATIONS=200 ./scripts/diag-curl.sh # TEMPORARY: PHP-level probe that REUSES one Guzzle client across many # requests, exactly like the SDK does for a whole PHPUnit run. The curl # probes above open a fresh connection each time and so cannot reproduce a diff --git a/scripts/diag-curl.sh b/scripts/diag-curl.sh new file mode 100755 index 00000000..40f212d6 --- /dev/null +++ b/scripts/diag-curl.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash +# +# TEMPORARY diagnostic: reproduce the "every ~10th publish hangs 10s / 0 bytes" +# issue using the raw `curl` CLI instead of PHP/Guzzle. +# +# This is a CONTROL for scripts/diag-guzzle.php. The only way to reproduce the +# hang is to REUSE ONE keep-alive connection across many requests. A fresh curl +# per request opens a new socket every time and never reproduces it -- that is +# why the earlier per-request curl probes looked healthy. +# +# curl reuses the connection when you give it MANY URLs in ONE invocation, so +# this fires all N publishes down a single connection (like the SDK does). +# +# Usage (from project root): +# set -a; source .env.dev; set +a +# ./scripts/diag-curl.sh +# +# Env: +# PUBLISH_KEY, SUBSCRIBE_KEY (required) +# DIAG_ITERATIONS (optional, default 200) +# +# Output per request line: s +# - http_code 000 = no response (the hang / cURL error 28) +# - repeated local_port = same reused socket (keep-alive working) +# - a new local_port = curl had to reopen after a failure + +set -u + +PUBLISH_KEY="${PUBLISH_KEY:-}" +SUBSCRIBE_KEY="${SUBSCRIBE_KEY:-}" +ITER="${DIAG_ITERATIONS:-200}" + +if [[ -z "$PUBLISH_KEY" || -z "$SUBSCRIBE_KEY" ]]; then + echo "ERROR: set PUBLISH_KEY and SUBSCRIBE_KEY (e.g. 'set -a; source .env.dev; set +a')" >&2 + exit 1 +fi + +URL="https://ps.pndsn.com/publish/${PUBLISH_KEY}/${SUBSCRIBE_KEY}/0/diag-curl/0/%22x%22?uuid=diag-curl" + +# Build a curl-config file with N url entries -> ONE curl process, ONE connection. +CFG="$(mktemp)" +trap 'rm -f "$CFG"' EXIT +for ((i = 0; i < ITER; i++)); do + printf 'url = "%s"\n' "$URL" >> "$CFG" +done + +run_probe() { + local label="$1"; shift + echo "" + echo "=== ${label} (${ITER} iterations) ===" + + # -K $CFG : all N URLs, reusing the connection + # --max-time 12 : per-request ceiling (>10s server timeout so the hang shows as 000) + # -s -o /dev/null : discard bodies + # -w ...\n : one machine-readable line per request + # Fields: http_code | time_total | local_port | remote_ip + curl "$@" -K "$CFG" \ + --max-time 12 \ + -s -o /dev/null \ + -w '%{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ + | awk ' + { + idx = NR - 1 + code = $1; t = $2; port = $3; ip = $4; nconn = $5 + printf " #%-3d code=%s time=%ss port=%s ip=%s conns=%s\n", idx, code, t, port, ip, nconn + total++ + last_nconn = nconn + 0 # cumulative; final value = total TCP connections opened + ipseen[ip]++ + if (code == "000" || code == "" ) { # hang / no response + fail++; failidx = failidx (failidx=="" ? "" : ",") idx + ipfail[ip]++ + } else { + ok++; ipok[ip]++ + } + if (port != lastport && NR > 1) sockets++ + portseen[port]=1 + lastport = port + } + END { + ndist = 0; for (p in portseen) ndist++ + printf "\n ok=%d fail=%d distinct-sockets=%d tcp-connects-opened=%d (total=%d)\n", ok+0, fail+0, ndist, last_nconn, total + printf " (reuse proof: tcp-connects-opened << total => keep-alive working; ideally 1 + one reconnect per hang)\n" + printf " per-backend (ok/fail):" + for (ip in ipseen) printf " %s=%d/%d", ip, ipok[ip]+0, ipfail[ip]+0 + printf "\n" + if (fail+0 > 0) printf " failed-indices: %s\n", failidx + else printf " (no failures)\n" + }' +} + +echo "######## raw curl reuse diagnostic ########" +echo "One curl process reuses one keep-alive connection across ${ITER} publishes." +echo "Compare against scripts/diag-guzzle.php (the PHP/Guzzle path)." + +run_probe "publish (HTTP/1.1)" --http1.1 + +echo "" +echo "######## done ########" \ No newline at end of file From 9cd2f489dadbf73d6c59048520ec8b389d1b7706 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 12:37:33 +0200 Subject: [PATCH 17/21] run diag-curl.sh --- scripts/diag-curl.sh | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/scripts/diag-curl.sh b/scripts/diag-curl.sh index 40f212d6..8109ca8e 100755 --- a/scripts/diag-curl.sh +++ b/scripts/diag-curl.sh @@ -21,6 +21,9 @@ # # Output per request line: s # - http_code 000 = no response (the hang / cURL error 28) +# - http_code 429 = Balancer rate limit (fast) — curl reports these every +# ~10th publish; the PHP/Guzzle path black-holes (000/10s) +# at the same cadence instead. See the investigation doc. # - repeated local_port = same reused socket (keep-alive working) # - a new local_port = curl had to reopen after a failure @@ -53,22 +56,32 @@ run_probe() { # --max-time 12 : per-request ceiling (>10s server timeout so the hang shows as 000) # -s -o /dev/null : discard bodies # -w ...\n : one machine-readable line per request - # Fields: http_code | time_total | local_port | remote_ip + # + # CRITICAL: prefix the -w line with a unique sentinel ("@@STAT@@") and parse ONLY + # sentinel lines. With -K and many URLs, response bodies can still reach stdout and + # merge with the metadata line (an earlier version miscounted every-10th HTTP 429s + # as ok=200, e.g. `code={"status":429...`). Parsing on the sentinel makes the field + # positions reliable regardless of body content. + # Fields: http_code | time_total | local_port | remote_ip | num_connects curl "$@" -K "$CFG" \ --max-time 12 \ -s -o /dev/null \ - -w '%{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ + -w '@@STAT@@ %{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ | awk ' + $1 != "@@STAT@@" { next } # ignore any body text that leaked onto stdout { - idx = NR - 1 - code = $1; t = $2; port = $3; ip = $4; nconn = $5 + idx = total # 0-based request index among sentinel lines + code = $2; t = $3; port = $4; ip = $5; nconn = $6 printf " #%-3d code=%s time=%ss port=%s ip=%s conns=%s\n", idx, code, t, port, ip, nconn total++ - last_nconn = nconn + 0 # cumulative; final value = total TCP connections opened + last_nconn = nconn + 0 # cumulative; final value = total TCP connections opened ipseen[ip]++ - if (code == "000" || code == "" ) { # hang / no response - fail++; failidx = failidx (failidx=="" ? "" : ",") idx + if (code == "000" || code == "") { # hang / no response (cURL 28) + hang++; hangidx = hangidx (hangidx=="" ? "" : ",") idx ipfail[ip]++ + } else if (code == "429") { # Balancer rate limit (fast) + rl++; rlidx = rlidx (rlidx=="" ? "" : ",") idx + ipok[ip]++ # server answered fast; not a hang } else { ok++; ipok[ip]++ } @@ -78,13 +91,15 @@ run_probe() { } END { ndist = 0; for (p in portseen) ndist++ - printf "\n ok=%d fail=%d distinct-sockets=%d tcp-connects-opened=%d (total=%d)\n", ok+0, fail+0, ndist, last_nconn, total + printf "\n ok=%d rate-limited(429)=%d hung(000)=%d distinct-sockets=%d tcp-connects-opened=%d (total=%d)\n", \ + ok+0, rl+0, hang+0, ndist, last_nconn, total printf " (reuse proof: tcp-connects-opened << total => keep-alive working; ideally 1 + one reconnect per hang)\n" - printf " per-backend (ok/fail):" + printf " per-backend (non-hang/hung):" for (ip in ipseen) printf " %s=%d/%d", ip, ipok[ip]+0, ipfail[ip]+0 printf "\n" - if (fail+0 > 0) printf " failed-indices: %s\n", failidx - else printf " (no failures)\n" + if (rl+0 > 0) printf " rate-limited(429) indices: %s\n", rlidx + if (hang+0 > 0) printf " hung(000) indices: %s\n", hangidx + if (rl+0 == 0 && hang+0 == 0) printf " (no 429s, no hangs)\n" }' } From 06f0bc79994eb4e1a391132aa8431b5fa3418189 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 13:01:18 +0200 Subject: [PATCH 18/21] run diag-curl.sh --- scripts/diag-curl.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/diag-curl.sh b/scripts/diag-curl.sh index 8109ca8e..a5ce242d 100755 --- a/scripts/diag-curl.sh +++ b/scripts/diag-curl.sh @@ -57,18 +57,21 @@ run_probe() { # -s -o /dev/null : discard bodies # -w ...\n : one machine-readable line per request # - # CRITICAL: prefix the -w line with a unique sentinel ("@@STAT@@") and parse ONLY + # CRITICAL: prefix the -w line with a unique sentinel ("STAT::") and parse ONLY # sentinel lines. With -K and many URLs, response bodies can still reach stdout and # merge with the metadata line (an earlier version miscounted every-10th HTTP 429s # as ok=200, e.g. `code={"status":429...`). Parsing on the sentinel makes the field # positions reliable regardless of body content. + # NOTE: the sentinel must NOT start with '@' — curl reads a -w value beginning with + # '@' as a FILENAME (`-w @file`), which fails with "error encountered when reading a + # file". So use "STAT::", not "@@STAT@@". # Fields: http_code | time_total | local_port | remote_ip | num_connects curl "$@" -K "$CFG" \ --max-time 12 \ -s -o /dev/null \ - -w '@@STAT@@ %{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ + -w 'STAT:: %{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ | awk ' - $1 != "@@STAT@@" { next } # ignore any body text that leaked onto stdout + $1 != "STAT::" { next } # ignore any body text that leaked onto stdout { idx = total # 0-based request index among sentinel lines code = $2; t = $3; port = $4; ip = $5; nconn = $6 From 025931e3faf68758e49b3d2c451728c81a8de0db Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 13:18:58 +0200 Subject: [PATCH 19/21] run diag-curl.sh --- scripts/diag-curl.sh | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/scripts/diag-curl.sh b/scripts/diag-curl.sh index a5ce242d..cfa3155f 100755 --- a/scripts/diag-curl.sh +++ b/scripts/diag-curl.sh @@ -65,16 +65,28 @@ run_probe() { # NOTE: the sentinel must NOT start with '@' — curl reads a -w value beginning with # '@' as a FILENAME (`-w @file`), which fails with "error encountered when reading a # file". So use "STAT::", not "@@STAT@@". - # Fields: http_code | time_total | local_port | remote_ip | num_connects + # + # ROBUST PARSING: -o /dev/null does NOT reliably suppress bodies on this -K + # multi-URL path (the response body still reaches stdout), AND the body often has + # no trailing newline — so the NEXT request's "STAT::" line gets glued onto the end + # of the body line. A naive `$1=="STAT::"` then matches only the very first line + # (total=1 bug). Fix: lead the format with \n (so each STAT starts fresh where + # possible) and use awk match() to find "STAT::" ANYWHERE on the line, parsing the + # whitespace-separated fields that follow it. Body text before the sentinel is + # discarded. + # Fields after STAT:: -> http_code | time_total | local_port | remote_ip | num_connects curl "$@" -K "$CFG" \ --max-time 12 \ -s -o /dev/null \ - -w 'STAT:: %{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ + -w '\nSTAT:: %{http_code} %{time_total} %{local_port} %{remote_ip} %{num_connects}\n' \ | awk ' - $1 != "STAT::" { next } # ignore any body text that leaked onto stdout { + p = index($0, "STAT:: ") + if (p == 0) next # body text with no sentinel — ignore + n = split(substr($0, p + 7), f, /[ \t]+/) + if (n < 5) next # malformed / truncated sentinel — skip + code = f[1]; t = f[2]; port = f[3]; ip = f[4]; nconn = f[5] idx = total # 0-based request index among sentinel lines - code = $2; t = $3; port = $4; ip = $5; nconn = $6 printf " #%-3d code=%s time=%ss port=%s ip=%s conns=%s\n", idx, code, t, port, ip, nconn total++ last_nconn = nconn + 0 # cumulative; final value = total TCP connections opened From aed3291e465abdcc22d19e2e6a312361fdf5b253 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 14:02:04 +0200 Subject: [PATCH 20/21] Increased requestTimeout to 15sec to get 429 --- scripts/diag-guzzle.php | 41 +++++++++++++++++++++++++++++++++++----- tests/PubNubTestCase.php | 12 ++++++++++++ 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/scripts/diag-guzzle.php b/scripts/diag-guzzle.php index ca04728b..178d5e4b 100644 --- a/scripts/diag-guzzle.php +++ b/scripts/diag-guzzle.php @@ -69,8 +69,13 @@ fwrite(STDERR, "WARNING: PUBLISH_KEY/SUBSCRIBE_KEY not set — publish probe will be skipped.\n"); } -// Mirror the SDK's timeout defaults (PNConfiguration: 10s request, 10s connect). -const REQUEST_TIMEOUT = 10; +// Request timeout RAISED to 15s (was 10s, the SDK default). The Balancer holds a +// rate-limited publish ~10s before returning a 429; at the old 10s timeout the SDK +// aborted (cURL 28 / 0 bytes) ~8ms before the 429 arrived. 15s lets the response +// land — so this run now CONFIRMS the mechanism: the every-10th request should show +// up as an HTTP 429 (or a slow ~10s success) instead of a timeout. Mirrors raw +// curl's --max-time 12 and the tests' setNonSubscribeRequestTimeout(15). +const REQUEST_TIMEOUT = 15; const CONNECT_TIMEOUT = 10; /** @@ -109,6 +114,7 @@ function probe( $httpVersions = []; // negotiated protocol distribution (from PSR-7) $ipOk = []; // backend IP => count of successful requests $ipFail = []; // backend IP => count of timed-out requests + $statusCodes = []; // HTTP status => count (429s now visible, not thrown) for ($i = 0; $i < $iterations; $i++) { $start = microtime(true); @@ -118,6 +124,11 @@ function probe( RequestOptions::TIMEOUT => REQUEST_TIMEOUT, RequestOptions::CONNECT_TIMEOUT => CONNECT_TIMEOUT, 'version' => $version, + // Do NOT throw on 4xx/5xx — we WANT to observe the Balancer's 429 + // as a normal response. With http_errors on (Guzzle default) a 429 + // throws ClientException and gets lumped into the opaque failure + // bucket; off, we can tally it by status code below. + RequestOptions::HTTP_ERRORS => false, // Detect socket reuse via local_port (num_connects is absent in // some libcurl builds). NOTE: do NOT trust the stat's // 'http_version' field — it echoes the REQUESTED version, not the @@ -145,6 +156,13 @@ function probe( $hv = $response->getProtocolVersion(); $httpVersions[$hv] = ($httpVersions[$hv] ?? 0) + 1; + // Tally status code — every 10th should now be 429 (~10s), the rest 200. + $code = $response->getStatusCode(); + $statusCodes[$code] = ($statusCodes[$code] ?? 0) + 1; + if ($code === 429) { + $failedIndices[] = $i; // record the cadence even though it didn't throw + } + $ip = $curlInfo['ip'] ?? '?'; $ipOk[$ip] = ($ipOk[$ip] ?? 0) + 1; } catch (\Throwable $e) { @@ -190,6 +208,15 @@ function probe( $httpDist[] = 'http/' . $v . ' x' . $n; } printf(" ok=%d fail=%d reused-socket=%d distinct-sockets=%d\n", $ok, $fail, $reuseCount, count($seenPorts)); + // Status-code distribution: with http_errors off and a 15s timeout, the + // every-10th rate-limited publish now shows as `429 x20` here instead of a + // timeout — the confirmation that the "hang" was a delayed 429 all along. + ksort($statusCodes); + $codeDist = []; + foreach ($statusCodes as $c => $n) { + $codeDist[] = $c . ' x' . $n; + } + printf(" status-codes: %s\n", implode(' ', $codeDist) ?: '(none — all threw)'); printf(" negotiated-protocol (PSR-7): %s\n", implode(' ', $httpDist) ?: '(n/a)'); printf(" latency p50=%dms p99=%dms max=%dms\n", (int) $p50, (int) $p99, (int) $max); // Per-backend breakdown: if the failures concentrate on one (or few) IPs @@ -203,11 +230,15 @@ function probe( printf(" %s=%d/%d", $ip, $ipOk[$ip] ?? 0, $ipFail[$ip] ?? 0); } echo "\n"; + // $failedIndices now holds both thrown failures (timeouts) AND 429s observed as + // normal responses. Label it as the every-10th cadence rather than "failures". if (!empty($failedIndices)) { - echo " failed-indices: " . implode(',', $failedIndices) . "\n"; - echo implode("\n", $failures) . "\n"; + echo " rate-limited/failed indices: " . implode(',', $failedIndices) . "\n"; + if (!empty($failures)) { + echo implode("\n", $failures) . "\n"; + } } else { - echo " (no failures)\n"; + echo " (no 429s, no failures)\n"; } } diff --git a/tests/PubNubTestCase.php b/tests/PubNubTestCase.php index 7f8f44cc..feb76ea1 100755 --- a/tests/PubNubTestCase.php +++ b/tests/PubNubTestCase.php @@ -82,27 +82,39 @@ public function setUp(): void parent::setUp(); + // The Balancer rate-limits dense publish bursts by holding the connection + // ~10s before returning a 429 (see .claude/features/cicd-publish-timeout- + // investigation.md). The SDK's default 10s non-subscribe timeout fires ~8ms + // before that 429 arrives, surfacing as `cURL error 28 / 0 bytes`. Raising + // the timeout to 15s lets the SDK actually receive the 429 instead of timing + // out — mirroring what raw curl (--max-time 12) does. + $nonSubscribeRequestTimeout = 15; + $this->config = new PNConfiguration(); $this->config->setSubscribeKey($subscribeKey); $this->config->setPublishKey($publishKey); $this->config->setUuid($uuidMock); + $this->config->setNonSubscribeRequestTimeout($nonSubscribeRequestTimeout); $this->config_enc = new PNConfiguration(); $this->config_enc->setSubscribeKey($subscribeKey); $this->config_enc->setPublishKey($publishKey); $this->config_enc->setCipherKey(static::CIPHER_KEY); $this->config_enc->setUuid($uuidMock); + $this->config_enc->setNonSubscribeRequestTimeout($nonSubscribeRequestTimeout); $this->config_pam = new PNConfiguration(); $this->config_pam->setSubscribeKey($subscribeKeyPam); $this->config_pam->setPublishKey($publishKeyPam); $this->config_pam->setSecretKey($secretKeyPam); $this->config_pam->setUuid($uuidMock); + $this->config_pam->setNonSubscribeRequestTimeout($nonSubscribeRequestTimeout); $this->config_demo = new PNConfiguration(); $this->config_demo->setSubscribeKey('demo'); $this->config_demo->setPublishKey('demo'); $this->config_demo->setUuid($uuidMock); + $this->config_demo->setNonSubscribeRequestTimeout($nonSubscribeRequestTimeout); $this->config_demo->disableImmutableCheck(); $this->pubnub = new PubNub($this->config); From 55a7733d72ca63b871e554688da9a6157e60bf13 Mon Sep 17 00:00:00 2001 From: "marcin.cebo" Date: Thu, 18 Jun 2026 18:23:25 +0200 Subject: [PATCH 21/21] Enable logging --- src/PubNub/Endpoints/Endpoint.php | 38 ++++++++++++++++++++++++++----- tests/PubNubTestCase.php | 5 +++- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/src/PubNub/Endpoints/Endpoint.php b/src/PubNub/Endpoints/Endpoint.php index f6dcbb53..0e55e706 100755 --- a/src/PubNub/Endpoints/Endpoint.php +++ b/src/PubNub/Endpoints/Endpoint.php @@ -20,6 +20,7 @@ use Psr\Http\Client\ClientExceptionInterface; use Psr\Http\Message\RequestInterface; use Psr\Http\Message\ResponseInterface; +use Psr\Http\Message\UriInterface; abstract class Endpoint { @@ -356,12 +357,13 @@ protected function sendRequest(RequestInterface $request): PNEnvelope } else { $response = $client->sendRequest($request); } -// $this->pubnub->getLogger()->debug(sprintf( -// "%s response from %s negotiated HTTP/%s", -// $this->getName(), -// $request->getUri()->getHost(), -// $response->getProtocolVersion() -// )); + $this->pubnub->getLogger()->debug(sprintf( + "%s %s response from %s negotiated HTTP/%s", + $this->getName(), + $this->redactUri($request->getUri()), + $request->getUri()->getHost(), + $response->getProtocolVersion() + )); $envelope = $this->parseResponse($response); } catch (NetworkExceptionInterface $exception) { return new PNEnvelope(null, $this->createStatus( @@ -409,6 +411,30 @@ protected function sendRequest(RequestInterface $request): PNEnvelope return $envelope; } + /** + * Build a log-safe URI string with sensitive query params masked. + * + * Masks the PAM `auth` token and request `signature` so they don't leak + * into debug logs. All other params (and the pub/sub keys in the path) + * are kept to preserve diagnostic value. + */ + protected function redactUri(UriInterface $uri): string + { + $query = $uri->getQuery(); + if ($query === '') { + return (string) $uri; + } + + parse_str($query, $params); + foreach (['auth', 'signature'] as $sensitive) { + if (isset($params[$sensitive]) && $params[$sensitive] !== '') { + $params[$sensitive] = 'REDACTED'; + } + } + + return (string) $uri->withQuery(http_build_query($params)); + } + /** * @param ResponseInterface $response * @return PNEnvelope diff --git a/tests/PubNubTestCase.php b/tests/PubNubTestCase.php index feb76ea1..fa1306ae 100755 --- a/tests/PubNubTestCase.php +++ b/tests/PubNubTestCase.php @@ -78,7 +78,10 @@ public function setUp(): void $uuidMock = getenv("UUID_MOCK") ?: "UUID_MOCK"; $logger = new Logger('PubNub'); - $logger->pushHandler(new ErrorLogHandler()); + // Log at DEBUG so the SDK's per-request debug lines (endpoint, redacted + // URL, negotiated HTTP version — see Endpoint::sendRequest) surface in + // the test output. Logger::DEBUG is valid in both Monolog 2 and 3. + $logger->pushHandler(new ErrorLogHandler(ErrorLogHandler::OPERATING_SYSTEM, Logger::DEBUG)); parent::setUp();