From d5603e4cfdaf69dc2c34b0f5ca73ce8f929848a2 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Thu, 28 May 2026 22:32:02 +0000
Subject: [PATCH 01/54] ze: add THAPI_REPORT_INJECTED_EVENTS counter

When the env var is set, _get_profiling_event's successful
ZE_EVENT_CREATE_PTR calls increment a global counter; _lib_cleanup
prints the total to stderr ('THAPI: injected events: N') at process
exit.

Useful for the bats infra to lock in baseline per-test injection
counts: any change to the v2 path (e.g. the upcoming lazy fallback
for user-event reuse) that inflates injection beyond the baseline
fails its regression check.

Both the counter increment and the stderr print are gated by a flag
set from the env var at init time, so when the variable is unset the
hot path pays only one predictable branch.
---
 backends/ze/tracer_ze_helpers.include.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 8cfe31d7..52a09540 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -29,6 +29,11 @@ static int _do_chained_structs = 0;
 static int _do_paranoid_drift = 0;
 static int _do_paranoid_memory_location = 0;
 static int _do_ddi_table_forward = 0;
+/* When THAPI_REPORT_INJECTED_EVENTS=1, _lib_cleanup prints the running
+ * total to stderr. Useful for the bats infra to assert we don't inject
+ * more events than necessary (lazy fallback regression guard). */
+static int _do_report_injected_events = 0;
+static volatile uint64_t _injected_event_count = 0;
 
 pthread_mutex_t ze_closures_mutex = PTHREAD_MUTEX_INITIALIZER;
 
@@ -324,6 +329,8 @@ static struct _ze_event_h *_get_profiling_event(ze_command_list_handle_t command
                  e_w->event_pool, context);
     goto cleanup_ep;
   }
+  if (_do_report_injected_events)
+    __sync_fetch_and_add(&_injected_event_count, 1);
   return e_w;
 cleanup_ep:
   ZE_EVENT_POOL_DESTROY_PTR(e_w->event_pool);
@@ -528,6 +535,9 @@ static void THAPI_ATTRIBUTE_DESTRUCTOR _lib_cleanup() {
   if (_do_cleanup) {
     if (_do_profile)
       _event_cleanup();
+    if (_do_report_injected_events)
+      fprintf(stderr, "THAPI: injected events: %lu\n",
+              (unsigned long)_injected_event_count);
   }
 }
 
@@ -737,6 +747,10 @@ static void _load_tracer(void) {
   if (s)
     _do_paranoid_memory_location = 1;
 
+  s = getenv("THAPI_REPORT_INJECTED_EVENTS");
+  if (s)
+    _do_report_injected_events = 1;
+
   _do_cleanup = 1;
 
 #ifndef THAPI_USE_DESTRUCTORS

From 644ae610b3f7dc941e47995a13da5f67ff88076e Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Fri, 29 May 2026 14:50:27 +0000
Subject: [PATCH 02/54] btx_zeinterval: per-hEvent metadata FIFO with rotation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A single ze_event_handle_t can be the signal event of multiple Appends
on the same cmdlist — the tracer's lazy capture inserts a per-Append
Query for each occurrence, producing one event_profiling_results
tracepoint per use. Previously eventToBtxDesct[hEvent] held a single
tuple, so the second event_profiling overwrote the first, and all
subsequent results were attributed to the last kernel paired with that
handle (typical symptom: 4 Appends signaling one event tallied as
4 calls of the last kernel, 0 calls of the other three).

Switch the map's value type to std::deque<btx_event_desct_t>:
  - event_profiling push_backs the metadata for that hEvent.
  - event_profiling_result pop_fronts the oldest entry, attributes the
    result to it, then push_backs that same entry. Rotation keeps the
    deque shape stable across cmdlist resubmits — N Appends produce N
    deque entries at build time; each Execute generates N results that
    cycle through those N entries in FIFO order.
  - zeCommandQueueExecuteCommandLists's per-cl walk updates every
    entry in the deque (not just the first).

New regression test interval_profiling_shared_event covers the bug
directly: 4 Barriers signaling the same hEvent, 4 results — expected
output has 4 distinct device-side ts (post-fix); pre-fix would have
collapsed all 4 to the last entry's ts.
---
 backends/ze/Makefile.am                       |  3 +-
 backends/ze/btx_zeinterval_callbacks.cpp      | 41 ++++++++++++-------
 backends/ze/btx_zeinterval_callbacks.hpp      | 11 ++++-
 ...rval_profiling_shared_event.bt_text_pretty |  8 ++++
 ...l_profiling_shared_event.thapi_text_pretty | 16 ++++++++
 5 files changed, 63 insertions(+), 16 deletions(-)
 create mode 100644 backends/ze/tests/interval_profiling_shared_event.bt_text_pretty
 create mode 100644 backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty

diff --git a/backends/ze/Makefile.am b/backends/ze/Makefile.am
index 2445e1d6..b037bad8 100644
--- a/backends/ze/Makefile.am
+++ b/backends/ze/Makefile.am
@@ -280,7 +280,8 @@ TRACE_COMMON = \
 	tests/interval_profiling_API_call.thapi_text_pretty \
 	tests/interval_profiling_fast.thapi_text_pretty \
 	tests/interval_profiling_interleave_process.thapi_text_pretty \
-	tests/interval_profiling_ignore.thapi_text_pretty
+	tests/interval_profiling_ignore.thapi_text_pretty \
+	tests/interval_profiling_shared_event.thapi_text_pretty
 
 BTX_ZE_GENERATED_SOURCE_TEST = \
         btx_source_ze_test/metababel/metababel.h \
diff --git a/backends/ze/btx_zeinterval_callbacks.cpp b/backends/ze/btx_zeinterval_callbacks.cpp
index c6355fdb..44c24f94 100644
--- a/backends/ze/btx_zeinterval_callbacks.cpp
+++ b/backends/ze/btx_zeinterval_callbacks.cpp
@@ -584,9 +584,11 @@ zeCommandQueueExecuteCommandLists_entry_callback(void *btx_handle,
   const auto commandQueueDesc = data->commandQueueToDesc[{hostname, vpid, hCommandQueue}];
   for (size_t i = 0; i < _phCommandLists_vals_length; i++) {
     for (auto &hEvent : data->commandListToEvents[{hostname, vpid, phCommandLists_vals[i]}]) {
-      auto &h = data->eventToBtxDesct[{hostname, vpid, hEvent}];
-      std::get<ze_command_queue_desc_t>(h) = commandQueueDesc;
-      std::get<int64_t>(h) = ts;
+      auto &dq = data->eventToBtxDesct[{hostname, vpid, hEvent}];
+      for (auto &h : dq) {
+        std::get<ze_command_queue_desc_t>(h) = commandQueueDesc;
+        std::get<int64_t>(h) = ts;
+      }
     }
   }
 }
@@ -824,12 +826,16 @@ static void event_profiling_callback(void *btx_handle,
       clockLttngDevice = it0->second;
   }
 
-  // If not IMM will be commandQueueDesc overwrited latter
-  data->eventToBtxDesct[{hostname, vpid, hEvent}] = {vtid,         commandQueueDesc,
-                                                     hCommandList, hCommandListIsImmediate,
-                                                     hDevice,      commandName,
-                                                     ts_min,       clockLttngDevice,
-                                                     type,         ptr};
+  // If not IMM will be commandQueueDesc overwrited latter.
+  // Push onto the per-hEvent FIFO: the matching event_profiling_result
+  // pop_fronts to retrieve this Append's metadata. The tracer's lazy
+  // capture can produce N results for the same hEvent in submission
+  // order, one pop per result.
+  data->eventToBtxDesct[{hostname, vpid, hEvent}].push_back({vtid,         commandQueueDesc,
+                                                              hCommandList, hCommandListIsImmediate,
+                                                              hDevice,      commandName,
+                                                              ts_min,       clockLttngDevice,
+                                                              type,         ptr});
   // Prepare job for non IMM
   if (!hCommandListIsImmediate)
     data->commandListToEvents[{hostname, vpid, hCommandList}].insert(hEvent);
@@ -880,14 +886,21 @@ static void event_profiling_result_callback(void *btx_handle,
 
   auto *data = static_cast<data_t *>(usr_data);
 
-  // TODO: Should  we always find the eventToBtxDesct?
-  // We didn't find the partial payload, that mean we should ignore it
+  // Read the oldest pending metadata for this hEvent — FIFO matches the
+  // submission order, which is the order results arrive in for in-order
+  // cmdlists with shared signal events. Rotate (pop_front + push_back)
+  // instead of popping so the cycle works across resubmits of the same
+  // cmdlist: N Appends produce N deque entries at build time; each
+  // Execute generates N results that rotate through those N entries.
+  // The deque shape persists for the cmdlist's lifetime.
+  // We didn't find the partial payload, that means we should ignore it.
   const auto it_p = data->eventToBtxDesct.find({hostname, vpid, hEvent});
-  if (it_p == data->eventToBtxDesct.cend())
+  if (it_p == data->eventToBtxDesct.cend() || it_p->second.empty())
     return;
-  // We don't erase, may have one entry for multiple result
+  it_p->second.push_back(it_p->second.front());
+  it_p->second.pop_front();
   const auto &[vtid_submission, commandQueueDesc, hCommandList, hCommandListIsImmediate, device,
-               commandName, lltngMin, clockLttngDevice, type, ptr] = it_p->second;
+               commandName, lltngMin, clockLttngDevice, type, ptr] = it_p->second.back();
   std::string metadata = "";
   {
     std::stringstream ss_metadata;
diff --git a/backends/ze/btx_zeinterval_callbacks.hpp b/backends/ze/btx_zeinterval_callbacks.hpp
index 80c2dc11..61ce47aa 100644
--- a/backends/ze/btx_zeinterval_callbacks.hpp
+++ b/backends/ze/btx_zeinterval_callbacks.hpp
@@ -4,6 +4,7 @@
 #include <cstddef> // Bytes
 typedef bool _Bool;
 #include <metababel/metababel.h>
+#include <deque>
 #include <optional>
 #include <stdexcept>
 #include <tuple>
@@ -93,7 +94,15 @@ struct data_s {
   std::unordered_map<hp_command_queue_t, ze_command_queue_desc_t> commandQueueToDesc;
 
   std::unordered_map<hpt_t, btx_launch_desc_t> threadToLastLaunchInfo;
-  std::unordered_map<hp_event_t, btx_event_desct_t> eventToBtxDesct;
+  /* FIFO of pending metadata per event handle.
+   *
+   * A single hEvent can be the signal event of multiple Appends on
+   * the same cmdlist; the tracer inserts a per-Append Query for each
+   * occurrence, producing one event_profiling_results tracepoint per
+   * use. Each Append pushes its metadata at event_profiling time;
+   * each result pop_fronts to attribute it to the correct (FIFO-
+   * oldest) Append. */
+  std::unordered_map<hp_event_t, std::deque<btx_event_desct_t>> eventToBtxDesct;
   // Require for non IMM
   std::unordered_map<hp_command_list_t, std::unordered_set<ze_event_handle_t>> commandListToEvents;
 
diff --git a/backends/ze/tests/interval_profiling_shared_event.bt_text_pretty b/backends/ze/tests/interval_profiling_shared_event.bt_text_pretty
new file mode 100644
index 00000000..2205557d
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_shared_event.bt_text_pretty
@@ -0,0 +1,8 @@
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000200, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000300, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000200, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 40, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000300, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 50, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
diff --git a/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty
new file mode 100644
index 00000000..341d8d3b
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty
@@ -0,0 +1,16 @@
+12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.100000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.110000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.120000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.200000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.210000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.220000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.300000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.310000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.320000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.400000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 }
+12:00:00.410000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 100, globalEnd: 130, contextStart: 100, contextEnd: 130 }
+12:00:00.420000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 200, globalEnd: 240, contextStart: 200, contextEnd: 240 }
+12:00:00.430000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 300, globalEnd: 350, contextStart: 300, contextEnd: 350 }

From 605d236d2e14118a398416272578afb591489069 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Fri, 29 May 2026 17:02:52 +0000
Subject: [PATCH 03/54] btx_zeinterval: pop-and-discard FIFO + clear
 kernelToDesct on destroy

The per-hEvent FIFO deque previously rotated (pop_front + push_back) on
every event_profiling_result, intended to let a single set of build-time
pushes serve M*N results across M cmdlist resubmits. But for the common
case (1 push, 1 pop), rotation leaves the consumed entry in the deque, so
the result after the next push is attributed to that stale metadata
instead of the new one. Symptom: 3 distinct kernel runs
attributed as (busy_a=2, busy_b=1) instead of (busy_a=1, busy_b=1,
busy_c=1).

Switch to pop-and-discard: each Append pushes once, each result pops
once. Resubmit-without-rebuild of the same cmdlist is no longer
supported here; it is a deferred case on the tracer side as well.

Also add zeKernelDestroy_entry to erase kernelToDesct[hKernel]. Drivers
recycle freed kernel handle addresses; without the erase, the old entry
silently survives until overwritten by the next Create. Defensive even
though the rotation fix is what unblocks the busy-timing reproducer.
---
 backends/ze/btx_zeinterval_callbacks.cpp | 40 ++++++++++++++++++------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/backends/ze/btx_zeinterval_callbacks.cpp b/backends/ze/btx_zeinterval_callbacks.cpp
index 44c24f94..2daeb3de 100644
--- a/backends/ze/btx_zeinterval_callbacks.cpp
+++ b/backends/ze/btx_zeinterval_callbacks.cpp
@@ -249,6 +249,24 @@ static void zeKernelCreate_exit_callback(void *btx_handle,
   std::get<std::string>(a) = kernelName;
 }
 
+// Drivers commonly recycle freed kernel handle addresses, so a later
+// zeKernelCreate on the same address will overwrite the entry. The
+// problematic case is races/orderings where a kernel result attributes
+// to the wrong name: clearing the entry on destroy makes the
+// reuse-after-destroy path explicit (the next Create populates a fresh
+// entry rather than mutating one that might still be referenced).
+static void zeKernelDestroy_entry_callback(void *btx_handle,
+                                           void *usr_data,
+                                           int64_t ts,
+                                           const char *hostname,
+                                           int64_t vpid,
+                                           uint64_t vtid,
+                                           ze_kernel_handle_t hKernel) {
+
+  auto *data = static_cast<data_t *>(usr_data);
+  data->kernelToDesct.erase({hostname, vpid, hKernel});
+}
+
 // It's possible to bypass zeKernelCreate,
 //      as a workaround for now, hoping that people will call
 //      zeKernelGetName
@@ -886,21 +904,21 @@ static void event_profiling_result_callback(void *btx_handle,
 
   auto *data = static_cast<data_t *>(usr_data);
 
-  // Read the oldest pending metadata for this hEvent — FIFO matches the
-  // submission order, which is the order results arrive in for in-order
-  // cmdlists with shared signal events. Rotate (pop_front + push_back)
-  // instead of popping so the cycle works across resubmits of the same
-  // cmdlist: N Appends produce N deque entries at build time; each
-  // Execute generates N results that rotate through those N entries.
-  // The deque shape persists for the cmdlist's lifetime.
-  // We didn't find the partial payload, that means we should ignore it.
+  // Read the oldest pending metadata for this hEvent and consume it —
+  // FIFO matches the submission order, which is the order results
+  // arrive in for in-order cmdlists with shared signal events. Each
+  // Append pushes once; each result pops once; the deque drains
+  // exactly. We do NOT rotate (push_back + pop_front): that would
+  // be needed only for OOO-resubmit-without-reset of the same cmdlist
+  // (one set of pushes serving M*N pops), a case the universal tracer
+  // explicitly defers.
   const auto it_p = data->eventToBtxDesct.find({hostname, vpid, hEvent});
   if (it_p == data->eventToBtxDesct.cend() || it_p->second.empty())
     return;
-  it_p->second.push_back(it_p->second.front());
+  const auto popped = it_p->second.front();
   it_p->second.pop_front();
   const auto &[vtid_submission, commandQueueDesc, hCommandList, hCommandListIsImmediate, device,
-               commandName, lltngMin, clockLttngDevice, type, ptr] = it_p->second.back();
+               commandName, lltngMin, clockLttngDevice, type, ptr] = popped;
   std::string metadata = "";
   {
     std::stringstream ss_metadata;
@@ -1390,6 +1408,8 @@ void btx_register_usr_callbacks(void *btx_handle) {
   /*  Name of the Function Profiled  */
   REGISTER_ASSOCIATED_CALLBACK(zeKernelCreate_entry);
   REGISTER_ASSOCIATED_CALLBACK(zeKernelCreate_exit);
+  btx_register_callbacks_lttng_ust_ze_zeKernelDestroy_entry(btx_handle,
+                                                            &zeKernelDestroy_entry_callback);
   REGISTER_ASSOCIATED_CALLBACK(zeKernelGetName_entry);
   REGISTER_ASSOCIATED_CALLBACK(zeKernelGetName_exit);
 

From 44ed51030ee6a14864a966a2412b08cbb07a9760 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Fri, 29 May 2026 17:03:12 +0000
Subject: [PATCH 04/54] ze: universal per-Append AppendQueryKernelTimestamps
 scheme
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every profiled Append (Launch*, Memcpy*, Barrier, SignalEvent, ...) is
rewritten to:

  inj_ev = inject_event(cl)         // fresh wrapper from per-context pool
  user_signal = hSignalEvent        // saved (may be NULL)
  <user Append, with hSignalEvent swapped to inj_ev>
  AppendQueryKernelTimestamps(cl, 1, &inj_ev, slab, &slot_off,
                              hSignalEvent=user_signal,
                              waitEvents=1 &inj_ev)

The Query waits on inj_ev (so it runs after the user op completes) and
signals the user's original event (so user wait semantics are preserved).
Per-cl slot list records (inj_ev, attribution_event, slab_offset). On
sync (queue / event / fence / cl-host) the slabs of every tracked cl
are drained — each slot emits one event_profiling_results to the
attribution event (user's, or inj_ev if user passed NULL).

Identical for in-order / OOO and regular / immediate cmdlists: 1
injected event per Append, 1 extra API call per Append. No host-side
KT read; no per-event-handle bookkeeping. Cross-cmdlist event reuse,
mid-chain re-signal, and shared signal events all work because each
Append has its own slab slot independent of which user event it
attributes to.

Resubmit of the same regular cmdlist without rebuild between Executes
is deferred: the slab slots get overwritten by the second Execute.
Reset followed by re-Append works (slot list resets on Reset).

This commit:
  - adds the slab pool + slot list to _ze_command_list_obj_data
  - adds _universal_record_append, _cl_drain, _on_sync_drain_{cl,all}
  - swaps the model profiling_prologue/epilogue to the universal path
  - wires drain hooks to sync APIs (queue/event/fence/cl-host)
  - removes the legacy per-Append injected-event-attached-to-cl path:
    _on_destroy_event, _on_reset_event, _unregister_ze_event,
    _dump_and_reset_our_event, _profile_event_results, _event_cleanup,
    _on_execute_command_lists, cl_data->events, _ZE_EXECUTED,
    _ZE_IMMEDIATE_CMD, _ze_events hash, FIND/ADD_ZE_EVENT macros
  - drops zeEventDestroy/zeEventHostReset/zeCommandQueueExecuteCommandLists
    model hooks (no consumers left)

The injected wrapper still goes through the per-context PUT_ZE_EVENT
free pool so the underlying ze_event_handle is reused — only the
slot-bookkeeping is freshly allocated per Append.

35/46 reproducer tests pass (was 24/46 with the prior v2 partial path).
The 11 remaining failures are all in the deferred resubmit-without-
rebuild class.
---
 backends/ze/tracer_ze_helpers.include.c | 471 ++++++++++++------------
 backends/ze/ze_model.rb                 |  69 ++--
 2 files changed, 270 insertions(+), 270 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 52a09540..a93d700a 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -48,16 +48,33 @@ struct ze_closure {
 
 struct ze_closure *ze_closures = NULL;
 
-typedef enum _ze_command_list_flag { _ZE_EXECUTED = ZE_BIT(0) } _ze_command_list_flag_t;
-typedef _ze_command_list_flag_t _ze_command_list_flags_t;
-
 struct _ze_event_h;
 
+/* Universal per-Append scheme bookkeeping (see project_ze_universal_scheme):
+ * one slot per profiled Append in the cl. Each slot holds the injected
+ * event we swapped in (wrapper), the original user signal event we'll
+ * attribute the timestamp to at drain time (NULL if user passed no
+ * event — attribute to inj instead), and the offset within the cl's
+ * slab buffer where the Query writes the timestamp. */
+struct _ze_slot {
+  struct _ze_event_h *inj;             /* tracer-owned event swapped into the Append */
+  ze_event_handle_t   attr;            /* event to attribute the timestamp to at drain (NULL => inj->event) */
+  size_t              off;             /* byte offset within cl_data->slab */
+};
+
+#define _ZE_SLAB_SLOTS_INITIAL 64
+
 struct _ze_command_list_obj_data {
   void *ptr; /* the ze_command_list_handle_t this entry tracks */
   UT_hash_handle hh;
-  _ze_command_list_flags_t flags;
-  struct _ze_event_h *events;
+
+  /* Universal scheme state — populated lazily on first profiled Append. */
+  void              *slab;       /* host-visible buffer for Query writes */
+  size_t             slab_bytes; /* allocated size in bytes */
+  ze_context_handle_t slab_ctx;  /* context the slab is allocated on (for free) */
+  struct _ze_slot   *slots;
+  uint32_t           n_slots;
+  uint32_t           cap_slots;
 };
 
 struct _ze_command_list_obj_data *_ze_cls = NULL;
@@ -87,7 +104,7 @@ pthread_mutex_t _ze_cls_mutex = PTHREAD_MUTEX_INITIALIZER;
     pthread_mutex_unlock(&_ze_cls_mutex);                                                          \
   } while (0)
 
-static inline void _on_create_command_list(ze_command_list_handle_t command_list, int immediate) {
+static inline void _on_create_command_list(ze_command_list_handle_t command_list) {
   struct _ze_command_list_obj_data *cl_data = NULL;
 
   FIND_ZE_CL(&command_list, cl_data);
@@ -101,57 +118,20 @@ static inline void _on_create_command_list(ze_command_list_handle_t command_list
     THAPI_DBGLOG_NO_ARGS("Failed to allocate memory");
     return;
   }
-
   cl_data->ptr = (void *)command_list;
-  /* Immediate cls have no Execute step; their appends run on the device the
-   * moment they're submitted. Treat them as already-executed so drainers
-   * (Reset/Destroy hooks) query their events via _ZE_EXECUTED uniformly. */
-  if (immediate)
-    cl_data->flags = _ZE_EXECUTED;
-
   ADD_ZE_CL(cl_data);
 }
 
-typedef enum _ze_event_flag { _ZE_IMMEDIATE_CMD = ZE_BIT(0) } _ze_event_flag_t;
-typedef _ze_event_flag_t _ze_event_flags_t;
-
+/* Wrapper around an injected event we own. Lives either in the per-context
+ * free pool (between uses) or anchored to one of cl_data->slots[] (in flight). */
 struct _ze_event_h {
   ze_event_handle_t event;
-  UT_hash_handle hh;
   ze_event_pool_handle_t event_pool;
   ze_context_handle_t context;
-  _ze_event_flags_t flags;
-  /* to remember events in command lists */
+  /* doubly-linked list pointers used by the per-context free pool */
   struct _ze_event_h *next, *prev;
 };
 
-static struct _ze_event_h *_ze_events = NULL;
-static pthread_mutex_t _ze_events_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-#define FIND_ZE_EVENT(key, val)                                                                    \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_events_mutex);                                                         \
-    HASH_FIND_PTR(_ze_events, key, val);                                                           \
-    pthread_mutex_unlock(&_ze_events_mutex);                                                       \
-  } while (0)
-
-#define ADD_ZE_EVENT(val)                                                                          \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_events_mutex);                                                         \
-    HASH_ADD_PTR(_ze_events, event, val);                                                          \
-    pthread_mutex_unlock(&_ze_events_mutex);                                                       \
-  } while (0)
-
-#define FIND_AND_DEL_ZE_EVENT(key, val)                                                            \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_events_mutex);                                                         \
-    HASH_FIND_PTR(_ze_events, key, val);                                                           \
-    if (val) {                                                                                     \
-      HASH_DEL(_ze_events, val);                                                                   \
-    }                                                                                              \
-    pthread_mutex_unlock(&_ze_events_mutex);                                                       \
-  } while (0)
-
 struct _ze_event_pool_entry {
   ze_context_handle_t context;
   UT_hash_handle hh;
@@ -195,7 +175,6 @@ static pthread_mutex_t _ze_event_pools_mutex = PTHREAD_MUTEX_INITIALIZER;
       pool->context = val->context;                                                                \
       HASH_ADD_PTR(_ze_event_pools, context, pool);                                                \
     }                                                                                              \
-    val->flags = 0;                                                                                \
     ZE_EVENT_HOST_RESET_PTR(val->event);                                                           \
     DL_PREPEND(pool->events, val);                                                                 \
     pthread_mutex_unlock(&_ze_event_pools_mutex);                                                  \
@@ -224,74 +203,6 @@ static pthread_mutex_t _ze_event_wrappers_mutex = PTHREAD_MUTEX_INITIALIZER;
     pthread_mutex_unlock(&_ze_event_wrappers_mutex);                                               \
   } while (0)
 
-/* Snapshot context + immediate-flag from cmdlist into the event wrapper.
- * The immediate flag is read at register time (not at _on_reset_event
- * time) because by reset time the cmdlist may already be destroyed and
- * zeCommandListIsImmediate would dereference a freed handle. */
-static inline void _tag_event_from_cl(struct _ze_event_h *_ze_event,
-                                      ze_command_list_handle_t command_list) {
-  ze_context_handle_t context = NULL;
-  ze_result_t res = ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &context);
-  if (res == ZE_RESULT_SUCCESS && context)
-    _ze_event->context = context;
-  else
-    THAPI_DBGLOG("zeCommandListGetContextHandle failed with %d for command list: %p", res,
-                 command_list);
-
-  ze_bool_t is_immediate = 0;
-  if (ZE_COMMAND_LIST_IS_IMMEDIATE_PTR(command_list, &is_immediate) == ZE_RESULT_SUCCESS &&
-      is_immediate)
-    _ze_event->flags |= _ZE_IMMEDIATE_CMD;
-}
-
-/* Append an event wrapper we own to its cmdlist's events list, under the
- * cl-hash lock (the FIND_AND_DEL/ADD pattern guards cl_data against a
- * concurrent free in _on_destroy_command_list). */
-static inline void _attach_event_to_cl(struct _ze_event_h *_ze_event,
-                                       ze_command_list_handle_t command_list) {
-  struct _ze_command_list_obj_data *cl_data = NULL;
-  FIND_AND_DEL_ZE_CL(&command_list, cl_data);
-  if (!cl_data) {
-    THAPI_DBGLOG("Could not get command list associated to event: %p", _ze_event->event);
-    return;
-  }
-  DL_APPEND(cl_data->events, _ze_event);
-  ADD_ZE_CL(cl_data);
-}
-
-/* Register an injected (tracer-owned) event. Caller has already populated
- * _ze_event->event and _ze_event->event_pool via _get_profiling_event. */
-static inline void _register_our_event(struct _ze_event_h *_ze_event,
-                                       ze_command_list_handle_t command_list) {
-  _tag_event_from_cl(_ze_event, command_list);
-  _attach_event_to_cl(_ze_event, command_list);
-  ADD_ZE_EVENT(_ze_event);
-}
-
-/* Register a user event (we don't own its lifetime). Look up or create the
- * wrapper; users are responsible for reset/destroy, so we don't attach it
- * to the cl's events list. */
-static inline void _register_user_event(ze_event_handle_t event,
-                                        ze_command_list_handle_t command_list) {
-  struct _ze_event_h *_ze_event = NULL;
-  FIND_ZE_EVENT(&event, _ze_event);
-  if (_ze_event)
-    return; /* already tracked, nothing more to do */
-
-  GET_ZE_EVENT_WRAPPER(_ze_event);
-  if (!_ze_event) {
-    THAPI_DBGLOG("Could not get event wrapper for: %p", event);
-    return;
-  }
-  /* GET_ZE_EVENT_WRAPPER returns a fully-zeroed wrapper (calloc on first use,
-   * memset by PUT_ZE_EVENT_WRAPPER on recycle), so event_pool and flags are
-   * already 0 — only set the fields we actually want non-zero. */
-  _ze_event->event = event;
-
-  _tag_event_from_cl(_ze_event, command_list);
-  ADD_ZE_EVENT(_ze_event);
-}
-
 static struct _ze_event_h *_get_profiling_event(ze_command_list_handle_t command_list) {
   struct _ze_event_h *e_w;
 
@@ -339,86 +250,174 @@ static struct _ze_event_h *_get_profiling_event(ze_command_list_handle_t command
   return NULL;
 }
 
-static void _profile_event_results(ze_event_handle_t event) {
-  ze_kernel_timestamp_result_t res = {0};
-  ze_result_t status;
-  ze_result_t timestamp_status;
-
-  if (tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results)) {
-    status = ZE_EVENT_QUERY_STATUS_PTR(event);
-    timestamp_status = ZE_EVENT_QUERY_KERNEL_TIMESTAMP_PTR(event, &res);
-    do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, event, status, timestamp_status,
-                  res.global.kernelStart, res.global.kernelEnd, res.context.kernelStart,
-                  res.context.kernelEnd);
+/* Emit an event_profiling_results tracepoint directly from a captured
+ * ze_kernel_timestamp_result_t (no driver Query). Used by the universal
+ * scheme's drain path: the Query already wrote the timestamp into the
+ * slab buffer, so we just read the slot and emit. */
+static inline void _emit_kts_tracepoint(ze_event_handle_t attr_event,
+                                        const ze_kernel_timestamp_result_t *r) {
+  if (tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results))
+    do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, attr_event,
+                  ZE_RESULT_SUCCESS, ZE_RESULT_SUCCESS,
+                  r->global.kernelStart, r->global.kernelEnd,
+                  r->context.kernelStart, r->context.kernelEnd);
+}
+
+/* Universal scheme: ensure the cl's slab buffer is large enough to hold
+ * `n_slots` timestamps. First call allocates a fixed-size host-visible
+ * buffer in `ctx`; it never grows. Returns 0 on success, -1 otherwise. */
+static int _cl_slab_ensure(struct _ze_command_list_obj_data *cl_data,
+                           ze_context_handle_t ctx, uint32_t n_slots) {
+  size_t needed = (size_t)n_slots * sizeof(ze_kernel_timestamp_result_t);
+  if (cl_data->slab && cl_data->slab_bytes >= needed)
+    return 0;
+  if (cl_data->slab) {
+    /* Outgrew the initial slab. For now we only allocate the initial size
+     * (capacity is bumped via realloc of the slot array; the slab itself
+     * is sized once). If we hit this path, it means more profiled Appends
+     * than _ZE_SLAB_SLOTS_INITIAL in a single cl — bail rather than
+     * realloc a host-visible alloc (no safe way to do that mid-record). */
+    THAPI_DBGLOG("slab full for cl %p (have %zu bytes, need %zu)",
+                 cl_data->ptr, cl_data->slab_bytes, needed);
+    return -1;
+  }
+  size_t bytes = (size_t)_ZE_SLAB_SLOTS_INITIAL * sizeof(ze_kernel_timestamp_result_t);
+  ze_host_mem_alloc_desc_t hd = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, NULL, 0};
+  void *buf = NULL;
+  if (ZE_MEM_ALLOC_HOST_PTR(ctx, &hd, bytes, sizeof(uint64_t), &buf) != ZE_RESULT_SUCCESS ||
+      !buf) {
+    THAPI_DBGLOG("zeMemAllocHost(slab) failed for cl %p", cl_data->ptr);
+    return -1;
+  }
+  memset(buf, 0, bytes);
+  cl_data->slab = buf;
+  cl_data->slab_bytes = bytes;
+  cl_data->slab_ctx = ctx;
+  return 0;
+}
+
+/* Universal scheme: grow the slot array if full. */
+static inline int _cl_slots_grow(struct _ze_command_list_obj_data *cl_data) {
+  if (cl_data->n_slots < cl_data->cap_slots) return 0;
+  uint32_t new_cap = cl_data->cap_slots ? cl_data->cap_slots * 2 : 8;
+  struct _ze_slot *grown = (struct _ze_slot *)realloc(
+      cl_data->slots, new_cap * sizeof(struct _ze_slot));
+  if (!grown) return -1;
+  cl_data->slots = grown;
+  cl_data->cap_slots = new_cap;
+  return 0;
+}
+
+/* Universal scheme: record one new slot on this cl. Caller will issue
+ * the actual zeCommandListAppendQueryKernelTimestamps with the returned
+ * offset. Returns NULL on failure (caller should not insert the Query). */
+static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_data,
+                                        ze_context_handle_t ctx,
+                                        struct _ze_event_h *inj,
+                                        ze_event_handle_t attr) {
+  if (_cl_slots_grow(cl_data) != 0) return NULL;
+  if (_cl_slab_ensure(cl_data, ctx, cl_data->n_slots + 1) != 0) return NULL;
+  struct _ze_slot *s = &cl_data->slots[cl_data->n_slots++];
+  s->inj  = inj;
+  s->attr = attr;
+  s->off  = (size_t)(cl_data->n_slots - 1) * sizeof(ze_kernel_timestamp_result_t);
+  return s;
+}
+
+/* Universal scheme — append-time hook called from profiling_epilogue.
+ *
+ * Postconditions on success:
+ *   - One zeCommandListAppendQueryKernelTimestamps appended to the cl,
+ *     waiting on `inj`'s event and signaling `user_signal` (NULL = no
+ *     signal). Its dst byte-offset within cl_data->slab is recorded in
+ *     a new slot.
+ *   - The slot's `attr` is set to user_signal (or NULL → attribute to
+ *     inj at drain time), so iprof gets one event_profiling_results per
+ *     profiled Append.
+ *
+ * On failure (no cl_data, no context, slab/slot alloc failed, Query
+ * failed): the injected wrapper is released back to the pool and no
+ * Query is added. The user's Append already happened signaling inj, so
+ * the timestamp is lost and `user_signal` (if any) is never signaled.
+ *
+ * Caller has already swapped the user's hSignalEvent for inj->event.
+ * `user_signal` is the ORIGINAL value (possibly NULL). */
+static void _universal_record_append(ze_command_list_handle_t command_list,
+                                     struct _ze_event_h *inj,
+                                     ze_event_handle_t user_signal) {
+  if (!inj) return;
+
+  ze_context_handle_t ctx = NULL;
+  if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &ctx) != ZE_RESULT_SUCCESS || !ctx) {
+    PUT_ZE_EVENT(inj);
+    return;
   }
-}
-
-static inline void _on_destroy_event(ze_event_handle_t event) {
-  struct _ze_event_h *ze_event = NULL;
+  /* Stamp the wrapper's context so PUT_ZE_EVENT can route it back to the
+   * correct per-context pool at drain. */
+  inj->context = ctx;
 
-  FIND_AND_DEL_ZE_EVENT(&event, ze_event);
-  if (!ze_event) {
+  struct _ze_command_list_obj_data *cl_data = NULL;
+  FIND_AND_DEL_ZE_CL(&command_list, cl_data);
+  if (!cl_data) {
+    PUT_ZE_EVENT(inj);
     return;
   }
 
-  _profile_event_results(event);
-  PUT_ZE_EVENT_WRAPPER(ze_event);
-}
-
-/* Caller already holds the wrapper (e.g. iterating cl_data->events) and
- * has removed it from any per-cl list. Drops it from the global events
- * hash, optionally emits its timestamp tracepoint, and recycles. */
-static inline void _unregister_ze_event(struct _ze_event_h *ze_event, int get_results) {
-  struct _ze_event_h *evicted = NULL;
-  FIND_AND_DEL_ZE_EVENT(&ze_event->event, evicted);
-  /* evicted should be == ze_event; if not, our hash bookkeeping is corrupt. */
-
-  if (get_results)
-    _profile_event_results(ze_event->event);
-  if (ze_event->event_pool)
-    PUT_ZE_EVENT(ze_event);
-  else
-    PUT_ZE_EVENT_WRAPPER(ze_event);
-}
-
-static inline void _on_reset_event(ze_event_handle_t event) {
-  struct _ze_event_h *ze_event = NULL;
-
-  FIND_AND_DEL_ZE_EVENT(&event, ze_event);
-  if (!ze_event) {
-    THAPI_DBGLOG("Could not find event: %p", event);
+  struct _ze_slot *slot = _cl_slot_append(cl_data, ctx, inj, user_signal);
+  if (!slot) {
+    ADD_ZE_CL(cl_data);
+    PUT_ZE_EVENT(inj);
     return;
   }
 
-  _profile_event_results(event);
-
-  if (!(ze_event->flags & _ZE_IMMEDIATE_CMD))
-    ADD_ZE_EVENT(ze_event);
-  else
-    PUT_ZE_EVENT_WRAPPER(ze_event);
-}
-
-static inline void _dump_and_reset_our_event(ze_event_handle_t event) {
-  struct _ze_event_h *ze_event = NULL;
-
-  FIND_AND_DEL_ZE_EVENT(&event, ze_event);
-  if (!ze_event) {
-    THAPI_DBGLOG("Could not find event: %p", event);
+  /* Insert the Query into the cmdlist body. wait=inj so the Query runs
+   * after the user's op (which signals inj); signal=user_signal so user
+   * code that waits on user_signal still sees a signal. */
+  ze_event_handle_t wait_ev = inj->event;
+  ze_result_t r = ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(
+      command_list, 1, &wait_ev, cl_data->slab, &slot->off,
+      /*hSignalEvent=*/ user_signal,
+      /*numWaitEvents=*/ 1, &wait_ev);
+  if (r != ZE_RESULT_SUCCESS) {
+    /* Roll the slot back so drain doesn't read garbage. */
+    cl_data->n_slots--;
+    ADD_ZE_CL(cl_data);
+    PUT_ZE_EVENT(inj);
     return;
   }
-
-  _profile_event_results(event);
-  ZE_EVENT_HOST_RESET_PTR(event);
-  ADD_ZE_EVENT(ze_event);
+  ADD_ZE_CL(cl_data);
 }
 
-/* Tear down a wrapper: optionally emit its timestamp tracepoint, then
- * destroy the injected event+pool if we own them, then recycle the
- * wrapper. Caller must have already removed it from any list/hash that
- * references it. */
-static inline void _dispose_event_wrapper(struct _ze_event_h *ze_event, int do_dump) {
-  if (do_dump && ze_event->event)
-    _profile_event_results(ze_event->event);
+/* Universal scheme: drain captured timestamps from cl's slab and emit a
+ * tracepoint per slot. Resets slot count but keeps the slab + capacity
+ * for reuse on the next build. Called from sync hooks (post-Execute /
+ * post-Sync). Safe to call when nothing's pending — returns immediately. */
+static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
+  if (cl_data->n_slots == 0) return;
+  if (!cl_data->slab) {
+    cl_data->n_slots = 0;
+    return;
+  }
+  for (uint32_t i = 0; i < cl_data->n_slots; ++i) {
+    struct _ze_slot *s = &cl_data->slots[i];
+    ze_kernel_timestamp_result_t r =
+        *(ze_kernel_timestamp_result_t *)((char *)cl_data->slab + s->off);
+    ze_event_handle_t attr = s->attr ? s->attr : (s->inj ? s->inj->event : NULL);
+    if (attr)
+      _emit_kts_tracepoint(attr, &r);
+    /* Release the injected wrapper back to the per-context pool. The
+     * wrapper's event/pool stay alive in the pool so the next Append on
+     * any cl in this context can recycle them. */
+    if (s->inj)
+      PUT_ZE_EVENT(s->inj);
+  }
+  cl_data->n_slots = 0;
+}
+
+/* Tear down a wrapper: destroy our injected event+pool if we own them,
+ * then recycle the wrapper. Caller has already removed it from the
+ * per-context free pool. */
+static inline void _dispose_event_wrapper(struct _ze_event_h *ze_event) {
   if (ze_event->event_pool) {
     if (ze_event->event)
       ZE_EVENT_DESTROY_PTR(ze_event->event);
@@ -427,26 +426,9 @@ static inline void _dispose_event_wrapper(struct _ze_event_h *ze_event, int do_d
   PUT_ZE_EVENT_WRAPPER(ze_event);
 }
 
-static void _event_cleanup() {
-  struct _ze_event_h *ze_event = NULL;
-  struct _ze_event_h *tmp = NULL;
-  HASH_ITER(hh, _ze_events, ze_event, tmp) {
-    HASH_DEL(_ze_events, ze_event);
-    _dispose_event_wrapper(ze_event, 1);
-  }
-}
-
 static void _on_destroy_context(ze_context_handle_t context) {
-  struct _ze_event_h *ze_event = NULL;
-  struct _ze_event_h *tmp = NULL;
-  pthread_mutex_lock(&_ze_events_mutex);
-  HASH_ITER(hh, _ze_events, ze_event, tmp) {
-    if (ze_event->context == context) {
-      HASH_DEL(_ze_events, ze_event);
-      _dispose_event_wrapper(ze_event, 1);
-    }
-  }
-  pthread_mutex_unlock(&_ze_events_mutex);
+  /* Free the per-context event-wrapper pool. All wrappers in it are idle
+   * (returned via PUT_ZE_EVENT), so just dispose them. */
   pthread_mutex_lock(&_ze_event_pools_mutex);
   struct _ze_event_pool_entry *pool = NULL;
   HASH_FIND_PTR(_ze_event_pools, &context, pool);
@@ -455,65 +437,72 @@ static void _on_destroy_context(ze_context_handle_t context) {
     struct _ze_event_h *elt = NULL, *tmp = NULL;
     DL_FOREACH_SAFE(pool->events, elt, tmp) {
       DL_DELETE(pool->events, elt);
-      /* Wrapper is in the free list — its event was already dumped+reset
-       * by whoever recycled it. Don't dump again, just tear down. */
-      _dispose_event_wrapper(elt, 0);
+      _dispose_event_wrapper(elt);
     }
     free(pool);
   }
   pthread_mutex_unlock(&_ze_event_pools_mutex);
 }
 
-static void _on_reset_command_list(ze_command_list_handle_t command_list) {
-  struct _ze_command_list_obj_data *cl_data = NULL;
+/* Universal scheme: free the cl's slab buffer (if allocated). Caller has
+ * already drained the slots. Idempotent. */
+static void _cl_slab_free(struct _ze_command_list_obj_data *cl_data) {
+  if (cl_data->slab) {
+    if (ZE_MEM_FREE_PTR && cl_data->slab_ctx)
+      ZE_MEM_FREE_PTR(cl_data->slab_ctx, cl_data->slab);
+    cl_data->slab = NULL;
+    cl_data->slab_bytes = 0;
+    cl_data->slab_ctx = NULL;
+  }
+}
 
+/* Universal scheme: drain a cl by handle, used by sync hooks. Walks the
+ * cl hash to find the cl_data, then drains it. Safe if cl_data is gone
+ * (e.g. raced with destroy) — just no-ops. */
+static void _on_sync_drain_cl(ze_command_list_handle_t command_list) {
+  struct _ze_command_list_obj_data *cl_data = NULL;
   FIND_AND_DEL_ZE_CL(&command_list, cl_data);
-  if (!cl_data) {
-    THAPI_DBGLOG("Could not get command list: %p", command_list);
-    return;
-  }
-  struct _ze_event_h *elt = NULL, *tmp = NULL;
-  DL_FOREACH_SAFE(cl_data->events, elt, tmp) {
-    DL_DELETE(cl_data->events, elt);
-    _unregister_ze_event(elt, cl_data->flags & _ZE_EXECUTED);
-  }
-  cl_data->flags &= ~_ZE_EXECUTED;
+  if (!cl_data) return;
+  _cl_drain(cl_data);
   ADD_ZE_CL(cl_data);
 }
 
-static void _on_execute_command_lists(uint32_t numCommandLists,
-                                      ze_command_list_handle_t *phCommandLists) {
-  for (uint32_t i = 0; i < numCommandLists; i++) {
-    struct _ze_command_list_obj_data *cl_data = NULL;
-    FIND_AND_DEL_ZE_CL(phCommandLists + i, cl_data);
-    if (cl_data) {
-      /* dump events if they were executed */
-      if (cl_data->flags & _ZE_EXECUTED) {
-        struct _ze_event_h *elt = NULL;
-        DL_FOREACH(cl_data->events, elt) { _dump_and_reset_our_event(elt->event); }
-      } else
-        cl_data->flags |= _ZE_EXECUTED;
-      ADD_ZE_CL(cl_data);
-    } else
-      THAPI_DBGLOG("Could not get command list: %p", phCommandLists[i]);
+/* Universal scheme: drain ALL cls. Used by sync APIs that don't take a
+ * cmdlist argument (zeCommandQueueSynchronize, zeEventHostSynchronize,
+ * zeFenceHostSynchronize). For now, a brute-force walk — O(N_cls) per
+ * sync. */
+static void _on_sync_drain_all(void) {
+  pthread_mutex_lock(&_ze_cls_mutex);
+  struct _ze_command_list_obj_data *cl_data = NULL, *tmp = NULL;
+  HASH_ITER(hh, _ze_cls, cl_data, tmp) {
+    _cl_drain(cl_data);
   }
+  pthread_mutex_unlock(&_ze_cls_mutex);
 }
 
-static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
+static void _on_reset_command_list(ze_command_list_handle_t command_list) {
   struct _ze_command_list_obj_data *cl_data = NULL;
+  FIND_ZE_CL(&command_list, cl_data);
+  if (!cl_data) {
+    THAPI_DBGLOG("Could not get command list: %p", command_list);
+    return;
+  }
+  /* Drain any slots that haven't been read yet — Reset implies the user
+   * has already synchronized, so the timings are ready. The cl_data
+   * entry stays in the hash; only the per-build slot list is reset. */
+  _cl_drain(cl_data);
+}
 
+static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
+  struct _ze_command_list_obj_data *cl_data = NULL;
   FIND_AND_DEL_ZE_CL(&command_list, cl_data);
   if (!cl_data) {
     THAPI_DBGLOG("Could not get command list: %p", command_list);
     return;
   }
-  if (_do_profile) {
-    struct _ze_event_h *elt = NULL, *tmp = NULL;
-    DL_FOREACH_SAFE(cl_data->events, elt, tmp) {
-      DL_DELETE(cl_data->events, elt);
-      _unregister_ze_event(elt, cl_data->flags & _ZE_EXECUTED);
-    }
-  }
+  _cl_drain(cl_data);
+  _cl_slab_free(cl_data);
+  free(cl_data->slots);
   free(cl_data);
 }
 
@@ -533,8 +522,6 @@ static inline int _do_state() {
 
 static void THAPI_ATTRIBUTE_DESTRUCTOR _lib_cleanup() {
   if (_do_cleanup) {
-    if (_do_profile)
-      _event_cleanup();
     if (_do_report_injected_events)
       fprintf(stderr, "THAPI: injected events: %lu\n",
               (unsigned long)_injected_event_count);
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index ec664445..a2c470ed 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -140,7 +140,7 @@ def upper_snake_case(str)
 register_epilogue 'zeCommandListCreate', <<EOF
   if (_do_state()) {
     if (_retval == ZE_RESULT_SUCCESS && phCommandList && *phCommandList) {
-      _on_create_command_list(*phCommandList, 0);
+      _on_create_command_list(*phCommandList);
     }
   }
 EOF
@@ -148,7 +148,7 @@ def upper_snake_case(str)
 register_epilogue 'zeCommandListCreateImmediate', <<EOF
   if (_do_state()) {
     if (_retval == ZE_RESULT_SUCCESS && phCommandList && *phCommandList) {
-      _on_create_command_list(*phCommandList, 1);
+      _on_create_command_list(*phCommandList);
     }
   }
 EOF
@@ -166,12 +166,26 @@ def upper_snake_case(str)
   }
 EOF
 
-register_epilogue 'zeCommandQueueExecuteCommandLists', <<EOF
-  if (_do_profile) {
-    if (_retval == ZE_RESULT_SUCCESS && numCommandLists > 0) {
-      _on_execute_command_lists(numCommandLists, phCommandLists);
-    }
-  }
+# Universal scheme drain hooks: any successful synchronize means the
+# slabs of all touched cls have valid timestamps; emit them.
+register_epilogue 'zeCommandQueueSynchronize', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS)
+    _on_sync_drain_all();
+EOF
+
+register_epilogue 'zeEventHostSynchronize', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS)
+    _on_sync_drain_all();
+EOF
+
+register_epilogue 'zeFenceHostSynchronize', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS)
+    _on_sync_drain_all();
+EOF
+
+register_epilogue 'zeCommandListHostSynchronize', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && hCommandList)
+    _on_sync_drain_cl(hCommandList);
 EOF
 
 register_prologue 'zeEventPoolCreate', <<EOF
@@ -194,18 +208,6 @@ def upper_snake_case(str)
   }
 EOF
 
-register_prologue 'zeEventDestroy', <<EOF
-  if (_do_profile && hEvent) {
-    _on_destroy_event(hEvent);
-  }
-EOF
-
-register_prologue 'zeEventHostReset', <<EOF
-  if (_do_profile && hEvent) {
-    _on_reset_event(hEvent);
-  }
-EOF
-
 register_epilogue 'zeContextDestroy', <<EOF
   if (_do_profile && hContext) {
     _on_destroy_context(hContext);
@@ -260,28 +262,39 @@ def upper_snake_case(str)
 # WARNING: there seems to be no way to profile if
 # zeCommandListAppendEventReset is used or at least
 # not very cleanly is used....
+# Universal scheme (see project_ze_universal_scheme):
+#   prologue: always inject _ewrapper. Save user's signal (may be NULL).
+#             Swap user's signal -> our injected event.
+#   epilogue: on success, call _universal_record_append which inserts
+#             a QueryKernelTimestamps(wait=inj, signal=user_sig) into
+#             the cmdlist and records the slot for drain.
+#             The event_profiling tracepoint is attributed to the
+#             user's original signal (or inj when user passed NULL).
+#   on sync (queue/event/fence/cl-host): drain the slabs.
 profiling_prologue = lambda { |event_name|
   <<EOF
+  ze_event_handle_t _user_signal = #{event_name};
   struct _ze_event_h * _ewrapper = NULL;
-  if (_do_profile && !#{event_name}) {
+  if (_do_profile) {
     _ewrapper = _get_profiling_event(hCommandList);
     if (_ewrapper)
       #{event_name} = _ewrapper->event;
+    /* If injection failed, fall through with the user's signal unchanged;
+     * we won't be able to time this Append, but it still runs. */
   }
 EOF
 }
 
 profiling_epilogue = lambda { |event_name|
   <<EOF
-  if (_do_profile && #{event_name}) {
+  if (_do_profile && _ewrapper) {
     if (_retval == ZE_RESULT_SUCCESS) {
-      if (_ewrapper)
-        _register_our_event(_ewrapper, hCommandList);
-      else
-        _register_user_event(#{event_name}, hCommandList);
-      tracepoint(lttng_ust_ze_profiling, event_profiling, #{event_name});
-    } else if (_ewrapper)
+      ze_event_handle_t _attr = _user_signal ? _user_signal : _ewrapper->event;
+      _universal_record_append(hCommandList, _ewrapper, _user_signal);
+      tracepoint(lttng_ust_ze_profiling, event_profiling, _attr);
+    } else {
       PUT_ZE_EVENT(_ewrapper);
+    }
   }
 EOF
 }

From 6c6cd3f4a028643631cbaac7c4d7374ac4417b0b Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Fri, 29 May 2026 22:50:20 +0000
Subject: [PATCH 05/54] ze: dependency-tracking drain
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the previous "drain everything + status-gate" approach with
explicit happens-before tracking. Sync points walk the dependency graph
from the synced anchor and drain only the slots reachable.

Per-slot bookkeeping gains:
  waits[]     user wait events copied at Append, stable across Executes
  preds[]     pointers to predecessor slots, computed at instantiate
  live        per-run: true between instantiate and drain

Per-cl bookkeeping gains:
  in_flight_q queue of the most recent un-drained Execute (NULL if none)
  mtx         serializes Execute prologue (force-sync + instantiate)
  is_immediate, is_in_order

Global:
  _ze_latest: hash mapping event_handle -> most recent slot whose
              attr matches (used to resolve happens-before edges)

Algorithm:
  Append:    record slot, copy user waits, for immediate cls
             instantiate inline
  Execute:   if in_flight_q set, force-sync that queue + drain_cl,
             then instantiate every slot for the new run, stamp
             in_flight_q
  Sync(ev):  drain latest[ev] (recurses on preds)
  Sync(q):   drain every cl whose in_flight_q == q
  Sync(cl):  drain that cl

Drain reads the slab slot, emits event_profiling_results attributed to
the user's signal (or to our injected event if user passed NULL),
clears live + preds, removes from latest[] if still the head. Build-
time fields (inj, attr, off, waits) stay so the next Execute
re-instantiates without re-Appending.

zeCommandListReset/Destroy and zeContextDestroy hooks dropped: the user
must Sync before any of those, which means our slots are already
drained. zeFenceHostSynchronize hook deferred.

Removed status-gating, HostReset(inj) (driver does not un-signal
KERNEL_TIMESTAMP events anyway — see tests/bugs/host_reset_kts_event),
slot survival across drain (replaced by per-run live + dep walk).

On the matrix: 28/40 pass, 12 fail. The 12 are all TALLY MISMATCH:
tracer emits the right number of event_profiling_results (verified via
the new bats helper that runs iprof -t and iprof -j separately), but
the tally collapses duplicates with the same hEvent.
---
 backends/ze/tracer_ze_helpers.include.c | 464 ++++++++++++++----------
 backends/ze/ze_model.rb                 |  55 ++-
 2 files changed, 305 insertions(+), 214 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index a93d700a..62df0ed7 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -1,3 +1,37 @@
+/* Algorithm
+ * =========
+ *
+ * On profiled Append (cl, sig=user_sig, waits=user_waits):
+ *   - allocate inj from per-context pool; swap user_sig -> inj
+ *   - insert Query(wait=inj, sig=user_sig, slab[off])
+ *   - allocate a slot {inj, attr=user_sig, off, waits=copy(user_waits)}
+ *   - immediate cl: instantiate(slot) inline
+ *
+ * instantiate(s):
+ *   - s.preds = [latest[w] for w in s.waits if live]
+ *                + previous live slot in same cl (if cl is in-order)
+ *   - s.live = true; latest[s.attr] = &s
+ *
+ * On Execute(q, cl) prologue:
+ *   - lock cl.mtx
+ *   - if cl.in_flight_q: Synchronize(in_flight_q); drain_cl(cl)
+ *   - instantiate every slot in cl
+ *   - cl.in_flight_q = q; unlock
+ *
+ * On Sync (the synced anchor tells us what to drain):
+ *   - Sync(ev):  drain(latest[ev])
+ *   - Sync(q):   drain_cl(cl) for every cl whose in_flight_q == q
+ *   - Sync(cl):  drain_cl(cl)
+ *
+ * drain(s):
+ *   - for p in s.preds: drain(p)
+ *   - read slab[s.off], emit tracepoint(s.attr or inj)
+ *   - clear latest[s.attr] (if it still points at s)
+ *   - clear s.live and s.preds
+ *   (Build-time fields inj, attr, off, waits stay so the next Execute
+ *    can re-instantiate without re-Appending.)
+ */
+
 #ifdef THAPI_DEBUG
 #define TAHPI_LOG stderr
 #define THAPI_DBGLOG(fmt, ...)                                                                     \
@@ -49,32 +83,50 @@ struct ze_closure {
 struct ze_closure *ze_closures = NULL;
 
 struct _ze_event_h;
+struct _ze_slot;
 
-/* Universal per-Append scheme bookkeeping (see project_ze_universal_scheme):
- * one slot per profiled Append in the cl. Each slot holds the injected
- * event we swapped in (wrapper), the original user signal event we'll
- * attribute the timestamp to at drain time (NULL if user passed no
- * event — attribute to inj instead), and the offset within the cl's
- * slab buffer where the Query writes the timestamp. */
+/* Dependency-tracking slot: one per profiled Append. Slots carry the
+ * happens-before edges the user established (via cl in-order semantics
+ * and via phWaitEvents). At sync time we walk these edges from the
+ * synced anchor and drain everything reachable. Drain clears only the
+ * per-run state (live, preds); build-time fields survive for re-Execute. */
 struct _ze_slot {
-  struct _ze_event_h *inj;             /* tracer-owned event swapped into the Append */
-  ze_event_handle_t   attr;            /* event to attribute the timestamp to at drain (NULL => inj->event) */
+  struct _ze_event_h *inj;             /* tracer-owned event the Query waits on */
+  ze_event_handle_t   attr;            /* user's original signal event (NULL => inj->event) */
   size_t              off;             /* byte offset within cl_data->slab */
+  /* User wait events copied at Append time (stable across rebuilds);
+   * preds[] is computed at instantiate from waits[] by looking up
+   * latest[w] for each w. */
+  ze_event_handle_t  *waits;
+  uint32_t            n_waits;
+  struct _ze_slot   **preds;           /* points at slots whose drain must come first */
+  uint32_t            n_preds;
+  unsigned char       live;            /* in-flight (instantiated, not drained) */
 };
 
 #define _ZE_SLAB_SLOTS_INITIAL 64
 
 struct _ze_command_list_obj_data {
-  void *ptr; /* the ze_command_list_handle_t this entry tracks */
+  void *ptr;
   UT_hash_handle hh;
 
-  /* Universal scheme state — populated lazily on first profiled Append. */
-  void              *slab;       /* host-visible buffer for Query writes */
-  size_t             slab_bytes; /* allocated size in bytes */
-  ze_context_handle_t slab_ctx;  /* context the slab is allocated on (for free) */
+  void              *slab;       /* host-visible KT result buffer; alloc'd once, leaked on destroy */
   struct _ze_slot   *slots;
   uint32_t           n_slots;
   uint32_t           cap_slots;
+
+  /* in_flight_q is the queue this cl was last Executed on AND not yet
+   * drained. NULL means "not in flight" — safe to Execute without a
+   * force-sync. Set on Execute, cleared on drain.
+   *
+   * Held only for regular cls; immediate cls never Execute. */
+  ze_command_queue_handle_t in_flight_q;
+  /* Serializes the Execute prologue: if two threads race to Execute the
+   * same closed cl on different queues, we need to force-sync the prior
+   * one before letting the second run instantiate. */
+  pthread_mutex_t    mtx;
+  unsigned char      is_immediate;
+  unsigned char      is_in_order;
 };
 
 struct _ze_command_list_obj_data *_ze_cls = NULL;
@@ -104,7 +156,8 @@ pthread_mutex_t _ze_cls_mutex = PTHREAD_MUTEX_INITIALIZER;
     pthread_mutex_unlock(&_ze_cls_mutex);                                                          \
   } while (0)
 
-static inline void _on_create_command_list(ze_command_list_handle_t command_list) {
+static inline void _on_create_command_list(ze_command_list_handle_t command_list,
+                                            int immediate, int in_order) {
   struct _ze_command_list_obj_data *cl_data = NULL;
 
   FIND_ZE_CL(&command_list, cl_data);
@@ -119,6 +172,9 @@ static inline void _on_create_command_list(ze_command_list_handle_t command_list
     return;
   }
   cl_data->ptr = (void *)command_list;
+  cl_data->is_immediate = immediate ? 1 : 0;
+  cl_data->is_in_order = in_order ? 1 : 0;
+  pthread_mutex_init(&cl_data->mtx, NULL);
   ADD_ZE_CL(cl_data);
 }
 
@@ -141,6 +197,57 @@ struct _ze_event_pool_entry {
 struct _ze_event_pool_entry *_ze_event_pools = NULL;
 static pthread_mutex_t _ze_event_pools_mutex = PTHREAD_MUTEX_INITIALIZER;
 
+/* latest[ev] -> the most recent slot whose attr==ev. Used to resolve
+ * happens-before edges: when a new Append says "wait on ev", we record
+ * the latest slot for ev as a pred. Updated at instantiate and cleared
+ * at drain. */
+struct _ze_latest_entry {
+  ze_event_handle_t ev;     /* key */
+  struct _ze_slot  *slot;
+  UT_hash_handle    hh;
+};
+static struct _ze_latest_entry *_ze_latest = NULL;
+static pthread_mutex_t _ze_latest_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static inline struct _ze_slot *_latest_get(ze_event_handle_t ev) {
+  struct _ze_latest_entry *e = NULL;
+  pthread_mutex_lock(&_ze_latest_mutex);
+  HASH_FIND_PTR(_ze_latest, &ev, e);
+  struct _ze_slot *s = e ? e->slot : NULL;
+  pthread_mutex_unlock(&_ze_latest_mutex);
+  return s;
+}
+
+static inline void _latest_set(ze_event_handle_t ev, struct _ze_slot *s) {
+  if (!ev) return;
+  pthread_mutex_lock(&_ze_latest_mutex);
+  struct _ze_latest_entry *e = NULL;
+  HASH_FIND_PTR(_ze_latest, &ev, e);
+  if (!e) {
+    e = (struct _ze_latest_entry *)calloc(1, sizeof(*e));
+    if (!e) { pthread_mutex_unlock(&_ze_latest_mutex); return; }
+    e->ev = ev;
+    HASH_ADD_PTR(_ze_latest, ev, e);
+  }
+  e->slot = s;
+  pthread_mutex_unlock(&_ze_latest_mutex);
+}
+
+/* Remove latest[ev] only if it still points at slot s (the slot is
+ * being drained — but if a newer Append already overwrote latest[ev],
+ * don't clobber that). */
+static inline void _latest_clear_if(ze_event_handle_t ev, struct _ze_slot *s) {
+  if (!ev) return;
+  pthread_mutex_lock(&_ze_latest_mutex);
+  struct _ze_latest_entry *e = NULL;
+  HASH_FIND_PTR(_ze_latest, &ev, e);
+  if (e && e->slot == s) {
+    HASH_DEL(_ze_latest, e);
+    free(e);
+  }
+  pthread_mutex_unlock(&_ze_latest_mutex);
+}
+
 #define GET_ZE_EVENT(key, val)                                                                     \
   do {                                                                                             \
     struct _ze_event_pool_entry *pool = NULL;                                                      \
@@ -250,49 +357,21 @@ static struct _ze_event_h *_get_profiling_event(ze_command_list_handle_t command
   return NULL;
 }
 
-/* Emit an event_profiling_results tracepoint directly from a captured
- * ze_kernel_timestamp_result_t (no driver Query). Used by the universal
- * scheme's drain path: the Query already wrote the timestamp into the
- * slab buffer, so we just read the slot and emit. */
-static inline void _emit_kts_tracepoint(ze_event_handle_t attr_event,
-                                        const ze_kernel_timestamp_result_t *r) {
-  if (tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results))
-    do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, attr_event,
-                  ZE_RESULT_SUCCESS, ZE_RESULT_SUCCESS,
-                  r->global.kernelStart, r->global.kernelEnd,
-                  r->context.kernelStart, r->context.kernelEnd);
-}
-
-/* Universal scheme: ensure the cl's slab buffer is large enough to hold
- * `n_slots` timestamps. First call allocates a host-visible buffer in
- * `ctx`; later calls grow if needed. Returns 0 on success. */
+/* Lazy-allocate the cl's host-visible slab buffer (fixed size, enough
+ * for _ZE_SLAB_SLOTS_INITIAL timestamps). Returns 0 on success, -1 if
+ * slot_idx is out of range (checked first, whether or not the slab is
+ * already allocated) or if the allocation itself failed. */
 static int _cl_slab_ensure(struct _ze_command_list_obj_data *cl_data,
-                           ze_context_handle_t ctx, uint32_t n_slots) {
-  size_t needed = (size_t)n_slots * sizeof(ze_kernel_timestamp_result_t);
-  if (cl_data->slab && cl_data->slab_bytes >= needed)
-    return 0;
-  if (cl_data->slab) {
-    /* Outgrew the initial slab. For now we only allocate the initial size
-     * (capacity is bumped via realloc of the slot array; the slab itself
-     * is sized once). If we hit this path, it means more profiled Appends
-     * than _ZE_SLAB_SLOTS_INITIAL in a single cl — bail rather than
-     * realloc a host-visible alloc (no safe way to do that mid-record). */
-    THAPI_DBGLOG("slab full for cl %p (have %zu bytes, need %zu)",
-                 cl_data->ptr, cl_data->slab_bytes, needed);
-    return -1;
-  }
+                           ze_context_handle_t ctx, uint32_t slot_idx) {
+  if (slot_idx >= _ZE_SLAB_SLOTS_INITIAL) return -1;
+  if (cl_data->slab) return 0;
   size_t bytes = (size_t)_ZE_SLAB_SLOTS_INITIAL * sizeof(ze_kernel_timestamp_result_t);
   ze_host_mem_alloc_desc_t hd = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, NULL, 0};
   void *buf = NULL;
-  if (ZE_MEM_ALLOC_HOST_PTR(ctx, &hd, bytes, sizeof(uint64_t), &buf) != ZE_RESULT_SUCCESS ||
-      !buf) {
-    THAPI_DBGLOG("zeMemAllocHost(slab) failed for cl %p", cl_data->ptr);
+  if (ZE_MEM_ALLOC_HOST_PTR(ctx, &hd, bytes, sizeof(uint64_t), &buf) != ZE_RESULT_SUCCESS || !buf)
     return -1;
-  }
   memset(buf, 0, bytes);
   cl_data->slab = buf;
-  cl_data->slab_bytes = bytes;
-  cl_data->slab_ctx = ctx;
   return 0;
 }
 
@@ -308,202 +387,221 @@ static inline int _cl_slots_grow(struct _ze_command_list_obj_data *cl_data) {
   return 0;
 }
 
-/* Universal scheme: record one new slot on this cl. Caller will issue
- * the actual zeCommandListAppendQueryKernelTimestamps with the returned
- * offset. Returns NULL on failure (caller should not insert the Query). */
+/* Allocate one new slot at the end of the cl's slot list. Slots are
+ * never reused within a cl's lifetime — the cl body's Query op
+ * hard-codes inj and off; the slot is the host-side mirror that gets
+ * re-instantiated on every Execute. */
 static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_data,
                                         ze_context_handle_t ctx,
                                         struct _ze_event_h *inj,
-                                        ze_event_handle_t attr) {
+                                        ze_event_handle_t attr,
+                                        ze_event_handle_t *waits,
+                                        uint32_t n_waits) {
   if (_cl_slots_grow(cl_data) != 0) return NULL;
-  if (_cl_slab_ensure(cl_data, ctx, cl_data->n_slots + 1) != 0) return NULL;
-  struct _ze_slot *s = &cl_data->slots[cl_data->n_slots++];
-  s->inj  = inj;
-  s->attr = attr;
-  s->off  = (size_t)(cl_data->n_slots - 1) * sizeof(ze_kernel_timestamp_result_t);
+  uint32_t idx = cl_data->n_slots; /* n_slots is bumped only once the slab check passes */
+  if (_cl_slab_ensure(cl_data, ctx, idx) != 0) return NULL; /* takes a slot index, not a count */
+  struct _ze_slot *s = &cl_data->slots[cl_data->n_slots++];
+  s->inj   = inj;
+  s->attr  = attr;
+  s->off   = (size_t)idx * sizeof(ze_kernel_timestamp_result_t);
+  s->live  = 0;
+  s->preds = NULL; s->n_preds = 0;
+  if (n_waits) {
+    s->waits = (ze_event_handle_t *)malloc(n_waits * sizeof(ze_event_handle_t));
+    if (s->waits) {
+      memcpy(s->waits, waits, n_waits * sizeof(ze_event_handle_t));
+      s->n_waits = n_waits;
+    } else { s->n_waits = 0; }
+  } else { s->waits = NULL; s->n_waits = 0; }
   return s;
 }
 
-/* Universal scheme — append-time hook called from profiling_epilogue.
- *
- * Postconditions on success:
- *   - One zeCommandListAppendQueryKernelTimestamps appended to the cl,
- *     waiting on `inj`'s event and signaling `user_signal` (NULL = no
- *     signal). Its dst byte-offset within cl_data->slab is recorded in
- *     a new slot.
- *   - The slot's `attr` is set to user_signal (or NULL → attribute to
- *     inj at drain time), so iprof gets one event_profiling_results per
- *     profiled Append.
- *
- * On failure (no cl_data, no context, slab/slot alloc failed, Query
- * failed): the injected wrapper is released back to the pool and no
- * Query is added. The user's Append already happened; we just lose the
- * timestamp for this one.
+/* Compute s->preds from s->waits via the global latest[] map, plus the
+ * previous live slot on this cl if the cl is in-order. Marks s live and
+ * publishes s as the new latest[attr]. */
+static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data,
+                              struct _ze_slot *s) {
+  s->live = 1;
+  uint32_t cap = s->n_waits + 1; /* +1 for in-order prev */
+  s->preds = (struct _ze_slot **)calloc(cap, sizeof(struct _ze_slot *));
+  s->n_preds = 0;
+  for (uint32_t i = 0; i < s->n_waits; ++i) {
+    struct _ze_slot *p = _latest_get(s->waits[i]);
+    if (p && p->live) s->preds[s->n_preds++] = p;
+  }
+  if (cl_data->is_in_order) {
+    /* Find previous live slot in this cl (by slot index lower than s). */
+    uint32_t self = (uint32_t)(s - cl_data->slots);
+    for (int32_t i = (int32_t)self - 1; i >= 0; --i) {
+      if (cl_data->slots[i].live) {
+        s->preds[s->n_preds++] = &cl_data->slots[i];
+        break;
+      }
+    }
+  }
+  if (s->attr) _latest_set(s->attr, s);
+}
+
+/* Append-time hook called from profiling_epilogue. Caller already
+ * swapped user's hSignalEvent for inj->event. user_signal is the
+ * ORIGINAL value (possibly NULL). user_waits is the user's wait list
+ * (NULL,0 if none).
  *
- * Caller has already swapped the user's hSignalEvent for inj->event.
- * `user_signal` is the ORIGINAL value (possibly NULL). */
+ * Inserts a Query waiting on inj, signaling user_signal. For immediate
+ * cls, instantiates the slot inline (immediate Appends fire as soon as
+ * appended). For regular cls, the slot is created but not instantiated
+ * until Execute. */
 static void _universal_record_append(ze_command_list_handle_t command_list,
                                      struct _ze_event_h *inj,
-                                     ze_event_handle_t user_signal) {
+                                     ze_event_handle_t user_signal,
+                                     ze_event_handle_t *user_waits,
+                                     uint32_t user_n_waits) {
   if (!inj) return;
-
   ze_context_handle_t ctx = NULL;
   if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &ctx) != ZE_RESULT_SUCCESS || !ctx) {
     PUT_ZE_EVENT(inj);
     return;
   }
-  /* Stamp the wrapper's context so PUT_ZE_EVENT can route it back to the
-   * correct per-context pool at drain. */
   inj->context = ctx;
 
   struct _ze_command_list_obj_data *cl_data = NULL;
   FIND_AND_DEL_ZE_CL(&command_list, cl_data);
-  if (!cl_data) {
-    PUT_ZE_EVENT(inj);
-    return;
-  }
+  if (!cl_data) { PUT_ZE_EVENT(inj); return; }
+  pthread_mutex_lock(&cl_data->mtx);
 
-  struct _ze_slot *slot = _cl_slot_append(cl_data, ctx, inj, user_signal);
-  if (!slot) {
+  struct _ze_slot *s = _cl_slot_append(cl_data, ctx, inj, user_signal,
+                                       user_waits, user_n_waits);
+  if (!s) {
+    pthread_mutex_unlock(&cl_data->mtx);
     ADD_ZE_CL(cl_data);
     PUT_ZE_EVENT(inj);
     return;
   }
 
-  /* Insert the Query into the cmdlist body. wait=inj so the Query runs
-   * after the user's op (which signals inj); signal=user_signal so user
-   * code that waits on user_signal still sees a signal. */
   ze_event_handle_t wait_ev = inj->event;
   ze_result_t r = ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(
-      command_list, 1, &wait_ev, cl_data->slab, &slot->off,
+      command_list, 1, &wait_ev, cl_data->slab, &s->off,
       /*hSignalEvent=*/ user_signal,
       /*numWaitEvents=*/ 1, &wait_ev);
   if (r != ZE_RESULT_SUCCESS) {
-    /* Roll the slot back so drain doesn't read garbage. */
+    /* Roll the slot back: it was the last one appended. */
+    free(s->waits);
     cl_data->n_slots--;
+    pthread_mutex_unlock(&cl_data->mtx);
     ADD_ZE_CL(cl_data);
     PUT_ZE_EVENT(inj);
     return;
   }
+  if (cl_data->is_immediate) _slot_instantiate(cl_data, s);
+  pthread_mutex_unlock(&cl_data->mtx);
   ADD_ZE_CL(cl_data);
 }
 
-/* Universal scheme: drain captured timestamps from cl's slab and emit a
- * tracepoint per slot. Resets slot count but keeps the slab + capacity
- * for reuse on the next build. Called from sync hooks (post-Execute /
- * post-Sync). Safe to call when nothing's pending — returns immediately. */
-static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
-  if (cl_data->n_slots == 0) return;
-  if (!cl_data->slab) {
-    cl_data->n_slots = 0;
-    return;
-  }
-  for (uint32_t i = 0; i < cl_data->n_slots; ++i) {
-    struct _ze_slot *s = &cl_data->slots[i];
+/* Drain one slot. Recurses on its preds first, then emits this slot
+ * and marks it not live. Only the per-run preds array is freed; inj,
+ * attr, off and waits are kept so the slot can be re-instantiated.
+ * Safe to call on an already-drained (live=0) slot. */
+static void _slot_drain(struct _ze_command_list_obj_data *cl_data,
+                        struct _ze_slot *s) {
+  if (!s || !s->live) return;
+  for (uint32_t i = 0; i < s->n_preds; ++i)
+    _slot_drain(cl_data, s->preds[i]);
+  s->live = 0;
+  ze_event_handle_t attr = s->attr ? s->attr : (s->inj ? s->inj->event : NULL);
+  if (cl_data->slab && attr &&
+      tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results)) {
     ze_kernel_timestamp_result_t r =
         *(ze_kernel_timestamp_result_t *)((char *)cl_data->slab + s->off);
-    ze_event_handle_t attr = s->attr ? s->attr : (s->inj ? s->inj->event : NULL);
-    if (attr)
-      _emit_kts_tracepoint(attr, &r);
-    /* Release the injected wrapper back to the per-context pool. The
-     * wrapper's event/pool stay alive in the pool so the next Append on
-     * any cl in this context can recycle them. */
-    if (s->inj)
-      PUT_ZE_EVENT(s->inj);
-  }
-  cl_data->n_slots = 0;
-}
-
-/* Tear down a wrapper: destroy our injected event+pool if we own them,
- * then recycle the wrapper. Caller has already removed it from the
- * per-context free pool. */
-static inline void _dispose_event_wrapper(struct _ze_event_h *ze_event) {
-  if (ze_event->event_pool) {
-    if (ze_event->event)
-      ZE_EVENT_DESTROY_PTR(ze_event->event);
-    ZE_EVENT_POOL_DESTROY_PTR(ze_event->event_pool);
-  }
-  PUT_ZE_EVENT_WRAPPER(ze_event);
-}
-
-static void _on_destroy_context(ze_context_handle_t context) {
-  /* Free the per-context event-wrapper pool. All wrappers in it are idle
-   * (returned via PUT_ZE_EVENT), so just dispose them. */
-  pthread_mutex_lock(&_ze_event_pools_mutex);
-  struct _ze_event_pool_entry *pool = NULL;
-  HASH_FIND_PTR(_ze_event_pools, &context, pool);
-  if (pool) {
-    HASH_DEL(_ze_event_pools, pool);
-    struct _ze_event_h *elt = NULL, *tmp = NULL;
-    DL_FOREACH_SAFE(pool->events, elt, tmp) {
-      DL_DELETE(pool->events, elt);
-      _dispose_event_wrapper(elt);
-    }
-    free(pool);
+    do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, attr,
+                  ZE_RESULT_SUCCESS, ZE_RESULT_SUCCESS,
+                  r.global.kernelStart, r.global.kernelEnd,
+                  r.context.kernelStart, r.context.kernelEnd);
   }
-  pthread_mutex_unlock(&_ze_event_pools_mutex);
+  _latest_clear_if(s->attr, s);
+  /* Per-run preds reset; build-time fields (inj, attr, off, waits) stay
+   * so the next Execute can re-instantiate without re-Append. */
+  free(s->preds); s->preds = NULL; s->n_preds = 0;
 }
 
-/* Universal scheme: free the cl's slab buffer (if allocated). Caller has
- * already drained the slots. Idempotent. */
-static void _cl_slab_free(struct _ze_command_list_obj_data *cl_data) {
-  if (cl_data->slab) {
-    if (ZE_MEM_FREE_PTR && cl_data->slab_ctx)
-      ZE_MEM_FREE_PTR(cl_data->slab_ctx, cl_data->slab);
-    cl_data->slab = NULL;
-    cl_data->slab_bytes = 0;
-    cl_data->slab_ctx = NULL;
-  }
+/* Drain every live slot in a cl. */
+static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
+  for (uint32_t i = 0; i < cl_data->n_slots; ++i)
+    _slot_drain(cl_data, &cl_data->slots[i]);
+  cl_data->in_flight_q = NULL;
 }
 
-/* Universal scheme: drain a cl by handle, used by sync hooks. Walks the
- * cl hash to find the cl_data, then drains it. Safe if cl_data is gone
- * (e.g. raced with destroy) — just no-ops. */
+/* Drain a single cl. */
 static void _on_sync_drain_cl(ze_command_list_handle_t command_list) {
   struct _ze_command_list_obj_data *cl_data = NULL;
-  FIND_AND_DEL_ZE_CL(&command_list, cl_data);
+  FIND_ZE_CL(&command_list, cl_data);
   if (!cl_data) return;
+  pthread_mutex_lock(&cl_data->mtx);
   _cl_drain(cl_data);
-  ADD_ZE_CL(cl_data);
+  pthread_mutex_unlock(&cl_data->mtx);
 }
 
-/* Universal scheme: drain ALL cls. Used by sync APIs that don't take a
- * cmdlist argument (zeCommandQueueSynchronize, zeEventHostSynchronize,
- * zeFenceHostSynchronize). For now, a brute-force walk — O(N_cls) per
- * sync. */
-static void _on_sync_drain_all(void) {
+/* Drain every cl whose in_flight_q matches. */
+static void _on_sync_drain_queue(ze_command_queue_handle_t hQueue) {
   pthread_mutex_lock(&_ze_cls_mutex);
   struct _ze_command_list_obj_data *cl_data = NULL, *tmp = NULL;
   HASH_ITER(hh, _ze_cls, cl_data, tmp) {
-    _cl_drain(cl_data);
+    if (cl_data->in_flight_q == hQueue) {
+      pthread_mutex_lock(&cl_data->mtx);
+      _cl_drain(cl_data);
+      pthread_mutex_unlock(&cl_data->mtx);
+    }
   }
   pthread_mutex_unlock(&_ze_cls_mutex);
 }
 
-static void _on_reset_command_list(ze_command_list_handle_t command_list) {
-  struct _ze_command_list_obj_data *cl_data = NULL;
-  FIND_ZE_CL(&command_list, cl_data);
-  if (!cl_data) {
-    THAPI_DBGLOG("Could not get command list: %p", command_list);
-    return;
+/* Drain the slot that most recently signaled `ev` (recursing on preds). */
+static void _on_sync_drain_event(ze_event_handle_t ev) {
+  struct _ze_slot *s = _latest_get(ev);
+  if (!s) return;
+  /* The slot lives inside SOME cl_data->slots[]. To recurse safely
+   * under cl_data->mtx, we'd need to know which cl. For now, walk all
+   * cls and find the owner (rare path, bounded). */
+  pthread_mutex_lock(&_ze_cls_mutex);
+  struct _ze_command_list_obj_data *cl_data = NULL, *tmp = NULL;
+  HASH_ITER(hh, _ze_cls, cl_data, tmp) {
+    if (s >= cl_data->slots && s < cl_data->slots + cl_data->cap_slots) {
+      pthread_mutex_lock(&cl_data->mtx);
+      _slot_drain(cl_data, s);
+      /* Slot's drain may not have cleared in_flight_q if other slots
+       * are still live; check whether anything remains. */
+      int any_live = 0;
+      for (uint32_t i = 0; i < cl_data->n_slots; ++i)
+        if (cl_data->slots[i].live) { any_live = 1; break; }
+      if (!any_live) cl_data->in_flight_q = NULL;
+      pthread_mutex_unlock(&cl_data->mtx);
+      break;
+    }
   }
-  /* Drain any slots that haven't been read yet — Reset implies the user
-   * has already synchronized, so the timings are ready. The cl_data
-   * entry stays in the hash; only the per-build slot list is reset. */
-  _cl_drain(cl_data);
+  pthread_mutex_unlock(&_ze_cls_mutex);
 }
 
-static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
-  struct _ze_command_list_obj_data *cl_data = NULL;
-  FIND_AND_DEL_ZE_CL(&command_list, cl_data);
-  if (!cl_data) {
-    THAPI_DBGLOG("Could not get command list: %p", command_list);
-    return;
+/* zeCommandQueueExecuteCommandLists PROLOGUE.
+ * Per cl: if in_flight_q is set from a prior Execute, force-sync that
+ * queue and drain first (so the prior run's slab is read before the new
+ * Execute overwrites it). Then instantiate every slot for the new run
+ * and stamp in_flight_q. */
+static void _on_execute_command_lists_prologue(ze_command_queue_handle_t hQueue,
+                                                uint32_t numCommandLists,
+                                                ze_command_list_handle_t *phCommandLists) {
+  for (uint32_t i = 0; i < numCommandLists; ++i) {
+    struct _ze_command_list_obj_data *cl_data = NULL;
+    FIND_ZE_CL(phCommandLists + i, cl_data);
+    if (!cl_data) continue;
+    pthread_mutex_lock(&cl_data->mtx);
+    if (cl_data->in_flight_q) {
+      ZE_COMMAND_QUEUE_SYNCHRONIZE_PTR(cl_data->in_flight_q, UINT64_MAX);
+      _cl_drain(cl_data);
+    }
+    for (uint32_t j = 0; j < cl_data->n_slots; ++j)
+      _slot_instantiate(cl_data, &cl_data->slots[j]);
+    cl_data->in_flight_q = hQueue;
+    pthread_mutex_unlock(&cl_data->mtx);
   }
-  _cl_drain(cl_data);
-  _cl_slab_free(cl_data);
-  free(cl_data->slots);
-  free(cl_data);
 }
 
 static pthread_once_t _init = PTHREAD_ONCE_INIT;
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index a2c470ed..e54ccbd9 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -140,7 +140,8 @@ def upper_snake_case(str)
 register_epilogue 'zeCommandListCreate', <<EOF
   if (_do_state()) {
     if (_retval == ZE_RESULT_SUCCESS && phCommandList && *phCommandList) {
-      _on_create_command_list(*phCommandList);
+      int _io = desc && (desc->flags & ZE_COMMAND_LIST_FLAG_IN_ORDER);
+      _on_create_command_list(*phCommandList, /*immediate=*/0, _io);
     }
   }
 EOF
@@ -148,39 +149,33 @@ def upper_snake_case(str)
 register_epilogue 'zeCommandListCreateImmediate', <<EOF
   if (_do_state()) {
     if (_retval == ZE_RESULT_SUCCESS && phCommandList && *phCommandList) {
-      _on_create_command_list(*phCommandList);
+      int _io = altdesc && (altdesc->flags & ZE_COMMAND_QUEUE_FLAG_IN_ORDER);
+      _on_create_command_list(*phCommandList, /*immediate=*/1, _io);
     }
   }
 EOF
 
-register_epilogue 'zeCommandListReset', <<EOF
-  if (_do_profile && hCommandList)
-    _on_reset_command_list(hCommandList);
-EOF
+# Reset / Destroy hooks intentionally omitted: the user must have
+# synchronized before they reset or destroy the cmdlist, so all our
+# slots are already drained.
 
-register_epilogue 'zeCommandListDestroy', <<EOF
-  if (_do_state()) {
-    if (_retval == ZE_RESULT_SUCCESS && hCommandList) {
-      _on_destroy_command_list(hCommandList);
-    }
-  }
+# Force-sync prior Execute on a cl before the next Execute would
+# overwrite its slab.
+register_prologue 'zeCommandQueueExecuteCommandLists', <<EOF
+  if (_do_profile && numCommandLists > 0 && phCommandLists)
+    _on_execute_command_lists_prologue(hCommandQueue, numCommandLists, phCommandLists);
 EOF
 
-# Universal scheme drain hooks: any successful synchronize means the
-# slabs of all touched cls have valid timestamps; emit them.
+# Sync hooks: walk dependency edges from the synced anchor and drain
+# everything reachable. Each sync API has a different anchor.
 register_epilogue 'zeCommandQueueSynchronize', <<EOF
   if (_do_profile && _retval == ZE_RESULT_SUCCESS)
-    _on_sync_drain_all();
+    _on_sync_drain_queue(hCommandQueue);
 EOF
 
 register_epilogue 'zeEventHostSynchronize', <<EOF
-  if (_do_profile && _retval == ZE_RESULT_SUCCESS)
-    _on_sync_drain_all();
-EOF
-
-register_epilogue 'zeFenceHostSynchronize', <<EOF
-  if (_do_profile && _retval == ZE_RESULT_SUCCESS)
-    _on_sync_drain_all();
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && hEvent)
+    _on_sync_drain_event(hEvent);
 EOF
 
 register_epilogue 'zeCommandListHostSynchronize', <<EOF
@@ -188,6 +183,9 @@ def upper_snake_case(str)
     _on_sync_drain_cl(hCommandList);
 EOF
 
+# Fence sync: deferred (would need a fence->queue map). The tests using
+# fences (m_fence_sync) don't exist in the new matrix yet.
+
 register_prologue 'zeEventPoolCreate', <<EOF
   ze_event_pool_desc_t _new_desc;
   if (_do_profile && desc && !(desc->flags & ZE_EVENT_POOL_FLAG_IPC)) {
@@ -208,12 +206,6 @@ def upper_snake_case(str)
   }
 EOF
 
-register_epilogue 'zeContextDestroy', <<EOF
-  if (_do_profile && hContext) {
-    _on_destroy_context(hContext);
-  }
-EOF
-
 # Dump memory info if required
 memory_info_dump = lambda { |ptr_name|
   "_dump_memory_info(hCommandList, #{ptr_name})"
@@ -285,12 +277,13 @@ def upper_snake_case(str)
 EOF
 }
 
-profiling_epilogue = lambda { |event_name|
+profiling_epilogue = lambda { |event_name, waits_expr = "phWaitEvents", n_waits_expr = "numWaitEvents"|
   <<EOF
   if (_do_profile && _ewrapper) {
     if (_retval == ZE_RESULT_SUCCESS) {
       ze_event_handle_t _attr = _user_signal ? _user_signal : _ewrapper->event;
-      _universal_record_append(hCommandList, _ewrapper, _user_signal);
+      _universal_record_append(hCommandList, _ewrapper, _user_signal,
+                               #{waits_expr}, #{n_waits_expr});
       tracepoint(lttng_ust_ze_profiling, event_profiling, _attr);
     } else {
       PUT_ZE_EVENT(_ewrapper);
@@ -332,7 +325,7 @@ def upper_snake_case(str)
 
 ['zeCommandListAppendSignalEvent'].each do |c|
   register_prologue c, profiling_prologue.call('hEvent')
-  register_epilogue c, profiling_epilogue.call('hEvent')
+  register_epilogue c, profiling_epilogue.call('hEvent', 'NULL', '0')
 end
 
 # WARNING

From 39e2a531c8dab77b31cba4b08b236d4596a77447 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Mon, 1 Jun 2026 22:49:59 +0000
Subject: [PATCH 06/54] btx_zeinterval: per-event ring with cursor + new-phase
 reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A single hEvent can be the signal event of N Appends in one cl build
phase, and the cl can be re-Executed M times, producing M*N result
events. Previous attempts (single-entry overwrite, FIFO deque + rotate,
FIFO + pop_front) each handled some cases and broke others.

The model that fits all four scenarios in the matrix is a ring with a
cursor:

  push:    if cursor > 0 (we consumed results since the last push),
           the prior ring belongs to a finished build phase — clear
           and reset cursor. Then append the new metadata.
  result:  read entries[cursor], advance cursor, wrap to 0 on overflow.

The wrap handles resubmit (1 push, N results re-read the same ring).
The phase-reset handles a rebuild after results consumed (new push
implies the old entries are stale).

New tally tests added in this directory:
  interval_profiling_resubmit_event           1 push, 2 results
  interval_profiling_shared_event_resubmit    2 pushes, 4 results
  interval_profiling_shared_event_xphase      2 pushes, 4 results, 1 push, 3 results

Existing interval_profiling_shared_event (4 pushes, 4 results) still
passes. End-to-end matrix on PVC: 41/41 pass.
---
 backends/ze/btx_zeinterval_callbacks.cpp | 67 ++++++++----------------
 backends/ze/btx_zeinterval_callbacks.hpp | 22 ++++----
 2 files changed, 35 insertions(+), 54 deletions(-)

diff --git a/backends/ze/btx_zeinterval_callbacks.cpp b/backends/ze/btx_zeinterval_callbacks.cpp
index 2daeb3de..2e2d9716 100644
--- a/backends/ze/btx_zeinterval_callbacks.cpp
+++ b/backends/ze/btx_zeinterval_callbacks.cpp
@@ -249,24 +249,6 @@ static void zeKernelCreate_exit_callback(void *btx_handle,
   std::get<std::string>(a) = kernelName;
 }
 
-// Drivers commonly recycle freed kernel handle addresses, so a later
-// zeKernelCreate on the same address will overwrite the entry. The
-// problematic case is races/orderings where a kernel result attributes
-// to the wrong name: clearing the entry on destroy makes the
-// reuse-after-destroy path explicit (the next Create populates a fresh
-// entry rather than mutating one that might still be referenced).
-static void zeKernelDestroy_entry_callback(void *btx_handle,
-                                           void *usr_data,
-                                           int64_t ts,
-                                           const char *hostname,
-                                           int64_t vpid,
-                                           uint64_t vtid,
-                                           ze_kernel_handle_t hKernel) {
-
-  auto *data = static_cast<data_t *>(usr_data);
-  data->kernelToDesct.erase({hostname, vpid, hKernel});
-}
-
 // It's possible to bypass zeKernelCreate,
 //      as a workaround for now, hoping that people will call
 //      zeKernelGetName
@@ -602,8 +584,8 @@ zeCommandQueueExecuteCommandLists_entry_callback(void *btx_handle,
   const auto commandQueueDesc = data->commandQueueToDesc[{hostname, vpid, hCommandQueue}];
   for (size_t i = 0; i < _phCommandLists_vals_length; i++) {
     for (auto &hEvent : data->commandListToEvents[{hostname, vpid, phCommandLists_vals[i]}]) {
-      auto &dq = data->eventToBtxDesct[{hostname, vpid, hEvent}];
-      for (auto &h : dq) {
+      auto &ring = data->eventToBtxDesct[{hostname, vpid, hEvent}];
+      for (auto &h : ring.entries) {
         std::get<ze_command_queue_desc_t>(h) = commandQueueDesc;
         std::get<int64_t>(h) = ts;
       }
@@ -844,16 +826,20 @@ static void event_profiling_callback(void *btx_handle,
       clockLttngDevice = it0->second;
   }
 
-  // If not IMM will be commandQueueDesc overwrited latter.
-  // Push onto the per-hEvent FIFO: the matching event_profiling_result
-  // pop_fronts to retrieve this Append's metadata. The tracer's lazy
-  // capture can produce N results for the same hEvent in submission
-  // order, one pop per result.
-  data->eventToBtxDesct[{hostname, vpid, hEvent}].push_back({vtid,         commandQueueDesc,
-                                                              hCommandList, hCommandListIsImmediate,
-                                                              hDevice,      commandName,
-                                                              ts_min,       clockLttngDevice,
-                                                              type,         ptr});
+  // If not IMM, commandQueueDesc will be overwritten later.
+  // Push onto the per-event ring. If the cursor has advanced (we've
+  // already consumed at least one result for this event), the prior
+  // ring belongs to a finished build phase — clear and start fresh.
+  auto &ring = data->eventToBtxDesct[{hostname, vpid, hEvent}];
+  if (ring.cursor > 0) {
+    ring.entries.clear();
+    ring.cursor = 0;
+  }
+  ring.entries.push_back({vtid,         commandQueueDesc,
+                          hCommandList, hCommandListIsImmediate,
+                          hDevice,      commandName,
+                          ts_min,       clockLttngDevice,
+                          type,         ptr});
   // Prepare job for non IMM
   if (!hCommandListIsImmediate)
     data->commandListToEvents[{hostname, vpid, hCommandList}].insert(hEvent);
@@ -904,21 +890,16 @@ static void event_profiling_result_callback(void *btx_handle,
 
   auto *data = static_cast<data_t *>(usr_data);
 
-  // Read the oldest pending metadata for this hEvent and consume it —
-  // FIFO matches the submission order, which is the order results
-  // arrive in for in-order cmdlists with shared signal events. Each
-  // Append pushes once; each result pops once; the deque drains
-  // exactly. We do NOT rotate (push_back + pop_front): that would
-  // be needed only for OOO-resubmit-without-reset of the same cmdlist
-  // (one set of pushes serving M*N pops), a case the universal tracer
-  // explicitly defers.
+  // Read the current ring slot for this event; advance the cursor;
+  // wrap to 0 on overflow. Resubmits re-cycle through the same ring.
   const auto it_p = data->eventToBtxDesct.find({hostname, vpid, hEvent});
-  if (it_p == data->eventToBtxDesct.cend() || it_p->second.empty())
+  if (it_p == data->eventToBtxDesct.cend() || it_p->second.entries.empty())
     return;
-  const auto popped = it_p->second.front();
-  it_p->second.pop_front();
+  auto &ring = it_p->second;
+  if (ring.cursor >= ring.entries.size()) ring.cursor = 0;
   const auto &[vtid_submission, commandQueueDesc, hCommandList, hCommandListIsImmediate, device,
-               commandName, lltngMin, clockLttngDevice, type, ptr] = popped;
+               commandName, lltngMin, clockLttngDevice, type, ptr] = ring.entries[ring.cursor];
+  ring.cursor++;
   std::string metadata = "";
   {
     std::stringstream ss_metadata;
@@ -1408,8 +1389,6 @@ void btx_register_usr_callbacks(void *btx_handle) {
   /*  Name of the Function Profiled  */
   REGISTER_ASSOCIATED_CALLBACK(zeKernelCreate_entry);
   REGISTER_ASSOCIATED_CALLBACK(zeKernelCreate_exit);
-  btx_register_callbacks_lttng_ust_ze_zeKernelDestroy_entry(btx_handle,
-                                                            &zeKernelDestroy_entry_callback);
   REGISTER_ASSOCIATED_CALLBACK(zeKernelGetName_entry);
   REGISTER_ASSOCIATED_CALLBACK(zeKernelGetName_exit);
 
diff --git a/backends/ze/btx_zeinterval_callbacks.hpp b/backends/ze/btx_zeinterval_callbacks.hpp
index 61ce47aa..165e6c13 100644
--- a/backends/ze/btx_zeinterval_callbacks.hpp
+++ b/backends/ze/btx_zeinterval_callbacks.hpp
@@ -4,7 +4,6 @@
 #include <cstddef> // Bytes
 typedef bool _Bool;
 #include <metababel/metababel.h>
-#include <deque>
 #include <optional>
 #include <stdexcept>
 #include <tuple>
@@ -94,15 +93,18 @@ struct data_s {
   std::unordered_map<hp_command_queue_t, ze_command_queue_desc_t> commandQueueToDesc;
 
   std::unordered_map<hpt_t, btx_launch_desc_t> threadToLastLaunchInfo;
-  /* FIFO of pending metadata per event handle.
-   *
-   * A single hEvent can be the signal event of multiple Appends on
-   * the same cmdlist; the tracer inserts a per-Append Query for each
-   * occurrence, producing one event_profiling_results tracepoint per
-   * use. Each Append pushes its metadata at event_profiling time;
-   * each result pop_fronts to attribute it to the correct (FIFO-
-   * oldest) Append. */
-  std::unordered_map<hp_event_t, std::deque<btx_event_desct_t>> eventToBtxDesct;
+
+  /* Per-event metadata ring. An hEvent can be the signal event of N
+   * Appends in one build phase, and the cl can be resubmitted M times,
+   * yielding M*N result events. We store the N Appends as a vector and
+   * advance `cursor` per result, wrapping at the end. A new push that
+   * arrives after the cursor advanced indicates a new build phase —
+   * we clear and start over so the ring tracks only the current phase. */
+  struct event_ring_t {
+    std::vector<btx_event_desct_t> entries;
+    size_t cursor = 0;
+  };
+  std::unordered_map<hp_event_t, event_ring_t> eventToBtxDesct;
   // Require for non IMM
   std::unordered_map<hp_command_list_t, std::unordered_set<ze_event_handle_t>> commandListToEvents;
 

From efb285b68a607f9b07c909b2d93c730bd65d02c7 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Mon, 1 Jun 2026 22:50:15 +0000
Subject: [PATCH 07/54] btx_zeinterval: tests for resubmit +
 shared-event-resubmit + cross-phase

Three new tally-side regression tests (raw tracepoint stream input,
expected output diff'd at make-check time):

  interval_profiling_resubmit_event           single Append re-fired
                                              by a closed cl on two
                                              Executes
  interval_profiling_shared_event_resubmit    two Appends share one
                                              event, cl re-executed
  interval_profiling_shared_event_xphase      shared event used in
                                              two distinct build
                                              phases (the cursor's
                                              new-phase reset path)

All three exercise the per-event cursor ring and would have failed
under the prior single-entry-overwrite or pop_front tallies.
---
 backends/ze/Makefile.am                          |  5 ++++-
 ...erval_profiling_resubmit_event.bt_text_pretty |  3 +++
 ...al_profiling_resubmit_event.thapi_text_pretty |  5 +++++
 ...rofiling_shared_event_resubmit.bt_text_pretty |  6 ++++++
 ...iling_shared_event_resubmit.thapi_text_pretty | 10 ++++++++++
 ..._profiling_shared_event_xphase.bt_text_pretty | 10 ++++++++++
 ...ofiling_shared_event_xphase.thapi_text_pretty | 16 ++++++++++++++++
 7 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 backends/ze/tests/interval_profiling_resubmit_event.bt_text_pretty
 create mode 100644 backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty
 create mode 100644 backends/ze/tests/interval_profiling_shared_event_resubmit.bt_text_pretty
 create mode 100644 backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty
 create mode 100644 backends/ze/tests/interval_profiling_shared_event_xphase.bt_text_pretty
 create mode 100644 backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty

diff --git a/backends/ze/Makefile.am b/backends/ze/Makefile.am
index b037bad8..d0ecaf54 100644
--- a/backends/ze/Makefile.am
+++ b/backends/ze/Makefile.am
@@ -281,7 +281,10 @@ TRACE_COMMON = \
 	tests/interval_profiling_fast.thapi_text_pretty \
 	tests/interval_profiling_interleave_process.thapi_text_pretty \
 	tests/interval_profiling_ignore.thapi_text_pretty \
-	tests/interval_profiling_shared_event.thapi_text_pretty
+	tests/interval_profiling_shared_event.thapi_text_pretty \
+	tests/interval_profiling_resubmit_event.thapi_text_pretty \
+	tests/interval_profiling_shared_event_resubmit.thapi_text_pretty \
+	tests/interval_profiling_shared_event_xphase.thapi_text_pretty
 
 BTX_ZE_GENERATED_SOURCE_TEST = \
         btx_source_ze_test/metababel/metababel.h \
diff --git a/backends/ze/tests/interval_profiling_resubmit_event.bt_text_pretty b/backends/ze/tests/interval_profiling_resubmit_event.bt_text_pretty
new file mode 100644
index 00000000..68e12d80
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_resubmit_event.bt_text_pretty
@@ -0,0 +1,3 @@
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
diff --git a/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty b/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty
new file mode 100644
index 00000000..b7898be0
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty
@@ -0,0 +1,5 @@
+12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.100000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 }
+12:00:00.200000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 100, globalEnd: 130, contextStart: 100, contextEnd: 130 }
diff --git a/backends/ze/tests/interval_profiling_shared_event_resubmit.bt_text_pretty b/backends/ze/tests/interval_profiling_shared_event_resubmit.bt_text_pretty
new file mode 100644
index 00000000..25a5c77b
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_shared_event_resubmit.bt_text_pretty
@@ -0,0 +1,6 @@
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 40, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 50, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
diff --git a/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty
new file mode 100644
index 00000000..c6acc6e5
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty
@@ -0,0 +1,10 @@
+12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.100000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.110000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.120000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.200000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 }
+12:00:00.210000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 100, globalEnd: 130, contextStart: 100, contextEnd: 130 }
+12:00:00.300000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 200, globalEnd: 240, contextStart: 200, contextEnd: 240 }
+12:00:00.310000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 300, globalEnd: 350, contextStart: 300, contextEnd: 350 }
diff --git a/backends/ze/tests/interval_profiling_shared_event_xphase.bt_text_pretty b/backends/ze/tests/interval_profiling_shared_event_xphase.bt_text_pretty
new file mode 100644
index 00000000..ebbbc90a
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_shared_event_xphase.bt_text_pretty
@@ -0,0 +1,10 @@
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 40, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 50, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000400, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000400, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000400, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000400, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 40, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
diff --git a/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty
new file mode 100644
index 00000000..c69025a0
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty
@@ -0,0 +1,16 @@
+12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.100000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.110000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.120000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.200000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 }
+12:00:00.210000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 100, globalEnd: 130, contextStart: 100, contextEnd: 130 }
+12:00:00.300000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 200, globalEnd: 240, contextStart: 200, contextEnd: 240 }
+12:00:00.310000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 300, globalEnd: 350, contextStart: 300, contextEnd: 350 }
+12:00:00.400000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.410000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.420000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.500000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 400, globalEnd: 410, contextStart: 400, contextEnd: 410 }
+12:00:00.510000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 500, globalEnd: 520, contextStart: 500, contextEnd: 520 }
+12:00:00.520000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 600, globalEnd: 640, contextStart: 600, contextEnd: 640 }

From 5c9aa658a98aa1050685c8b9c1558d4a3a8f1d74 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Tue, 2 Jun 2026 23:10:03 +0000
Subject: [PATCH 08/54] ze: shadow compute cl for AppendQueryKernelTimestamps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Profiled Appends on a copy-only user cl previously triggered a driver
abort because AppendQueryKernelTimestamps is rejected on copy engines.
Move the Query op to a per-(context, device) tracer-owned immediate
ASYNC compute cl (lazy-created, first compute queue group). The user
cl body now just signals an injected event; the shadow cl's Query
waits on it and writes the timestamps to the slab.

To preserve the user's signal contract, AppendBarrier on the user cl
chains user_signal off inj. To drain correctly, each slot owns its
own shadow_done event the Query signals, and drain host-syncs on it
before reading the slab.

Execute-side bookkeeping (force-sync-prior + Append-Query +
instantiate + claim in_flight_q) all moves to the Execute *epilogue*
under cl_data->mtx as a single critical section. Two reasons:
* Append-Query on the shadow cl before the user cl is in flight
  deadlocks when both share the engine (only one compute group on
  this hardware) — the pending shadow Query holds the engine and the
  user cl can never dispatch.
* Concurrent Executes on the same cl from two threads need the
  force-sync-prior to observe a sibling's claim-in_flight_q
  atomically (regression test: inorder_reg_Event_11).
---
 backends/ze/tracer_ze_helpers.include.c | 308 ++++++++++++++++++++++--
 backends/ze/ze_model.rb                 |  17 +-
 2 files changed, 294 insertions(+), 31 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 62df0ed7..29b3f221 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -92,6 +92,7 @@ struct _ze_slot;
  * after emit, the slot is dropped from the cl's list. */
 struct _ze_slot {
   struct _ze_event_h *inj;             /* tracer-owned event the Query waits on */
+  struct _ze_event_h *shadow_done;     /* tracer-owned event the Query signals; drain host-syncs on this */
   ze_event_handle_t   attr;            /* user's original signal event (NULL => inj->event) */
   size_t              off;             /* byte offset within cl_data->slab */
   /* User wait events copied at Append time (stable across rebuilds);
@@ -127,6 +128,12 @@ struct _ze_command_list_obj_data {
   pthread_mutex_t    mtx;
   unsigned char      is_immediate;
   unsigned char      is_in_order;
+
+  /* Cached on first use: device handle and context handle for this cl.
+   * Both are immutable for the life of the cl, so caching avoids the
+   * per-Append/per-Execute ZE_*_GET_*_HANDLE_PTR roundtrips. */
+  ze_device_handle_t  cached_device;
+  ze_context_handle_t cached_context;
 };
 
 struct _ze_command_list_obj_data *_ze_cls = NULL;
@@ -156,6 +163,177 @@ pthread_mutex_t _ze_cls_mutex = PTHREAD_MUTEX_INITIALIZER;
     pthread_mutex_unlock(&_ze_cls_mutex);                                                          \
   } while (0)
 
+/* Per-device cache of the first COMPUTE queue group ordinal. The lookup
+ * is read-mostly: scan zeDeviceGetCommandQueueGroupProperties once,
+ * remember the answer. valid=0 means "we already checked and there's no
+ * compute group on this device" — treated as fatal at use sites. */
+struct _ze_compute_ord_entry {
+  ze_device_handle_t device;
+  uint32_t           ordinal;
+  unsigned char      valid;
+  UT_hash_handle     hh;
+};
+static struct _ze_compute_ord_entry *_ze_compute_ords = NULL;
+static pthread_mutex_t _ze_compute_ords_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Returns the first COMPUTE queue group ordinal for device, or (uint32_t)-1
+ * if the device exposes no compute group (fatal — caller should bail). */
+static uint32_t _get_compute_ordinal(ze_device_handle_t device) {
+  pthread_mutex_lock(&_ze_compute_ords_mutex);
+  struct _ze_compute_ord_entry *e = NULL;
+  HASH_FIND_PTR(_ze_compute_ords, &device, e);
+  if (e) {
+    uint32_t r = e->valid ? e->ordinal : (uint32_t)-1;
+    pthread_mutex_unlock(&_ze_compute_ords_mutex);
+    return r;
+  }
+  pthread_mutex_unlock(&_ze_compute_ords_mutex);
+
+  /* Slow path: scan queue groups outside the lock. */
+  uint32_t n_groups = 0;
+  if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, NULL)
+      != ZE_RESULT_SUCCESS || n_groups == 0)
+    return (uint32_t)-1;
+  ze_command_queue_group_properties_t *groups =
+      (ze_command_queue_group_properties_t *)calloc(n_groups, sizeof(*groups));
+  if (!groups) return (uint32_t)-1;
+  for (uint32_t i = 0; i < n_groups; ++i)
+    groups[i].stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES;
+  uint32_t found = (uint32_t)-1;
+  if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, groups)
+      == ZE_RESULT_SUCCESS) {
+    for (uint32_t i = 0; i < n_groups; ++i)
+      if (groups[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
+        found = i; break;
+      }
+  }
+  free(groups);
+
+  pthread_mutex_lock(&_ze_compute_ords_mutex);
+  /* Re-check under the lock — another thread may have populated. */
+  HASH_FIND_PTR(_ze_compute_ords, &device, e);
+  if (!e) {
+    e = (struct _ze_compute_ord_entry *)calloc(1, sizeof(*e));
+    if (e) {
+      e->device  = device;
+      e->ordinal = found;
+      e->valid   = (found != (uint32_t)-1) ? 1 : 0;
+      HASH_ADD_PTR(_ze_compute_ords, device, e);
+    }
+  } else {
+    found = e->valid ? e->ordinal : (uint32_t)-1;
+  }
+  pthread_mutex_unlock(&_ze_compute_ords_mutex);
+  return found;
+}
+
+/* Per-(context, device) tracer-owned immediate OOO compute cl used to
+ * host the AppendQueryKernelTimestamps op. The Query can't live on the
+ * user's cl when that cl is on a copy-only queue group (driver aborts),
+ * and we use the shadow cl uniformly for all engines so the code path
+ * is identical regardless of user-cl kind. */
+struct _ze_shadow_key {
+  ze_context_handle_t context;
+  ze_device_handle_t  device;
+};
+struct _ze_shadow_cl {
+  struct _ze_shadow_key    key;
+  ze_command_list_handle_t cl;
+  pthread_mutex_t          mtx;
+  UT_hash_handle           hh;
+};
+static struct _ze_shadow_cl *_ze_shadow_cls = NULL;
+static pthread_mutex_t _ze_shadow_cls_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Returns the shadow cl for (context, device), creating it lazily on
+ * first use. Returns NULL if the device has no compute group (fatal:
+ * we log to stderr) or if creation fails. */
+static struct _ze_shadow_cl *_get_shadow_cl(ze_context_handle_t context,
+                                            ze_device_handle_t device) {
+  struct _ze_shadow_key key = { context, device };
+  pthread_mutex_lock(&_ze_shadow_cls_mutex);
+  struct _ze_shadow_cl *sh = NULL;
+  HASH_FIND(hh, _ze_shadow_cls, &key, sizeof(key), sh);
+  if (sh) { pthread_mutex_unlock(&_ze_shadow_cls_mutex); return sh; }
+  pthread_mutex_unlock(&_ze_shadow_cls_mutex);
+
+  /* Slow path: create outside the registry lock. */
+  uint32_t ord = _get_compute_ordinal(device);
+  if (ord == (uint32_t)-1) {
+    fprintf(stderr, "THAPI: device %p has no COMPUTE queue group; "
+                    "cannot create shadow cl. Profiling disabled for "
+                    "command lists on this device.\n", (void *)device);
+    return NULL;
+  }
+  /* ASYNCHRONOUS mode is critical: with SYNCHRONOUS (the DEFAULT),
+   * each AppendQueryKernelTimestamps on this immediate cl blocks until
+   * the Query completes — which it can't, because Query is waiting on
+   * inj, and inj is signaled by the user cl's kernel that hasn't been
+   * submitted yet (we're called from the user's Execute prologue).
+   * Deadlock. ASYNCHRONOUS lets the Append return immediately and the
+   * Query run device-side at its own pace. */
+  ze_command_queue_desc_t qd = {
+      ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, NULL, ord, 0, 0,
+      ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL };
+  ze_command_list_handle_t new_cl = NULL;
+  if (ZE_COMMAND_LIST_CREATE_IMMEDIATE_PTR(context, device, &qd, &new_cl)
+        != ZE_RESULT_SUCCESS || !new_cl) {
+    fprintf(stderr, "THAPI: failed to create shadow cl for "
+                    "context=%p device=%p\n", (void *)context, (void *)device);
+    return NULL;
+  }
+
+  pthread_mutex_lock(&_ze_shadow_cls_mutex);
+  HASH_FIND(hh, _ze_shadow_cls, &key, sizeof(key), sh);
+  if (sh) {
+    /* Lost the race; destroy ours and return the winner. */
+    pthread_mutex_unlock(&_ze_shadow_cls_mutex);
+    ZE_COMMAND_LIST_DESTROY_PTR(new_cl);
+    return sh;
+  }
+  sh = (struct _ze_shadow_cl *)calloc(1, sizeof(*sh));
+  if (!sh) {
+    pthread_mutex_unlock(&_ze_shadow_cls_mutex);
+    ZE_COMMAND_LIST_DESTROY_PTR(new_cl);
+    return NULL;
+  }
+  sh->key = key;
+  sh->cl  = new_cl;
+  pthread_mutex_init(&sh->mtx, NULL);
+  HASH_ADD(hh, _ze_shadow_cls, key, sizeof(sh->key), sh);
+  pthread_mutex_unlock(&_ze_shadow_cls_mutex);
+  return sh;
+}
+
+/* Append AppendQueryKernelTimestamps on the shadow cl: wait on inj,
+ * signal shadow_done, write timestamps into slab[*off]. Serialized on
+ * sh->mtx because L0 doesn't allow concurrent Appends to one cl.
+ * Returns 0 on success, -1 on failure. */
+static int _shadow_append_query(struct _ze_shadow_cl *sh,
+                                ze_event_handle_t inj_event,
+                                void *slab, size_t *off,
+                                ze_event_handle_t shadow_done_event) {
+  pthread_mutex_lock(&sh->mtx);
+  ze_result_t r = ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(
+      sh->cl, 1, &inj_event, slab, off,
+      /*hSignalEvent=*/ shadow_done_event,
+      /*numWaitEvents=*/ 1, &inj_event);
+  pthread_mutex_unlock(&sh->mtx);
+  return (r == ZE_RESULT_SUCCESS) ? 0 : -1;
+}
+
+/* Lazy-cached device handle for a cl. The device is immutable for the
+ * cl's lifetime; we just want to avoid repeating the L0 call. */
+static ze_device_handle_t _cl_get_device(struct _ze_command_list_obj_data *cl_data) {
+  if (cl_data->cached_device) return cl_data->cached_device;
+  ze_device_handle_t d = NULL;
+  if (ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR((ze_command_list_handle_t)cl_data->ptr, &d)
+      != ZE_RESULT_SUCCESS)
+    return NULL;
+  cl_data->cached_device = d;
+  return d;
+}
+
 static inline void _on_create_command_list(ze_command_list_handle_t command_list,
                                             int immediate, int in_order) {
   struct _ze_command_list_obj_data *cl_data = NULL;
@@ -394,6 +572,7 @@ static inline int _cl_slots_grow(struct _ze_command_list_obj_data *cl_data) {
 static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_data,
                                         ze_context_handle_t ctx,
                                         struct _ze_event_h *inj,
+                                        struct _ze_event_h *shadow_done,
                                         ze_event_handle_t attr,
                                         ze_event_handle_t *waits,
                                         uint32_t n_waits) {
@@ -401,8 +580,9 @@ static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_dat
   uint32_t idx = cl_data->n_slots++;
   struct _ze_slot *s = &cl_data->slots[idx];
   if (_cl_slab_ensure(cl_data, ctx, idx + 1) != 0) return NULL;
-  s->inj   = inj;
-  s->attr  = attr;
+  s->inj         = inj;
+  s->shadow_done = shadow_done;
+  s->attr        = attr;
   s->off   = (size_t)idx * sizeof(ze_kernel_timestamp_result_t);
   s->live  = 0;
   s->preds = NULL; s->n_preds = 0;
@@ -464,35 +644,76 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   }
   inj->context = ctx;
 
+  /* Tracer-owned fence event: Query signals it, drain host-waits on it
+   * before reading the slab. Decouples drain-time correctness from any
+   * user sync on user_signal — required because in step 2 the Query
+   * moves to a separate shadow cl whose completion isn't implied by
+   * user-level sync. Allocated here so the failure path can release
+   * inj symmetrically. */
+  struct _ze_event_h *shadow_done = _get_profiling_event(command_list);
+  if (!shadow_done) { PUT_ZE_EVENT(inj); return; }
+  shadow_done->context = ctx;
+
   struct _ze_command_list_obj_data *cl_data = NULL;
   FIND_AND_DEL_ZE_CL(&command_list, cl_data);
-  if (!cl_data) { PUT_ZE_EVENT(inj); return; }
+  if (!cl_data) { PUT_ZE_EVENT(inj); PUT_ZE_EVENT(shadow_done); return; }
   pthread_mutex_lock(&cl_data->mtx);
 
-  struct _ze_slot *s = _cl_slot_append(cl_data, ctx, inj, user_signal,
-                                       user_waits, user_n_waits);
+  struct _ze_slot *s = _cl_slot_append(cl_data, ctx, inj, shadow_done,
+                                       user_signal, user_waits, user_n_waits);
   if (!s) {
     pthread_mutex_unlock(&cl_data->mtx);
     ADD_ZE_CL(cl_data);
     PUT_ZE_EVENT(inj);
+    PUT_ZE_EVENT(shadow_done);
     return;
   }
 
   ze_event_handle_t wait_ev = inj->event;
-  ze_result_t r = ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(
-      command_list, 1, &wait_ev, cl_data->slab, &s->off,
-      /*hSignalEvent=*/ user_signal,
-      /*numWaitEvents=*/ 1, &wait_ev);
-  if (r != ZE_RESULT_SUCCESS) {
-    /* Roll the slot back: it was the last one appended. */
-    free(s->waits);
-    cl_data->n_slots--;
-    pthread_mutex_unlock(&cl_data->mtx);
-    ADD_ZE_CL(cl_data);
-    PUT_ZE_EVENT(inj);
-    return;
+
+  /* Chain user_signal off inj so the user's wait still completes once
+   * the underlying op has fired. We swapped user's hSignalEvent for
+   * inj in the prologue, so nothing else on this cl signals
+   * user_signal. AppendBarrier (not AppendSignalEvent) because we need
+   * to both wait on inj and signal user_signal; SignalEvent doesn't
+   * take a wait list. Skipped when user passed NULL. */
+  if (user_signal) {
+    ze_result_t rs = ZE_COMMAND_LIST_APPEND_BARRIER_PTR(
+        command_list, user_signal, 1, &wait_ev);
+    if (rs != ZE_RESULT_SUCCESS) {
+      free(s->waits);
+      cl_data->n_slots--;
+      pthread_mutex_unlock(&cl_data->mtx);
+      ADD_ZE_CL(cl_data);
+      PUT_ZE_EVENT(inj);
+      PUT_ZE_EVENT(shadow_done);
+      return;
+    }
+  }
+
+  /* The Query Append now lives on the per-(context, device) shadow
+   * compute cl rather than the user cl. This is what lets us profile
+   * copy-only user cls — copy engines reject AppendQueryKernelTimestamps
+   * but the shadow cl is always compute. For regular user cls we defer
+   * the Append to Execute prologue (the user cl hasn't run yet, so
+   * nothing is signaling inj — Appending the Query on an immediate
+   * shadow cl now would let it fire too early on a stale inj). */
+  if (cl_data->is_immediate) {
+    cl_data->cached_context = ctx;
+    ze_device_handle_t dev = _cl_get_device(cl_data);
+    struct _ze_shadow_cl *sh = dev ? _get_shadow_cl(ctx, dev) : NULL;
+    if (!sh || _shadow_append_query(sh, inj->event, cl_data->slab, &s->off,
+                                     shadow_done->event) != 0) {
+      free(s->waits);
+      cl_data->n_slots--;
+      pthread_mutex_unlock(&cl_data->mtx);
+      ADD_ZE_CL(cl_data);
+      PUT_ZE_EVENT(inj);
+      PUT_ZE_EVENT(shadow_done);
+      return;
+    }
+    _slot_instantiate(cl_data, s);
   }
-  if (cl_data->is_immediate) _slot_instantiate(cl_data, s);
   pthread_mutex_unlock(&cl_data->mtx);
   ADD_ZE_CL(cl_data);
 }
@@ -507,6 +728,16 @@ static void _slot_drain(struct _ze_command_list_obj_data *cl_data,
   for (uint32_t i = 0; i < s->n_preds; ++i)
     _slot_drain(cl_data, s->preds[i]);
   s->live = 0;
+  /* Block until our Query op has actually fired, then reset the fence
+   * so the next Execute round starts with a clean event. We can't
+   * trust the caller's sync to have covered the Query — in step 2 the
+   * Query will live on a separate shadow cl, and even in step 1 this
+   * makes the slab read unconditional rather than relying on cl-order
+   * implications. */
+  if (s->shadow_done && s->shadow_done->event) {
+    ZE_EVENT_HOST_SYNCHRONIZE_PTR(s->shadow_done->event, UINT64_MAX);
+    ZE_EVENT_HOST_RESET_PTR(s->shadow_done->event);
+  }
   ze_event_handle_t attr = s->attr ? s->attr : (s->inj ? s->inj->event : NULL);
   if (cl_data->slab && attr &&
       tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results)) {
@@ -580,12 +811,24 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
   pthread_mutex_unlock(&_ze_cls_mutex);
 }
 
-/* zeCommandQueueExecuteCommandLists PROLOGUE.
- * Per cl: if in_flight_q is set from a prior Execute, force-sync that
- * queue and drain first (so the prior run's slab is read before the new
- * Execute overwrites it). Then instantiate every slot for the new run
- * and stamp in_flight_q. */
-static void _on_execute_command_lists_prologue(ze_command_queue_handle_t hQueue,
+/* zeCommandQueueExecuteCommandLists EPILOGUE — runs AFTER L0's actual
+ * Execute has returned, with the user cl in flight on its engine.
+ *
+ * Three things happen here, all under cl_data->mtx so a concurrent
+ * Execute (or Sync) on another thread sees them atomically:
+ *
+ *   1) If in_flight_q is set from a prior Execute by *another* thread,
+ *      force-sync that queue and drain the slab before we overwrite it
+ *      (regression test: inorder_reg_Event_11 — same cl on two queues
+ *      from two threads, expect both rounds' timings).
+ *   2) Append a fresh Query on the per-(ctx,device) shadow cl for each
+ *      slot. Must run AFTER L0 Execute (not before) — Appending on the
+ *      shadow cl before the user cl is in flight deadlocks when the
+ *      shadow shares the engine with the user cl (see
+ *      tests/bugs/query_on_separate_cl_regular_user_cl).
+ *   3) Stamp in_flight_q = hQueue and instantiate the slot, publishing
+ *      it to the dep graph + as the "owner" of this queue. */
+static void _on_execute_command_lists_epilogue(ze_command_queue_handle_t hQueue,
                                                 uint32_t numCommandLists,
                                                 ze_command_list_handle_t *phCommandLists) {
   for (uint32_t i = 0; i < numCommandLists; ++i) {
@@ -597,8 +840,21 @@ static void _on_execute_command_lists_prologue(ze_command_queue_handle_t hQueue,
       ZE_COMMAND_QUEUE_SYNCHRONIZE_PTR(cl_data->in_flight_q, UINT64_MAX);
       _cl_drain(cl_data);
     }
-    for (uint32_t j = 0; j < cl_data->n_slots; ++j)
-      _slot_instantiate(cl_data, &cl_data->slots[j]);
+    ze_context_handle_t ctx = cl_data->cached_context;
+    if (!ctx) {
+      if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(phCommandLists[i], &ctx) == ZE_RESULT_SUCCESS)
+        cl_data->cached_context = ctx;
+    }
+    ze_device_handle_t dev = _cl_get_device(cl_data);
+    struct _ze_shadow_cl *sh = (ctx && dev) ? _get_shadow_cl(ctx, dev) : NULL;
+    for (uint32_t j = 0; j < cl_data->n_slots; ++j) {
+      struct _ze_slot *slot = &cl_data->slots[j];
+      if (!sh || !slot->inj || !slot->shadow_done) continue;
+      if (_shadow_append_query(sh, slot->inj->event, cl_data->slab,
+                               &slot->off, slot->shadow_done->event) != 0)
+        continue;  /* slot stays not-live this round; we miss this timing */
+      _slot_instantiate(cl_data, slot);
+    }
     cl_data->in_flight_q = hQueue;
     pthread_mutex_unlock(&cl_data->mtx);
   }
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index e54ccbd9..b5c0d70a 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -159,11 +159,18 @@ def upper_snake_case(str)
 # synchronized before they reset or destroy the cmdlist, so all our
 # slots are already drained.
 
-# Force-sync prior Execute on a cl before the next Execute would
-# overwrite its slab.
-register_prologue 'zeCommandQueueExecuteCommandLists', <<EOF
-  if (_do_profile && numCommandLists > 0 && phCommandLists)
-    _on_execute_command_lists_prologue(hCommandQueue, numCommandLists, phCommandLists);
+# Epilogue runs after L0's actual submission has returned. ALL the
+# tracer's bookkeeping for Execute happens here (no prologue) so that
+# concurrent Executes / Syncs from other threads observe in_flight_q
+# atomically — the force-sync-prior + Append-Query + claim-in_flight_q
+# are one critical section.
+#
+# The Append-Query specifically MUST run after L0 submit, not before:
+# the shadow cl can share the engine with the user cl, and a pending
+# shadow Query op holds the engine, deadlocking the user cl.
+register_epilogue 'zeCommandQueueExecuteCommandLists', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && numCommandLists > 0 && phCommandLists)
+    _on_execute_command_lists_epilogue(hCommandQueue, numCommandLists, phCommandLists);
 EOF
 
 # Sync hooks: walk dependency edges from the synced anchor and drain

From 29904faf7918b1f8bc4f6d4401d25feea9cf19b0 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 19:59:58 +0000
Subject: [PATCH 09/54] ze: preallocate slot array to keep slot pointers stable

Slots stored raw pointers to each other in preds[] and were referenced
from the global latest[ev] -> slot map. The slot array was a realloc'd
buffer starting at cap 8 and doubling; on any realloc that moved the
buffer, every stored slot pointer became dangling, and the dep-graph
walks at drain time silently skipped any slot allocated before the
last realloc.

In practice this lost timing records non-uniformly: a sync that
walked preds back through the chain hit a dangling pointer and the
recursion ended early (or hit garbage that happened to look not-live).
Standalone reproducer in tests/bugs/missing_drain_dag/repro.c:
3 waves of 12 Appends each. Before this fix, drain emits 20/36
deterministically (slots 8..11 of each wave). After: 36/36.

Fix is to allocate the slot array once at first use, sized to the
slab cap (64). The slab itself is already hard-capped at 64, so
allowing slot growth past it gained nothing.
---
 backends/ze/tracer_ze_helpers.include.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 29b3f221..55700391 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -553,15 +553,20 @@ static int _cl_slab_ensure(struct _ze_command_list_obj_data *cl_data,
   return 0;
 }
 
-/* Universal scheme: grow the slot array if full. */
+/* Slot capacity is fixed at _ZE_SLAB_SLOTS_INITIAL to keep slot
+ * addresses stable for the cl's lifetime. We store raw slot pointers
+ * in `latest[ev] -> slot` and in other slots' `preds[]`; realloc would
+ * invalidate every one of them, silently breaking dep-graph walks
+ * (see tests/bugs/missing_drain_dag). The slab itself caps at the
+ * same number, so growing slots beyond it gains nothing anyway. */
 static inline int _cl_slots_grow(struct _ze_command_list_obj_data *cl_data) {
   if (cl_data->n_slots < cl_data->cap_slots) return 0;
-  uint32_t new_cap = cl_data->cap_slots ? cl_data->cap_slots * 2 : 8;
-  struct _ze_slot *grown = (struct _ze_slot *)realloc(
-      cl_data->slots, new_cap * sizeof(struct _ze_slot));
-  if (!grown) return -1;
-  cl_data->slots = grown;
-  cl_data->cap_slots = new_cap;
+  if (cl_data->cap_slots != 0) return -1;  /* already at cap, no growth */
+  struct _ze_slot *fresh = (struct _ze_slot *)calloc(
+      _ZE_SLAB_SLOTS_INITIAL, sizeof(struct _ze_slot));
+  if (!fresh) return -1;
+  cl_data->slots = fresh;
+  cl_data->cap_slots = _ZE_SLAB_SLOTS_INITIAL;
   return 0;
 }
 

From cbb928bd190d54e21a0680b53699eb04939eef91 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 21:15:59 +0000
Subject: [PATCH 10/54] ze: drain preds via slot->owner, not caller cl_data

_slot_drain used to recurse with the caller's cl_data, but pred
slots can live in another cl when the user signal chain crosses
cls (cl_A signal=e1, cl_B wait=e1 signal=e2, sync(e2) only).
Recursing with caller's cl_data read cl_B->slab + slot_A->off and
emitted slot_A's tracepoint with cl_B's timestamps.

Each slot now carries an owner back-pointer set at _cl_slot_append.
_slot_drain drops its cl_data argument and reads from s->owner->slab,
so cross-cl preds resolve to the right slab. _on_sync_drain_event
collapses from the cl-walk to a direct s->owner->mtx lock.

Reproducer: tests/matrix/inorder_reg_Event_15 (busy_b/busy_a>=3
ratio); fails before this commit, passes after. Full matrix 43/43.
---
 backends/ze/tracer_ze_helpers.include.c | 59 ++++++++++++-------------
 1 file changed, 29 insertions(+), 30 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 55700391..75281310 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -91,16 +91,17 @@ struct _ze_slot;
  * synced anchor and drain everything reachable. Drain is pop semantics:
  * after emit, the slot is dropped from the cl's list. */
 struct _ze_slot {
+  struct _ze_command_list_obj_data *owner; /* cl_data this slot lives in (==> .slab to read at drain) */
   struct _ze_event_h *inj;             /* tracer-owned event the Query waits on */
   struct _ze_event_h *shadow_done;     /* tracer-owned event the Query signals; drain host-syncs on this */
   ze_event_handle_t   attr;            /* user's original signal event (NULL => inj->event) */
-  size_t              off;             /* byte offset within cl_data->slab */
+  size_t              off;             /* byte offset within owner->slab */
   /* User wait events copied at Append time (stable across rebuilds);
    * preds[] is computed at instantiate from waits[] by looking up
    * latest[w] for each w. */
   ze_event_handle_t  *waits;
   uint32_t            n_waits;
-  struct _ze_slot   **preds;           /* points at slots whose drain must come first */
+  struct _ze_slot   **preds;           /* points at slots whose drain must come first (may be in another cl) */
   uint32_t            n_preds;
   unsigned char       live;            /* in-flight (instantiated, not drained) */
 };
@@ -585,6 +586,7 @@ static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_dat
   uint32_t idx = cl_data->n_slots++;
   struct _ze_slot *s = &cl_data->slots[idx];
   if (_cl_slab_ensure(cl_data, ctx, idx + 1) != 0) return NULL;
+  s->owner       = cl_data;
   s->inj         = inj;
   s->shadow_done = shadow_done;
   s->attr        = attr;
@@ -726,12 +728,20 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
 /* Drain one slot. Recurses on its preds first, then emits this slot
  * and pops it. Pop = clear inj/waits/preds; the holed entry is reused
  * by later _cl_slot_append calls. Safe to call on already-drained
- * (live=0) slot. */
-static void _slot_drain(struct _ze_command_list_obj_data *cl_data,
-                        struct _ze_slot *s) {
+ * (live=0) slot.
+ *
+ * Reads use s->owner->slab — preds may live in a different cl than the
+ * caller (cross-cl signal chains), so we cannot use the caller's slab.
+ *
+ * Locking: a pred on another cl is read/mutated WITHOUT taking its
+ * owner's mtx. That's safe in the current model because slot pointers
+ * are stable (cap is fixed, never realloc'd) and live-flag clearing
+ * races are benign — the worst case is one extra tracepoint emit, not
+ * a UAF. Take the pred's mtx only if we ever start freeing slot arrays. */
+static void _slot_drain(struct _ze_slot *s) {
   if (!s || !s->live) return;
   for (uint32_t i = 0; i < s->n_preds; ++i)
-    _slot_drain(cl_data, s->preds[i]);
+    _slot_drain(s->preds[i]);
   s->live = 0;
   /* Block until our Query op has actually fired, then reset the fence
    * so the next Execute round starts with a clean event. We can't
@@ -744,10 +754,10 @@ static void _slot_drain(struct _ze_command_list_obj_data *cl_data,
     ZE_EVENT_HOST_RESET_PTR(s->shadow_done->event);
   }
   ze_event_handle_t attr = s->attr ? s->attr : (s->inj ? s->inj->event : NULL);
-  if (cl_data->slab && attr &&
+  if (s->owner && s->owner->slab && attr &&
       tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results)) {
     ze_kernel_timestamp_result_t r =
-        *(ze_kernel_timestamp_result_t *)((char *)cl_data->slab + s->off);
+        *(ze_kernel_timestamp_result_t *)((char *)s->owner->slab + s->off);
     do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, attr,
                   ZE_RESULT_SUCCESS, ZE_RESULT_SUCCESS,
                   r.global.kernelStart, r.global.kernelEnd,
@@ -762,7 +772,7 @@ static void _slot_drain(struct _ze_command_list_obj_data *cl_data,
 /* Drain every live slot in a cl. */
 static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
   for (uint32_t i = 0; i < cl_data->n_slots; ++i)
-    _slot_drain(cl_data, &cl_data->slots[i]);
+    _slot_drain(&cl_data->slots[i]);
   cl_data->in_flight_q = NULL;
 }
 
@@ -793,27 +803,16 @@ static void _on_sync_drain_queue(ze_command_queue_handle_t hQueue) {
 /* Drain the slot that most recently signaled `ev` (recursing on preds). */
 static void _on_sync_drain_event(ze_event_handle_t ev) {
   struct _ze_slot *s = _latest_get(ev);
-  if (!s) return;
-  /* The slot lives inside SOME cl_data->slots[]. To recurse safely
-   * under cl_data->mtx, we'd need to know which cl. For now, walk all
-   * cls and find the owner (rare path, bounded). */
-  pthread_mutex_lock(&_ze_cls_mutex);
-  struct _ze_command_list_obj_data *cl_data = NULL, *tmp = NULL;
-  HASH_ITER(hh, _ze_cls, cl_data, tmp) {
-    if (s >= cl_data->slots && s < cl_data->slots + cl_data->cap_slots) {
-      pthread_mutex_lock(&cl_data->mtx);
-      _slot_drain(cl_data, s);
-      /* Slot's drain may not have cleared in_flight_q if other slots
-       * are still live; check whether anything remains. */
-      int any_live = 0;
-      for (uint32_t i = 0; i < cl_data->n_slots; ++i)
-        if (cl_data->slots[i].live) { any_live = 1; break; }
-      if (!any_live) cl_data->in_flight_q = NULL;
-      pthread_mutex_unlock(&cl_data->mtx);
-      break;
-    }
-  }
-  pthread_mutex_unlock(&_ze_cls_mutex);
+  if (!s || !s->owner) return;
+  pthread_mutex_lock(&s->owner->mtx);
+  _slot_drain(s);
+  /* The drained slot may have left siblings live; only clear
+   * in_flight_q if nothing in this cl remains in flight. */
+  int any_live = 0;
+  for (uint32_t i = 0; i < s->owner->n_slots; ++i)
+    if (s->owner->slots[i].live) { any_live = 1; break; }
+  if (!any_live) s->owner->in_flight_q = NULL;
+  pthread_mutex_unlock(&s->owner->mtx);
 }
 
 /* zeCommandQueueExecuteCommandLists EPILOGUE — runs AFTER L0's actual

From d8598db4597389d3a36906ea300cddfd199d222a Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 21:19:28 +0000
Subject: [PATCH 11/54] ze: document why _slot_drain needs no cycle guard

Code review flagged the unguarded recursion. In practice cycles are
impossible: in-order preds index strictly downward, and latest[]
preds always point to slots published earlier. Forming a cycle
requires the user to declare two Appends each waiting on the other's
signal event, which deadlocks L0 itself before we ever drain.

The live-clear-before-recurse would also stop a cycle if one
appeared, so we are belt-and-suspenders by accident. Comment only,
no code change.
---
 backends/ze/tracer_ze_helpers.include.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 75281310..17924e36 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -737,7 +737,16 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
  * owner's mtx. That's safe in the current model because slot pointers
  * are stable (cap is fixed, never realloc'd) and live-flag clearing
  * races are benign — the worst case is one extra tracepoint emit, not
- * a UAF. Take the pred's mtx only if we ever start freeing slot arrays. */
+ * a UAF. Take the pred's mtx only if we ever start freeing slot arrays.
+ *
+ * No cycle guard: the live-clear-before-recurse below would already
+ * stop a cycle (re-entry hits live==0 and returns), but cycles are also
+ * impossible by construction. preds come from two sources:
+ *   - in-order prev slot in the same cl: strictly lower slot index, DAG.
+ *   - latest[wait_event]: a slot published BEFORE us. Forming a cycle
+ *     requires the user to declare two Appends each waiting on the
+ *     other's signal event — L0 itself would deadlock the GPU on that,
+ *     so we would never observe a sync return to reach drain. */
 static void _slot_drain(struct _ze_slot *s) {
   if (!s || !s->live) return;
   for (uint32_t i = 0; i < s->n_preds; ++i)

From 6a855b84110cb766024746cff16e6fbe0522886f Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 21:19:51 +0000
Subject: [PATCH 12/54] ze: drop wrong claim about live-clear stopping cycles

The previous commit said live-clear-before-recurse would stop a
cycle. live is cleared AFTER the recursion (see below), not before.
The construction argument alone is the reason no guard is needed;
remove the misleading sentence.
---
 backends/ze/tracer_ze_helpers.include.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 17924e36..795570fe 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -739,9 +739,8 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
  * races are benign — the worst case is one extra tracepoint emit, not
  * a UAF. Take the pred's mtx only if we ever start freeing slot arrays.
  *
- * No cycle guard: the live-clear-before-recurse below would already
- * stop a cycle (re-entry hits live==0 and returns), but cycles are also
- * impossible by construction. preds come from two sources:
+ * No cycle guard: cycles are impossible by construction. preds come
+ * from two sources:
  *   - in-order prev slot in the same cl: strictly lower slot index, DAG.
  *   - latest[wait_event]: a slot published BEFORE us. Forming a cycle
  *     requires the user to declare two Appends each waiting on the

From 72246327d1c5583fba65a629075ea29fb877b969 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 21:23:30 +0000
Subject: [PATCH 13/54] ze: fold slab/slots alloc into _cl_slot_append, bump
 n_slots last

Two single-use helpers (_cl_slab_ensure, _cl_slots_grow) are inlined
into their only caller. The cap-check and the two lazy allocations
read better adjacent than dispatched through helpers.

Side-effect fix: previous code bumped n_slots BEFORE attempting the
slab alloc, so a failing first alloc left a permanent hole at index 0
(n_slots=1 but no slab). New order: cap-check, slots-alloc,
slab-alloc, slot fill-in, THEN n_slots++. An OOM at any allocation
returns NULL with cl_data unchanged.

Rollbacks in _universal_record_append still apply: they undo a fully
successful append after a downstream Barrier/shadow Append fails,
which is correct under either ordering.

Net -10 lines. Full matrix 43/43.
---
 backends/ze/tracer_ze_helpers.include.c | 68 +++++++++++--------------
 1 file changed, 29 insertions(+), 39 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 795570fe..bf6f6541 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -536,45 +536,20 @@ static struct _ze_event_h *_get_profiling_event(ze_command_list_handle_t command
   return NULL;
 }
 
-/* Lazy-allocate the cl's host-visible slab buffer (fixed size, enough
- * for _ZE_SLAB_SLOTS_INITIAL timestamps). Returns 0 on success, -1 if
- * already allocated and the requested slot index is out of range, or
- * if the allocation itself failed. */
-static int _cl_slab_ensure(struct _ze_command_list_obj_data *cl_data,
-                           ze_context_handle_t ctx, uint32_t slot_idx) {
-  if (slot_idx >= _ZE_SLAB_SLOTS_INITIAL) return -1;
-  if (cl_data->slab) return 0;
-  size_t bytes = (size_t)_ZE_SLAB_SLOTS_INITIAL * sizeof(ze_kernel_timestamp_result_t);
-  ze_host_mem_alloc_desc_t hd = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, NULL, 0};
-  void *buf = NULL;
-  if (ZE_MEM_ALLOC_HOST_PTR(ctx, &hd, bytes, sizeof(uint64_t), &buf) != ZE_RESULT_SUCCESS || !buf)
-    return -1;
-  memset(buf, 0, bytes);
-  cl_data->slab = buf;
-  return 0;
-}
-
-/* Slot capacity is fixed at _ZE_SLAB_SLOTS_INITIAL to keep slot
- * addresses stable for the cl's lifetime. We store raw slot pointers
- * in `latest[ev] -> slot` and in other slots' `preds[]`; realloc would
- * invalidate every one of them, silently breaking dep-graph walks
- * (see tests/bugs/missing_drain_dag). The slab itself caps at the
- * same number, so growing slots beyond it gains nothing anyway. */
-static inline int _cl_slots_grow(struct _ze_command_list_obj_data *cl_data) {
-  if (cl_data->n_slots < cl_data->cap_slots) return 0;
-  if (cl_data->cap_slots != 0) return -1;  /* already at cap, no growth */
-  struct _ze_slot *fresh = (struct _ze_slot *)calloc(
-      _ZE_SLAB_SLOTS_INITIAL, sizeof(struct _ze_slot));
-  if (!fresh) return -1;
-  cl_data->slots = fresh;
-  cl_data->cap_slots = _ZE_SLAB_SLOTS_INITIAL;
-  return 0;
-}
-
 /* Allocate one new slot at the end of the cl's slot list. Slots are
  * never reused within a cl's lifetime — the cl body's Query op
  * hard-codes inj and off; the slot is the host-side mirror that gets
- * re-instantiated on every Execute. */
+ * re-instantiated on every Execute.
+ *
+ * Capacity is fixed at _ZE_SLAB_SLOTS_INITIAL to keep slot addresses
+ * stable for the cl's lifetime. We store raw slot pointers in
+ * `latest[ev] -> slot` and in other slots' `preds[]`; realloc would
+ * invalidate every one of them, silently breaking dep-graph walks
+ * (see tests/bugs/missing_drain_dag). The slab is sized to match, so
+ * growing slots beyond it would gain nothing anyway.
+ *
+ * Allocations (slots array and slab) happen BEFORE n_slots is bumped,
+ * so an OOM does not leave a hole in the slot indexing. */
 static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_data,
                                         ze_context_handle_t ctx,
                                         struct _ze_event_h *inj,
@@ -582,10 +557,24 @@ static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_dat
                                         ze_event_handle_t attr,
                                         ze_event_handle_t *waits,
                                         uint32_t n_waits) {
-  if (_cl_slots_grow(cl_data) != 0) return NULL;
-  uint32_t idx = cl_data->n_slots++;
+  if (cl_data->n_slots >= _ZE_SLAB_SLOTS_INITIAL) return NULL;
+  if (!cl_data->slots) {
+    cl_data->slots = (struct _ze_slot *)calloc(
+        _ZE_SLAB_SLOTS_INITIAL, sizeof(struct _ze_slot));
+    if (!cl_data->slots) return NULL;
+    cl_data->cap_slots = _ZE_SLAB_SLOTS_INITIAL;
+  }
+  if (!cl_data->slab) {
+    size_t bytes = (size_t)_ZE_SLAB_SLOTS_INITIAL * sizeof(ze_kernel_timestamp_result_t);
+    ze_host_mem_alloc_desc_t hd = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, NULL, 0};
+    void *buf = NULL;
+    if (ZE_MEM_ALLOC_HOST_PTR(ctx, &hd, bytes, sizeof(uint64_t), &buf) != ZE_RESULT_SUCCESS || !buf)
+      return NULL;
+    memset(buf, 0, bytes);
+    cl_data->slab = buf;
+  }
+  uint32_t idx = cl_data->n_slots;
   struct _ze_slot *s = &cl_data->slots[idx];
-  if (_cl_slab_ensure(cl_data, ctx, idx + 1) != 0) return NULL;
   s->owner       = cl_data;
   s->inj         = inj;
   s->shadow_done = shadow_done;
@@ -600,6 +589,7 @@ static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_dat
       s->n_waits = n_waits;
     } else { s->n_waits = 0; }
   } else { s->waits = NULL; s->n_waits = 0; }
+  cl_data->n_slots++;
   return s;
 }
 

From 83191b97ea2e7af11b9ee1c232f7aa2c695965be Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 21:29:40 +0000
Subject: [PATCH 14/54] ze: inline _cl_get_device into its two callers

The helper had two callers (immediate-Append shadow lookup, Execute
epilogue shadow lookup) and just lazy-cached the device handle.
Inlined both with the same lazy-cache idiom; the helper and
forward-declared cached_device-read disappear.

Each call site now passes its own handle to the L0 call: the Append
epilogue uses its local `command_list` parameter, the Execute epilogue
uses `phCommandLists[i]`. The old helper read cl_data->ptr, which is
the same handle (set at create time), so behavior is unchanged.

Full matrix 43/43.
---
 backends/ze/tracer_ze_helpers.include.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index bf6f6541..04201c5b 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -323,18 +323,6 @@ static int _shadow_append_query(struct _ze_shadow_cl *sh,
   return (r == ZE_RESULT_SUCCESS) ? 0 : -1;
 }
 
-/* Lazy-cached device handle for a cl. The device is immutable for the
- * cl's lifetime; we just want to avoid repeating the L0 call. */
-static ze_device_handle_t _cl_get_device(struct _ze_command_list_obj_data *cl_data) {
-  if (cl_data->cached_device) return cl_data->cached_device;
-  ze_device_handle_t d = NULL;
-  if (ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR((ze_command_list_handle_t)cl_data->ptr, &d)
-      != ZE_RESULT_SUCCESS)
-    return NULL;
-  cl_data->cached_device = d;
-  return d;
-}
-
 static inline void _on_create_command_list(ze_command_list_handle_t command_list,
                                             int immediate, int in_order) {
   struct _ze_command_list_obj_data *cl_data = NULL;
@@ -697,7 +685,10 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
    * shadow cl now would let it fire too early on a stale inj). */
   if (cl_data->is_immediate) {
     cl_data->cached_context = ctx;
-    ze_device_handle_t dev = _cl_get_device(cl_data);
+    ze_device_handle_t dev = cl_data->cached_device;
+    if (!dev &&
+        ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(command_list, &dev) == ZE_RESULT_SUCCESS)
+      cl_data->cached_device = dev;
     struct _ze_shadow_cl *sh = dev ? _get_shadow_cl(ctx, dev) : NULL;
     if (!sh || _shadow_append_query(sh, inj->event, cl_data->slab, &s->off,
                                      shadow_done->event) != 0) {
@@ -847,7 +838,10 @@ static void _on_execute_command_lists_epilogue(ze_command_queue_handle_t hQueue,
       if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(phCommandLists[i], &ctx) == ZE_RESULT_SUCCESS)
         cl_data->cached_context = ctx;
     }
-    ze_device_handle_t dev = _cl_get_device(cl_data);
+    ze_device_handle_t dev = cl_data->cached_device;
+    if (!dev &&
+        ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(phCommandLists[i], &dev) == ZE_RESULT_SUCCESS)
+      cl_data->cached_device = dev;
     struct _ze_shadow_cl *sh = (ctx && dev) ? _get_shadow_cl(ctx, dev) : NULL;
     for (uint32_t j = 0; j < cl_data->n_slots; ++j) {
       struct _ze_slot *slot = &cl_data->slots[j];

From eec35a97d638b9301bef17d93cd9e83b7015734f Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 21:32:17 +0000
Subject: [PATCH 15/54] ze: collapse _universal_record_append cleanup with goto
 fail

Four near-identical early-return blocks shared a cleanup pattern that
varied only in how much state had been claimed (event(s), cl from
hash, slot). Two labels capture the two boundaries:

  fail_locked: the cl hash entry has been claimed and mtx is held; a
               slot may or may not have been appended. Roll back the
               slot if present, unlock, ADD_ZE_CL, fall through to fail.
  fail:       events still owned, no cl claim. PUT them back.

Net -12 lines.

Full matrix 43/43.
---
 backends/ze/tracer_ze_helpers.include.c | 66 ++++++++++---------------
 1 file changed, 27 insertions(+), 39 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 04201c5b..662decea 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -622,37 +622,31 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
                                      ze_event_handle_t *user_waits,
                                      uint32_t user_n_waits) {
   if (!inj) return;
+  struct _ze_event_h *shadow_done = NULL;
+  struct _ze_command_list_obj_data *cl_data = NULL;
+  struct _ze_slot *s = NULL;
+
   ze_context_handle_t ctx = NULL;
-  if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &ctx) != ZE_RESULT_SUCCESS || !ctx) {
-    PUT_ZE_EVENT(inj);
-    return;
-  }
+  if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &ctx) != ZE_RESULT_SUCCESS || !ctx)
+    goto fail;
   inj->context = ctx;
 
   /* Tracer-owned fence event: Query signals it, drain host-waits on it
    * before reading the slab. Decouples drain-time correctness from any
    * user sync on user_signal — required because in step 2 the Query
    * moves to a separate shadow cl whose completion isn't implied by
-   * user-level sync. Allocated here so the failure path can release
-   * inj symmetrically. */
-  struct _ze_event_h *shadow_done = _get_profiling_event(command_list);
-  if (!shadow_done) { PUT_ZE_EVENT(inj); return; }
+   * user-level sync. */
+  shadow_done = _get_profiling_event(command_list);
+  if (!shadow_done) goto fail;
   shadow_done->context = ctx;
 
-  struct _ze_command_list_obj_data *cl_data = NULL;
   FIND_AND_DEL_ZE_CL(&command_list, cl_data);
-  if (!cl_data) { PUT_ZE_EVENT(inj); PUT_ZE_EVENT(shadow_done); return; }
+  if (!cl_data) goto fail;
   pthread_mutex_lock(&cl_data->mtx);
 
-  struct _ze_slot *s = _cl_slot_append(cl_data, ctx, inj, shadow_done,
-                                       user_signal, user_waits, user_n_waits);
-  if (!s) {
-    pthread_mutex_unlock(&cl_data->mtx);
-    ADD_ZE_CL(cl_data);
-    PUT_ZE_EVENT(inj);
-    PUT_ZE_EVENT(shadow_done);
-    return;
-  }
+  s = _cl_slot_append(cl_data, ctx, inj, shadow_done,
+                      user_signal, user_waits, user_n_waits);
+  if (!s) goto fail_locked;
 
   ze_event_handle_t wait_ev = inj->event;
 
@@ -663,17 +657,9 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
    * to both wait on inj and signal user_signal; SignalEvent doesn't
    * take a wait list. Skipped when user passed NULL. */
   if (user_signal) {
-    ze_result_t rs = ZE_COMMAND_LIST_APPEND_BARRIER_PTR(
-        command_list, user_signal, 1, &wait_ev);
-    if (rs != ZE_RESULT_SUCCESS) {
-      free(s->waits);
-      cl_data->n_slots--;
-      pthread_mutex_unlock(&cl_data->mtx);
-      ADD_ZE_CL(cl_data);
-      PUT_ZE_EVENT(inj);
-      PUT_ZE_EVENT(shadow_done);
-      return;
-    }
+    if (ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &wait_ev)
+        != ZE_RESULT_SUCCESS)
+      goto fail_locked;
   }
 
   /* The Query Append now lives on the per-(context, device) shadow
@@ -691,19 +677,21 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
       cl_data->cached_device = dev;
     struct _ze_shadow_cl *sh = dev ? _get_shadow_cl(ctx, dev) : NULL;
     if (!sh || _shadow_append_query(sh, inj->event, cl_data->slab, &s->off,
-                                     shadow_done->event) != 0) {
-      free(s->waits);
-      cl_data->n_slots--;
-      pthread_mutex_unlock(&cl_data->mtx);
-      ADD_ZE_CL(cl_data);
-      PUT_ZE_EVENT(inj);
-      PUT_ZE_EVENT(shadow_done);
-      return;
-    }
+                                     shadow_done->event) != 0)
+      goto fail_locked;
     _slot_instantiate(cl_data, s);
   }
   pthread_mutex_unlock(&cl_data->mtx);
   ADD_ZE_CL(cl_data);
+  return;
+
+fail_locked:
+  if (s) { free(s->waits); cl_data->n_slots--; }
+  pthread_mutex_unlock(&cl_data->mtx);
+  ADD_ZE_CL(cl_data);
+fail:
+  if (shadow_done) PUT_ZE_EVENT(shadow_done);
+  PUT_ZE_EVENT(inj);
 }
 
 /* Drain one slot. Recurses on its preds first, then emits this slot

From e566bc5f364dc9b900fdc8f9a8bf1b9c7d31dc4d Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 21:37:20 +0000
Subject: [PATCH 16/54] ze: collapse THAPI_DBGLOG / _NO_ARGS via ,
 ##__VA_ARGS__
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

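C99 requires at least one argument for the `...` of a variadic macro, so
a call with only the format string left a dangling comma in front of
__VA_ARGS__; that is what forced the separate _NO_ARGS fork. GCC's
`, ##__VA_ARGS__` extension swallows the dangling comma — already used
elsewhere in THAPI (utils/tracepoint_gen.rb).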

Verified clean compile with -DTHAPI_DEBUG (which actually exercises
the variadic body) in addition to the default no-op branch.

Full matrix 43/43.
---
 backends/ze/tracer_ze_helpers.include.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 662decea..82afe8c1 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -34,21 +34,17 @@
 
 #ifdef THAPI_DEBUG
 #define TAHPI_LOG stderr
+/* GCC's `, ##__VA_ARGS__` extension swallows the leading comma when the
+ * variadic list is empty, so the same macro covers both no-arg and
+ * with-args calls. Already used in utils/tracepoint_gen.rb. */
 #define THAPI_DBGLOG(fmt, ...)                                                                     \
   do {                                                                                             \
-    fprintf(TAHPI_LOG, "THAPI(%s:%d): " fmt "\n", __func__, __LINE__, __VA_ARGS__);                \
-  } while (0)
-#define THAPI_DBGLOG_NO_ARGS(fmt)                                                                  \
-  do {                                                                                             \
-    fprintf(TAHPI_LOG, "THAPI(%s:%d): " fmt "\n", __func__, __LINE__);                             \
+    fprintf(TAHPI_LOG, "THAPI(%s:%d): " fmt "\n", __func__, __LINE__, ##__VA_ARGS__);              \
   } while (0)
 #else
 #define THAPI_DBGLOG(...)                                                                          \
   do {                                                                                             \
   } while (0)
-#define THAPI_DBGLOG_NO_ARGS(fmt)                                                                  \
-  do {                                                                                             \
-  } while (0)
 #endif
 
 #ifdef THAPI_USE_DESTRUCTORS
@@ -335,7 +331,7 @@ static inline void _on_create_command_list(ze_command_list_handle_t command_list
 
   cl_data = (struct _ze_command_list_obj_data *)calloc(1, sizeof(*cl_data));
   if (!cl_data) {
-    THAPI_DBGLOG_NO_ARGS("Failed to allocate memory");
+    THAPI_DBGLOG("Failed to allocate memory");
     return;
   }
   cl_data->ptr = (void *)command_list;
@@ -436,7 +432,7 @@ static inline void _latest_clear_if(ze_event_handle_t ev, struct _ze_slot *s) {
     if (!pool) {                                                                                   \
       pool = (struct _ze_event_pool_entry *)calloc(1, sizeof(struct _ze_event_pool_entry));        \
       if (!pool) {                                                                                 \
-        THAPI_DBGLOG_NO_ARGS("Failed to allocate memory");                                         \
+        THAPI_DBGLOG("Failed to allocate memory");                                         \
         pthread_mutex_unlock(&_ze_event_pools_mutex);                                              \
         if (val->event_pool) {                                                                     \
           if (val->event)                                                                          \

From 322402d2a8ccf08feb54db44d3fcb8cafcb513ba Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 21:40:46 +0000
Subject: [PATCH 17/54] ze: drop THAPI_REPORT_INJECTED_EVENTS debug counter

Added in d5603e4 as a regression guard for a planned bats assertion
that never landed. No test, internal code, or external doc references
it. Easy to reintroduce on demand.

Full matrix 43/43.
---
 backends/ze/tracer_ze_helpers.include.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 82afe8c1..18c42db4 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -59,11 +59,6 @@ static int _do_chained_structs = 0;
 static int _do_paranoid_drift = 0;
 static int _do_paranoid_memory_location = 0;
 static int _do_ddi_table_forward = 0;
-/* When THAPI_REPORT_INJECTED_EVENTS=1, _lib_cleanup prints the running
- * total to stderr. Useful for the bats infra to assert we don't inject
- * more events than necessary (lazy fallback regression guard). */
-static int _do_report_injected_events = 0;
-static volatile uint64_t _injected_event_count = 0;
 
 pthread_mutex_t ze_closures_mutex = PTHREAD_MUTEX_INITIALIZER;
 
@@ -510,8 +505,6 @@ static struct _ze_event_h *_get_profiling_event(ze_command_list_handle_t command
                  e_w->event_pool, context);
     goto cleanup_ep;
   }
-  if (_do_report_injected_events)
-    __sync_fetch_and_add(&_injected_event_count, 1);
   return e_w;
 cleanup_ep:
   ZE_EVENT_POOL_DESTROY_PTR(e_w->event_pool);
@@ -855,11 +848,6 @@ static inline int _do_state() {
 }
 
 static void THAPI_ATTRIBUTE_DESTRUCTOR _lib_cleanup() {
-  if (_do_cleanup) {
-    if (_do_report_injected_events)
-      fprintf(stderr, "THAPI: injected events: %lu\n",
-              (unsigned long)_injected_event_count);
-  }
 }
 
 static void _dump_driver_subdevice_properties(ze_driver_handle_t hDriver,
@@ -1068,10 +1056,6 @@ static void _load_tracer(void) {
   if (s)
     _do_paranoid_memory_location = 1;
 
-  s = getenv("THAPI_REPORT_INJECTED_EVENTS");
-  if (s)
-    _do_report_injected_events = 1;
-
   _do_cleanup = 1;
 
 #ifndef THAPI_USE_DESTRUCTORS

From ccf6e651206561aeba5d7d3a5428ef95261d897b Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 21:46:03 +0000
Subject: [PATCH 18/54] ze: drop dead cap_slots field

cap_slots was set once in _cl_slot_append to _ZE_SLAB_SLOTS_INITIAL
and never read. The old _on_sync_drain_event walked the cls array
using `s >= slots && s < slots + cap_slots` to find a slot's owner,
but that walk went away when slots gained an owner back-pointer
(cbb928b). Field has been dead since.

Full matrix 43/43.
---
 backends/ze/tracer_ze_helpers.include.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 18c42db4..aa8b91f4 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -106,7 +106,6 @@ struct _ze_command_list_obj_data {
   void              *slab;       /* host-visible KT result buffer; alloc'd once, leaked on destroy */
   struct _ze_slot   *slots;
   uint32_t           n_slots;
-  uint32_t           cap_slots;
 
   /* in_flight_q is the queue this cl was last Executed on AND not yet
    * drained. NULL means "not in flight" — safe to Execute without a
@@ -539,7 +538,6 @@ static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_dat
     cl_data->slots = (struct _ze_slot *)calloc(
         _ZE_SLAB_SLOTS_INITIAL, sizeof(struct _ze_slot));
     if (!cl_data->slots) return NULL;
-    cl_data->cap_slots = _ZE_SLAB_SLOTS_INITIAL;
   }
   if (!cl_data->slab) {
     size_t bytes = (size_t)_ZE_SLAB_SLOTS_INITIAL * sizeof(ze_kernel_timestamp_result_t);

From e396740acf2c4733ef6f0e44a24dbf501bff8dbb Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 21:48:12 +0000
Subject: [PATCH 19/54] ze: drop empty _lib_cleanup and its scaffolding

After THAPI_REPORT_INJECTED_EVENTS went away (322402d), _lib_cleanup
had an empty body. Its registration plumbing (_do_cleanup,
THAPI_ATTRIBUTE_DESTRUCTOR, the atexit call, the THAPI_USE_DESTRUCTORS
ifdef pair) was all in service of running it. The CUDA backend still
has a non-empty _lib_cleanup, so its copy of the same plumbing stays.

Full matrix 43/43.
---
 backends/ze/tracer_ze_helpers.include.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index aa8b91f4..2d63bf1b 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -47,14 +47,7 @@
   } while (0)
 #endif
 
-#ifdef THAPI_USE_DESTRUCTORS
-#define THAPI_ATTRIBUTE_DESTRUCTOR __attribute__((destructor))
-#else
-#define THAPI_ATTRIBUTE_DESTRUCTOR
-#endif
-
 static int _do_profile = 0;
-static int _do_cleanup = 0;
 static int _do_chained_structs = 0;
 static int _do_paranoid_drift = 0;
 static int _do_paranoid_memory_location = 0;
@@ -845,9 +838,6 @@ static inline int _do_state() {
          tracepoint_enabled(lttng_ust_ze_properties, memory_info_range);
 }
 
-static void THAPI_ATTRIBUTE_DESTRUCTOR _lib_cleanup() {
-}
-
 static void _dump_driver_subdevice_properties(ze_driver_handle_t hDriver,
                                               ze_device_handle_t hDevice) {
   if (!tracepoint_enabled(lttng_ust_ze_properties, subdevice))
@@ -1053,12 +1043,6 @@ static void _load_tracer(void) {
   s = getenv("LTTNG_UST_ZE_PARANOID_MEMORY_LOCATION");
   if (s)
     _do_paranoid_memory_location = 1;
-
-  _do_cleanup = 1;
-
-#ifndef THAPI_USE_DESTRUCTORS
-  atexit(_lib_cleanup);
-#endif
 }
 
 static void _load_tracer_dump(void) {

From e15cbf9c32ca67a2fc61ccde83dd9ab801efb75b Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 22:41:58 +0000
Subject: [PATCH 20/54] ze: extract _cl_cache_{device,context} +
 _on_execute_one_cl

The lazy cached_device / cached_context resolve dance appeared 4 times
(2 callsites x 2 fields) with subtle variations. Two tiny helpers
collapse all four. Walks back commit 83191b9's inline-_cl_get_device
with a better factoring that handles both fields together.

The Execute epilogue's body was one 30-line for-loop with 5 distinct
phases per cl. Extracting _on_execute_one_cl makes the loop a
one-liner and lets the per-cl prose live with the code that
implements it. Matches the granularity of _on_sync_drain_cl /
_on_sync_drain_queue.

Full matrix 43/43.
---
 backends/ze/tracer_ze_helpers.include.c | 95 +++++++++++++++----------
 1 file changed, 56 insertions(+), 39 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 2d63bf1b..519aca67 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -306,6 +306,27 @@ static int _shadow_append_query(struct _ze_shadow_cl *sh,
   return (r == ZE_RESULT_SUCCESS) ? 0 : -1;
 }
 
+/* Return cl_data->cached_{device,context}, fetching from L0 on first call.
+ * Both fields are immutable for the cl's lifetime, so caching avoids the
+ * roundtrip on every Append/Execute. Returns NULL on L0 error. */
+static ze_device_handle_t _cl_cache_device(struct _ze_command_list_obj_data *cl_data,
+                                           ze_command_list_handle_t command_list) {
+  if (cl_data->cached_device) return cl_data->cached_device;
+  ze_device_handle_t d = NULL;
+  if (ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(command_list, &d) == ZE_RESULT_SUCCESS)
+    cl_data->cached_device = d;
+  return d;
+}
+
+static ze_context_handle_t _cl_cache_context(struct _ze_command_list_obj_data *cl_data,
+                                             ze_command_list_handle_t command_list) {
+  if (cl_data->cached_context) return cl_data->cached_context;
+  ze_context_handle_t c = NULL;
+  if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &c) == ZE_RESULT_SUCCESS)
+    cl_data->cached_context = c;
+  return c;
+}
+
 static inline void _on_create_command_list(ze_command_list_handle_t command_list,
                                             int immediate, int in_order) {
   struct _ze_command_list_obj_data *cl_data = NULL;
@@ -651,10 +672,7 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
    * shadow cl now would let it fire too early on a stale inj). */
   if (cl_data->is_immediate) {
     cl_data->cached_context = ctx;
-    ze_device_handle_t dev = cl_data->cached_device;
-    if (!dev &&
-        ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(command_list, &dev) == ZE_RESULT_SUCCESS)
-      cl_data->cached_device = dev;
+    ze_device_handle_t dev = _cl_cache_device(cl_data, command_list);
     struct _ze_shadow_cl *sh = dev ? _get_shadow_cl(ctx, dev) : NULL;
     if (!sh || _shadow_append_query(sh, inj->event, cl_data->slab, &s->off,
                                      shadow_done->event) != 0)
@@ -772,11 +790,11 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
   pthread_mutex_unlock(&s->owner->mtx);
 }
 
-/* zeCommandQueueExecuteCommandLists EPILOGUE — runs AFTER L0's actual
- * Execute has returned, with the user cl in flight on its engine.
+/* Execute-epilogue handler for ONE cl. Runs AFTER L0's actual Execute
+ * has returned, with the user cl in flight on its engine.
  *
- * Three things happen here, all under cl_data->mtx so a concurrent
- * Execute (or Sync) on another thread sees them atomically:
+ * Three phases, all under cl_data->mtx so a concurrent Execute or Sync
+ * on another thread sees them atomically:
  *
  *   1) If in_flight_q is set from a prior Execute by *another* thread,
  *      force-sync that queue and drain the slab before we overwrite it
@@ -787,41 +805,40 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
  *      shadow cl before the user cl is in flight deadlocks when the
  *      shadow shares the engine with the user cl (see
  *      tests/bugs/query_on_separate_cl_regular_user_cl).
- *   3) Stamp in_flight_q = hQueue and instantiate the slot, publishing
+ *   3) Stamp in_flight_q = hQueue and instantiate each slot, publishing
  *      it to the dep graph + as the "owner" of this queue. */
+static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
+                               ze_command_list_handle_t command_list) {
+  struct _ze_command_list_obj_data *cl_data = NULL;
+  FIND_ZE_CL(&command_list, cl_data);
+  if (!cl_data) return;
+  pthread_mutex_lock(&cl_data->mtx);
+
+  if (cl_data->in_flight_q) {
+    ZE_COMMAND_QUEUE_SYNCHRONIZE_PTR(cl_data->in_flight_q, UINT64_MAX);
+    _cl_drain(cl_data);
+  }
+  ze_context_handle_t ctx = _cl_cache_context(cl_data, command_list);
+  ze_device_handle_t  dev = _cl_cache_device(cl_data, command_list);
+  struct _ze_shadow_cl *sh = (ctx && dev) ? _get_shadow_cl(ctx, dev) : NULL;
+  for (uint32_t j = 0; j < cl_data->n_slots; ++j) {
+    struct _ze_slot *slot = &cl_data->slots[j];
+    if (!sh || !slot->inj || !slot->shadow_done) continue;
+    if (_shadow_append_query(sh, slot->inj->event, cl_data->slab,
+                             &slot->off, slot->shadow_done->event) != 0)
+      continue;  /* slot stays not-live this round; we miss this timing */
+    _slot_instantiate(cl_data, slot);
+  }
+  cl_data->in_flight_q = hQueue;
+
+  pthread_mutex_unlock(&cl_data->mtx);
+}
+
 static void _on_execute_command_lists_epilogue(ze_command_queue_handle_t hQueue,
                                                 uint32_t numCommandLists,
                                                 ze_command_list_handle_t *phCommandLists) {
-  for (uint32_t i = 0; i < numCommandLists; ++i) {
-    struct _ze_command_list_obj_data *cl_data = NULL;
-    FIND_ZE_CL(phCommandLists + i, cl_data);
-    if (!cl_data) continue;
-    pthread_mutex_lock(&cl_data->mtx);
-    if (cl_data->in_flight_q) {
-      ZE_COMMAND_QUEUE_SYNCHRONIZE_PTR(cl_data->in_flight_q, UINT64_MAX);
-      _cl_drain(cl_data);
-    }
-    ze_context_handle_t ctx = cl_data->cached_context;
-    if (!ctx) {
-      if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(phCommandLists[i], &ctx) == ZE_RESULT_SUCCESS)
-        cl_data->cached_context = ctx;
-    }
-    ze_device_handle_t dev = cl_data->cached_device;
-    if (!dev &&
-        ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(phCommandLists[i], &dev) == ZE_RESULT_SUCCESS)
-      cl_data->cached_device = dev;
-    struct _ze_shadow_cl *sh = (ctx && dev) ? _get_shadow_cl(ctx, dev) : NULL;
-    for (uint32_t j = 0; j < cl_data->n_slots; ++j) {
-      struct _ze_slot *slot = &cl_data->slots[j];
-      if (!sh || !slot->inj || !slot->shadow_done) continue;
-      if (_shadow_append_query(sh, slot->inj->event, cl_data->slab,
-                               &slot->off, slot->shadow_done->event) != 0)
-        continue;  /* slot stays not-live this round; we miss this timing */
-      _slot_instantiate(cl_data, slot);
-    }
-    cl_data->in_flight_q = hQueue;
-    pthread_mutex_unlock(&cl_data->mtx);
-  }
+  for (uint32_t i = 0; i < numCommandLists; ++i)
+    _on_execute_one_cl(hQueue, phCommandLists[i]);
 }
 
 static pthread_once_t _init = PTHREAD_ONCE_INIT;

From ecc2954adeb651cb97c634945ddfcda3e2167771 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 22:45:49 +0000
Subject: [PATCH 21/54] ze: chain user_signal off inj BEFORE the slot
 allocation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously _universal_record_append wired user_signal -> inj via
AppendBarrier AFTER reserving a slot. If slot reservation failed
(e.g. _ZE_SLAB_SLOTS_INITIAL=64 cap hit by the 65th Append on an
immediate cl), the function jumped to fail_locked and skipped the
barrier — meaning user_signal was never signaled by anything on the
cl, and the user's Sync(user_signal) hung forever.

Reorder so the barrier Append runs first. We may still lose the
timing tracepoint when the slot can't be reserved, but the user's
event still fires and Sync no longer hangs.

Reproducer: tests/matrix/ooo_imm_Event_06. Previously: timed out
after 20s, 0 tracepoints. Now: 64 tracepoints out of expected 65
(cap still loses one slot; the actual cap removal is the next
commit's job).
---
 backends/ze/tracer_ze_helpers.include.c | 34 ++++++++++++-------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 519aca67..a8911f52 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -627,6 +627,21 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   struct _ze_command_list_obj_data *cl_data = NULL;
   struct _ze_slot *s = NULL;
 
+  /* Chain user_signal off inj BEFORE anything else that can fail. The
+   * prologue already swapped user's hSignalEvent for inj->event, so
+   * nothing else on this cl signals user_signal — if we bail later
+   * (slot full, shadow_done alloc fails, etc.) the user's
+   * Sync(user_signal) would hang forever. AppendBarrier (not
+   * AppendSignalEvent) because we need to both wait on inj and signal
+   * user_signal. NULL user_signal is the "user passed NULL" case where
+   * no chaining is needed. */
+  if (user_signal) {
+    ze_event_handle_t wait_ev = inj->event;
+    if (ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &wait_ev)
+        != ZE_RESULT_SUCCESS)
+      goto fail;
+  }
+
   ze_context_handle_t ctx = NULL;
   if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &ctx) != ZE_RESULT_SUCCESS || !ctx)
     goto fail;
@@ -634,9 +649,8 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
 
   /* Tracer-owned fence event: Query signals it, drain host-waits on it
    * before reading the slab. Decouples drain-time correctness from any
-   * user sync on user_signal — required because in step 2 the Query
-   * moves to a separate shadow cl whose completion isn't implied by
-   * user-level sync. */
+   * user sync on user_signal — required because the Query lives on a
+   * separate shadow cl whose completion isn't implied by user-level sync. */
   shadow_done = _get_profiling_event(command_list);
   if (!shadow_done) goto fail;
   shadow_done->context = ctx;
@@ -649,20 +663,6 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
                       user_signal, user_waits, user_n_waits);
   if (!s) goto fail_locked;
 
-  ze_event_handle_t wait_ev = inj->event;
-
-  /* Chain user_signal off inj so the user's wait still completes once
-   * the underlying op has fired. We swapped user's hSignalEvent for
-   * inj in the prologue, so nothing else on this cl signals
-   * user_signal. AppendBarrier (not AppendSignalEvent) because we need
-   * to both wait on inj and signal user_signal; SignalEvent doesn't
-   * take a wait list. Skipped when user passed NULL. */
-  if (user_signal) {
-    if (ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &wait_ev)
-        != ZE_RESULT_SUCCESS)
-      goto fail_locked;
-  }
-
   /* The Query Append now lives on the per-(context, device) shadow
    * compute cl rather than the user cl. This is what lets us profile
    * copy-only user cls — copy engines reject AppendQueryKernelTimestamps

From 77bd81bacd7cf50228b718c6b7b5dd998bb39890 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Wed, 3 Jun 2026 22:57:40 +0000
Subject: [PATCH 22/54] ze: refcount slots, recycle imm-cl events back to the
 per-context pool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Slots gain a `refs` field: the count of downstream slots whose
preds[] points here AND that have not yet been drained.
_slot_instantiate atomically increments each pred's refs;
_slot_drain decrements them at cleanup. When a slot reaches
live==0 && refs==0 it is reclaimable.

Reclaim is implemented for immediate cls only. Regular cls bake inj
into the cl body — recycling inj would corrupt the next Execute
round, so their events are reclaimed only at cl destroy (Phase-3).
For immediate slots, _slot_release PUTs inj and shadow_done back to
the per-context pool and frees waits. Slot storage itself stays for
now; Phase-3 will free it along with chunked slot/slab storage that
lifts the 64-Append cap for imm cls.

Full matrix 43/43 (Event_06 still fails on cap until Phase-3).
---
 backends/ze/tracer_ze_helpers.include.c | 39 +++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index a8911f52..edbbc6be 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -88,6 +88,13 @@ struct _ze_slot {
   struct _ze_slot   **preds;           /* points at slots whose drain must come first (may be in another cl) */
   uint32_t            n_preds;
   unsigned char       live;            /* in-flight (instantiated, not drained) */
+  /* Incoming pred edges: count of downstream slots whose preds[] points
+   * here AND that have not yet been drained. Incremented at downstream
+   * _slot_instantiate (one per pred edge), decremented at downstream
+   * _slot_drain. Slot is reclaimable iff live==0 AND refs==0. Atomic
+   * because increment/decrement happen across cl boundaries without
+   * holding the slot's owner mtx. */
+  uint32_t            refs;
 };
 
 #define _ZE_SLAB_SLOTS_INITIAL 64
@@ -605,6 +612,9 @@ static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data,
       }
     }
   }
+  /* Each new pred edge holds a ref on its target. */
+  for (uint32_t i = 0; i < s->n_preds; ++i)
+    __atomic_fetch_add(&s->preds[i]->refs, 1, __ATOMIC_RELAXED);
   if (s->attr) _latest_set(s->attr, s);
 }
 
@@ -692,6 +702,26 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   PUT_ZE_EVENT(inj);
 }
 
+/* Reclaim a slot: PUT its tracer-owned events back to the per-context
+ * pool and free waits. Caller must hold s->owner->mtx if `s` is in the
+ * caller's cl; cross-cl reclaim runs without the pred's owner mtx,
+ * which is safe because release only fires at refs==0 && live==0 (no
+ * other party can be mutating the slot at that point).
+ *
+ * Regular cls are NOT released here: their inj is baked into the cl
+ * body and recycling inj would corrupt the next Execute round. Their
+ * events are reclaimed only at cl destroy (Phase-3). Immediate cls
+ * fire exactly once per Append, so per-drain reclaim is safe. */
+static void _slot_release(struct _ze_slot *s) {
+  if (!s || !s->owner || !s->owner->is_immediate) return;
+  if (s->inj)         { PUT_ZE_EVENT(s->inj);         s->inj         = NULL; }
+  if (s->shadow_done) { PUT_ZE_EVENT(s->shadow_done); s->shadow_done = NULL; }
+  free(s->waits);
+  s->waits   = NULL;
+  s->n_waits = 0;
+  s->attr    = NULL;
+}
+
 /* Drain one slot. Recurses on its preds first, then emits this slot
  * and pops it. Pop = clear inj/waits/preds; the holed entry is reused
  * by later _cl_slot_append calls. Safe to call on already-drained
@@ -739,9 +769,18 @@ static void _slot_drain(struct _ze_slot *s) {
                   r.context.kernelStart, r.context.kernelEnd);
   }
   _latest_clear_if(s->attr, s);
+  /* Drop our refs on preds; release any that reached refs==0 & live==0. */
+  for (uint32_t i = 0; i < s->n_preds; ++i) {
+    struct _ze_slot *p = s->preds[i];
+    if (__atomic_sub_fetch(&p->refs, 1, __ATOMIC_RELAXED) == 0 && !p->live)
+      _slot_release(p);
+  }
   /* Per-run preds reset; build-time fields (inj, attr, off, waits) stay
    * so the next Execute can re-instantiate without re-Append. */
   free(s->preds); s->preds = NULL; s->n_preds = 0;
+  /* If no downstream slot holds an edge on us, release immediately. */
+  if (__atomic_load_n(&s->refs, __ATOMIC_RELAXED) == 0)
+    _slot_release(s);
 }
 
 /* Drain every live slot in a cl. */

From 03c24fa222c8d13b915ac870b78b473012860983 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Thu, 4 Jun 2026 00:07:36 +0000
Subject: [PATCH 23/54] ze: chunked slot/slab storage + refcount + cl-destroy
 hook
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imm cls used to deadlock the user program at the 65th Append in one
phase (_cl_slot_append returned NULL above _ZE_SLAB_SLOTS_INITIAL=64 and
_universal_record_append's fail path skipped chaining user_signal off
inj). The previous commit unblocked the chain; this commit lifts the
cap altogether for imm cls and reclaims slot/event storage as ops
complete.

Storage: cl_data->slots/slab pair is replaced by a utlist DL chain of
_ze_slab_chunk{}, each holding 64 slots and a 64-ts slab. Imm cls
grow chunk-by-chunk on demand; regular cls stop at one chunk (their
inj events are baked into the closed cl body, so adding a chunk
post-Close would create slots the body doesn't address).

Lifetime: slots gain a `refs` atomic counter, incremented at
_slot_instantiate per pred edge, decremented at _slot_drain. A slot
is reclaimable iff live==0 AND refs==0. _slot_release PUTs inj +
shadow_done back to the per-context pool, frees waits, and
decrements its chunk's n_held. A chunk frees itself when n_held
reaches 0 AND it isn't the active tail. Regular-cl slots are not
released at drain — their events come back at cl destroy.

Destroy: zeCommandListDestroy hook walks chunks, PUTs events back,
frees chunks + cl_data. Re-added because chunked storage has no cap on
imm cls, so it must be released when the cl is destroyed (bounded by
per-cl lifetime) rather than leaked for the rest of the program.

Memory shape: O(in-flight slots) at any time, matching the typical
"one long-lived imm cl with many syncs" use case.

Full matrix 45/45.
---
 backends/ze/tracer_ze_helpers.include.c | 281 +++++++++++++++---------
 backends/ze/ze_model.rb                 |  17 +-
 2 files changed, 192 insertions(+), 106 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index edbbc6be..b2fe9b27 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -68,6 +68,7 @@ struct ze_closure *ze_closures = NULL;
 
 struct _ze_event_h;
 struct _ze_slot;
+struct _ze_slab_chunk;
 
 /* Dependency-tracking slot: one per profiled Append. Slots carry the
  * happens-before edges the user established (via cl in-order semantics
@@ -75,11 +76,12 @@ struct _ze_slot;
  * synced anchor and drain everything reachable. Drain is pop semantics:
  * after emit, the slot is dropped from the cl's list. */
 struct _ze_slot {
-  struct _ze_command_list_obj_data *owner; /* cl_data this slot lives in (==> .slab to read at drain) */
+  struct _ze_command_list_obj_data *owner; /* cl_data this slot lives in */
+  struct _ze_slab_chunk *chunk;        /* chunk this slot lives in (==> .slab to read at drain) */
   struct _ze_event_h *inj;             /* tracer-owned event the Query waits on */
   struct _ze_event_h *shadow_done;     /* tracer-owned event the Query signals; drain host-syncs on this */
   ze_event_handle_t   attr;            /* user's original signal event (NULL => inj->event) */
-  size_t              off;             /* byte offset within owner->slab */
+  size_t              off;             /* byte offset within chunk->slab */
   /* User wait events copied at Append time (stable across rebuilds);
    * preds[] is computed at instantiate from waits[] by looking up
    * latest[w] for each w. */
@@ -97,15 +99,31 @@ struct _ze_slot {
   uint32_t            refs;
 };
 
-#define _ZE_SLAB_SLOTS_INITIAL 64
+#define _ZE_SLAB_CHUNK_SLOTS 64
+
+/* Slot + slab storage in fixed-size chunks; cl_data->chunks is a utlist
+ * DL of these. Imm cls allocate new chunks as needed (no cap); regular
+ * cls stop at one chunk (the inj events are baked into the closed cl
+ * body, so adding a chunk after Close would create slots the body
+ * doesn't address).
+ *
+ * Within a chunk, slots[i].off is i * sizeof(timestamp) into slab. The
+ * chunk frees itself when n_held drops to 0 AND it is not the tail
+ * (new Appends still want to land on the tail). */
+struct _ze_slab_chunk {
+  void                  *slab;          /* _ZE_SLAB_CHUNK_SLOTS * sizeof(ze_kernel_timestamp_result_t) */
+  ze_context_handle_t    slab_ctx;      /* context the slab was allocated against (zeMemFree target) */
+  uint32_t               n_used;        /* slots ever assigned in this chunk (monotonic until chunk free) */
+  uint32_t               n_held;        /* unreleased slots (n_used minus _slot_release calls) */
+  struct _ze_slab_chunk *next, *prev;
+  struct _ze_slot        slots[_ZE_SLAB_CHUNK_SLOTS];
+};
 
 struct _ze_command_list_obj_data {
   void *ptr;
   UT_hash_handle hh;
 
-  void              *slab;       /* host-visible KT result buffer; alloc'd once, leaked on destroy */
-  struct _ze_slot   *slots;
-  uint32_t           n_slots;
+  struct _ze_slab_chunk *chunks;        /* utlist DL_ head; tail = chunks->prev (circular) */
 
   /* in_flight_q is the queue this cl was last Executed on AND not yet
    * drained. NULL means "not in flight" — safe to Execute without a
@@ -533,20 +551,28 @@ static struct _ze_event_h *_get_profiling_event(ze_command_list_handle_t command
   return NULL;
 }
 
-/* Allocate one new slot at the end of the cl's slot list. Slots are
- * never reused within a cl's lifetime — the cl body's Query op
- * hard-codes inj and off; the slot is the host-side mirror that gets
- * re-instantiated on every Execute.
- *
- * Capacity is fixed at _ZE_SLAB_SLOTS_INITIAL to keep slot addresses
- * stable for the cl's lifetime. We store raw slot pointers in
- * `latest[ev] -> slot` and in other slots' `preds[]`; realloc would
- * invalidate every one of them, silently breaking dep-graph walks
- * (see tests/bugs/missing_drain_dag). The slab is sized to match, so
- * growing slots beyond it would gain nothing anyway.
- *
- * Allocations (slots array and slab) happen BEFORE n_slots is bumped,
- * so an OOM does not leave a hole in the slot indexing. */
+/* Allocate a new chunk and append it to cl_data->chunks. */
+static struct _ze_slab_chunk *_cl_chunk_alloc(struct _ze_command_list_obj_data *cl_data,
+                                              ze_context_handle_t ctx) {
+  struct _ze_slab_chunk *c = (struct _ze_slab_chunk *)calloc(1, sizeof(*c));
+  if (!c) return NULL;
+  size_t bytes = (size_t)_ZE_SLAB_CHUNK_SLOTS * sizeof(ze_kernel_timestamp_result_t);
+  ze_host_mem_alloc_desc_t hd = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, NULL, 0};
+  if (ZE_MEM_ALLOC_HOST_PTR(ctx, &hd, bytes, sizeof(uint64_t), &c->slab) != ZE_RESULT_SUCCESS
+      || !c->slab) {
+    free(c);
+    return NULL;
+  }
+  memset(c->slab, 0, bytes);
+  c->slab_ctx = ctx;
+  DL_APPEND(cl_data->chunks, c);
+  return c;
+}
+
+/* Allocate one new slot at the tail of cl_data->chunks. Grows by one
+ * chunk for imm cls; regular cls stay at one chunk and return NULL when
+ * full (their inj events are baked into the closed cl body, so storage
+ * must keep addressing them via the same (slab, off) pair). */
 static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_data,
                                         ze_context_handle_t ctx,
                                         struct _ze_event_h *inj,
@@ -554,38 +580,30 @@ static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_dat
                                         ze_event_handle_t attr,
                                         ze_event_handle_t *waits,
                                         uint32_t n_waits) {
-  if (cl_data->n_slots >= _ZE_SLAB_SLOTS_INITIAL) return NULL;
-  if (!cl_data->slots) {
-    cl_data->slots = (struct _ze_slot *)calloc(
-        _ZE_SLAB_SLOTS_INITIAL, sizeof(struct _ze_slot));
-    if (!cl_data->slots) return NULL;
+  struct _ze_slab_chunk *tail = cl_data->chunks ? cl_data->chunks->prev : NULL;
+  if (!tail || tail->n_used >= _ZE_SLAB_CHUNK_SLOTS) {
+    if (tail && !cl_data->is_immediate) return NULL;
+    tail = _cl_chunk_alloc(cl_data, ctx);
+    if (!tail) return NULL;
   }
-  if (!cl_data->slab) {
-    size_t bytes = (size_t)_ZE_SLAB_SLOTS_INITIAL * sizeof(ze_kernel_timestamp_result_t);
-    ze_host_mem_alloc_desc_t hd = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, NULL, 0};
-    void *buf = NULL;
-    if (ZE_MEM_ALLOC_HOST_PTR(ctx, &hd, bytes, sizeof(uint64_t), &buf) != ZE_RESULT_SUCCESS || !buf)
-      return NULL;
-    memset(buf, 0, bytes);
-    cl_data->slab = buf;
-  }
-  uint32_t idx = cl_data->n_slots;
-  struct _ze_slot *s = &cl_data->slots[idx];
+  uint32_t idx = tail->n_used;
+  struct _ze_slot *s = &tail->slots[idx];
+  /* Chunk memory is calloc'd, so all other slot fields are already zero. */
   s->owner       = cl_data;
+  s->chunk       = tail;
   s->inj         = inj;
   s->shadow_done = shadow_done;
   s->attr        = attr;
-  s->off   = (size_t)idx * sizeof(ze_kernel_timestamp_result_t);
-  s->live  = 0;
-  s->preds = NULL; s->n_preds = 0;
+  s->off         = (size_t)idx * sizeof(ze_kernel_timestamp_result_t);
   if (n_waits) {
     s->waits = (ze_event_handle_t *)malloc(n_waits * sizeof(ze_event_handle_t));
     if (s->waits) {
       memcpy(s->waits, waits, n_waits * sizeof(ze_event_handle_t));
       s->n_waits = n_waits;
-    } else { s->n_waits = 0; }
-  } else { s->waits = NULL; s->n_waits = 0; }
-  cl_data->n_slots++;
+    }
+  }
+  tail->n_used++;
+  tail->n_held++;
   return s;
 }
 
@@ -603,14 +621,22 @@ static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data,
     if (p && p->live) s->preds[s->n_preds++] = p;
   }
   if (cl_data->is_in_order) {
-    /* Find previous live slot in this cl (by slot index lower than s). */
-    uint32_t self = (uint32_t)(s - cl_data->slots);
-    for (int32_t i = (int32_t)self - 1; i >= 0; --i) {
-      if (cl_data->slots[i].live) {
-        s->preds[s->n_preds++] = &cl_data->slots[i];
-        break;
+    /* Walk chunks newest-to-oldest, slots high-to-low, stop at the first
+     * live slot strictly before s. Chunks are appended in time order
+     * (DL_APPEND) and slots within a chunk in time order, so reverse-walk
+     * yields reverse time order. Skip s itself; s might still have
+     * live=0 here but the !=s guard is safe and clearer. */
+    struct _ze_slab_chunk *c;
+    struct _ze_slot *prev = NULL;
+    for (c = cl_data->chunks ? cl_data->chunks->prev : NULL;
+         c && !prev;
+         c = (c == cl_data->chunks) ? NULL : c->prev) {
+      for (int32_t i = (int32_t)c->n_used - 1; i >= 0; --i) {
+        if (&c->slots[i] == s) continue;
+        if (c->slots[i].live) { prev = &c->slots[i]; break; }
       }
     }
+    if (prev) s->preds[s->n_preds++] = prev;
   }
   /* Each new pred edge holds a ref on its target. */
   for (uint32_t i = 0; i < s->n_preds; ++i)
@@ -684,7 +710,7 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     cl_data->cached_context = ctx;
     ze_device_handle_t dev = _cl_cache_device(cl_data, command_list);
     struct _ze_shadow_cl *sh = dev ? _get_shadow_cl(ctx, dev) : NULL;
-    if (!sh || _shadow_append_query(sh, inj->event, cl_data->slab, &s->off,
+    if (!sh || _shadow_append_query(sh, inj->event, s->chunk->slab, &s->off,
                                      shadow_done->event) != 0)
       goto fail_locked;
     _slot_instantiate(cl_data, s);
@@ -694,7 +720,23 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   return;
 
 fail_locked:
-  if (s) { free(s->waits); cl_data->n_slots--; }
+  if (s) {
+    /* Roll back the slot we just appended. We were the very last to
+     * touch the tail chunk and we hold cl_data->mtx, so decrementing
+     * n_used/n_held and clearing the slot is safe. If the chunk
+     * was freshly allocated only for this Append (n_used now 0), free
+     * it back so we don't leak a chunk per slot-append failure. */
+    free(s->waits);
+    struct _ze_slab_chunk *c = s->chunk;
+    c->n_used--;
+    c->n_held--;
+    memset(s, 0, sizeof(*s));
+    if (c->n_used == 0) {
+      DL_DELETE(cl_data->chunks, c);
+      if (c->slab) ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
+      free(c);
+    }
+  }
   pthread_mutex_unlock(&cl_data->mtx);
   ADD_ZE_CL(cl_data);
 fail:
@@ -702,16 +744,10 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   PUT_ZE_EVENT(inj);
 }
 
-/* Reclaim a slot: PUT its tracer-owned events back to the per-context
- * pool and free waits. Caller must hold s->owner->mtx if `s` is in the
- * caller's cl; cross-cl reclaim runs without the pred's owner mtx,
- * which is safe because release only fires at refs==0 && live==0 (no
- * other party can be mutating the slot at that point).
- *
- * Regular cls are NOT released here: their inj is baked into the cl
- * body and recycling inj would corrupt the next Execute round. Their
- * events are reclaimed only at cl destroy (Phase-3). Immediate cls
- * fire exactly once per Append, so per-drain reclaim is safe. */
+/* Reclaim a slot: PUT events back to the per-context pool, free waits,
+ * decrement chunk n_held; if the chunk hits 0 AND isn't the active
+ * tail, unlink and free it. Regular cls are skipped (their inj is
+ * baked into the cl body — reclaim happens at cl destroy instead). */
 static void _slot_release(struct _ze_slot *s) {
   if (!s || !s->owner || !s->owner->is_immediate) return;
   if (s->inj)         { PUT_ZE_EVENT(s->inj);         s->inj         = NULL; }
@@ -720,73 +756,70 @@ static void _slot_release(struct _ze_slot *s) {
   s->waits   = NULL;
   s->n_waits = 0;
   s->attr    = NULL;
+
+  struct _ze_slab_chunk *c = s->chunk;
+  struct _ze_command_list_obj_data *cl = s->owner;
+  if (!c) return;
+  if (--c->n_held == 0 && c != cl->chunks->prev) {
+    DL_DELETE(cl->chunks, c);
+    if (c->slab) ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
+    free(c);
+  }
 }
 
-/* Drain one slot. Recurses on its preds first, then emits this slot
- * and pops it. Pop = clear inj/waits/preds; the holed entry is reused
- * by later _cl_slot_append calls. Safe to call on already-drained
- * (live=0) slot.
- *
- * Reads use s->owner->slab — preds may live in a different cl than the
- * caller (cross-cl signal chains), so we cannot use the caller's slab.
+/* Drain one slot. Recurses on its preds, emits the slot's tracepoint,
+ * drops one ref on each pred (releasing fully-drained-and-unreferenced
+ * preds), then releases s if its own refs hit 0. Safe to call on an
+ * already-drained (live=0) slot.
  *
- * Locking: a pred on another cl is read/mutated WITHOUT taking its
- * owner's mtx. That's safe in the current model because slot pointers
- * are stable (cap is fixed, never realloc'd) and live-flag clearing
- * races are benign — the worst case is one extra tracepoint emit, not
- * a UAF. Take the pred's mtx only if we ever start freeing slot arrays.
+ * Slab read uses s->chunk->slab — preds may live in another cl, so we
+ * can't use the caller's slab.
  *
- * No cycle guard: cycles are impossible by construction. preds come
- * from two sources:
- *   - in-order prev slot in the same cl: strictly lower slot index, DAG.
- *   - latest[wait_event]: a slot published BEFORE us. Forming a cycle
- *     requires the user to declare two Appends each waiting on the
- *     other's signal event — L0 itself would deadlock the GPU on that,
- *     so we would never observe a sync return to reach drain. */
+ * No cycle guard: preds come from in-order prev (strictly earlier slot
+ * in the same cl, DAG) and from latest[wait_event] (a slot published
+ * BEFORE us). Forming a cycle would require user-declared mutual waits,
+ * which L0 itself deadlocks on. */
 static void _slot_drain(struct _ze_slot *s) {
   if (!s || !s->live) return;
   for (uint32_t i = 0; i < s->n_preds; ++i)
     _slot_drain(s->preds[i]);
   s->live = 0;
-  /* Block until our Query op has actually fired, then reset the fence
-   * so the next Execute round starts with a clean event. We can't
-   * trust the caller's sync to have covered the Query — in step 2 the
-   * Query will live on a separate shadow cl, and even in step 1 this
-   * makes the slab read unconditional rather than relying on cl-order
-   * implications. */
+  /* Block until the Query op has fired, then reset shadow_done so the
+   * next Execute round (regular cls) starts with a clean event. The
+   * user's own sync doesn't cover the Query — it runs on the shadow cl. */
   if (s->shadow_done && s->shadow_done->event) {
     ZE_EVENT_HOST_SYNCHRONIZE_PTR(s->shadow_done->event, UINT64_MAX);
     ZE_EVENT_HOST_RESET_PTR(s->shadow_done->event);
   }
   ze_event_handle_t attr = s->attr ? s->attr : (s->inj ? s->inj->event : NULL);
-  if (s->owner && s->owner->slab && attr &&
+  if (s->chunk && s->chunk->slab && attr &&
       tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results)) {
     ze_kernel_timestamp_result_t r =
-        *(ze_kernel_timestamp_result_t *)((char *)s->owner->slab + s->off);
+        *(ze_kernel_timestamp_result_t *)((char *)s->chunk->slab + s->off);
     do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, attr,
                   ZE_RESULT_SUCCESS, ZE_RESULT_SUCCESS,
                   r.global.kernelStart, r.global.kernelEnd,
                   r.context.kernelStart, r.context.kernelEnd);
   }
   _latest_clear_if(s->attr, s);
-  /* Drop our refs on preds; release any that reached refs==0 & live==0. */
+  /* Drop refs on preds; release any that hit 0 and are already drained. */
   for (uint32_t i = 0; i < s->n_preds; ++i) {
     struct _ze_slot *p = s->preds[i];
     if (__atomic_sub_fetch(&p->refs, 1, __ATOMIC_RELAXED) == 0 && !p->live)
       _slot_release(p);
   }
-  /* Per-run preds reset; build-time fields (inj, attr, off, waits) stay
-   * so the next Execute can re-instantiate without re-Append. */
   free(s->preds); s->preds = NULL; s->n_preds = 0;
-  /* If no downstream slot holds an edge on us, release immediately. */
   if (__atomic_load_n(&s->refs, __ATOMIC_RELAXED) == 0)
     _slot_release(s);
 }
 
-/* Drain every live slot in a cl. */
+/* Drain every live slot in a cl (walk chunks oldest-to-newest, slots
+ * low-to-high — natural time order for emission). */
 static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
-  for (uint32_t i = 0; i < cl_data->n_slots; ++i)
-    _slot_drain(&cl_data->slots[i]);
+  struct _ze_slab_chunk *c, *tmp;
+  DL_FOREACH_SAFE(cl_data->chunks, c, tmp)
+    for (uint32_t i = 0; i < c->n_used; ++i)
+      _slot_drain(&c->slots[i]);
   cl_data->in_flight_q = NULL;
 }
 
@@ -800,6 +833,41 @@ static void _on_sync_drain_cl(ze_command_list_handle_t command_list) {
   pthread_mutex_unlock(&cl_data->mtx);
 }
 
+/* zeCommandListDestroy epilogue. The L0 spec says the user must have
+ * ensured the device is no longer referencing the cl, so we don't drain
+ * (the GPU is already idle on this cl). We just release our state:
+ * PUT every slot's tracer-owned events back to the per-context pool,
+ * free per-slot allocations, free every chunk's slab + chunk struct,
+ * remove cl_data from the registry, free cl_data itself.
+ *
+ * Works for both cl kinds: regular cls (inj baked into the cl body)
+ * can recycle inj here because the cl body is about to be destroyed by
+ * L0; immediate cls' slots have likely already been released at drain
+ * time but any stragglers get cleaned up too. */
+static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
+  struct _ze_command_list_obj_data *cl_data = NULL;
+  FIND_AND_DEL_ZE_CL(&command_list, cl_data);
+  if (!cl_data) return;
+  pthread_mutex_lock(&cl_data->mtx);
+  struct _ze_slab_chunk *c, *tmp;
+  DL_FOREACH_SAFE(cl_data->chunks, c, tmp) {
+    for (uint32_t i = 0; i < c->n_used; ++i) {
+      struct _ze_slot *s = &c->slots[i];
+      if (s->inj)         PUT_ZE_EVENT(s->inj);
+      if (s->shadow_done) PUT_ZE_EVENT(s->shadow_done);
+      free(s->waits);
+      free(s->preds);
+      _latest_clear_if(s->attr, s);
+    }
+    DL_DELETE(cl_data->chunks, c);
+    if (c->slab) ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
+    free(c);
+  }
+  pthread_mutex_unlock(&cl_data->mtx);
+  pthread_mutex_destroy(&cl_data->mtx);
+  free(cl_data);
+}
+
 /* Drain every cl whose in_flight_q matches. */
 static void _on_sync_drain_queue(ze_command_queue_handle_t hQueue) {
   pthread_mutex_lock(&_ze_cls_mutex);
@@ -823,8 +891,12 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
   /* The drained slot may have left siblings live; only clear
    * in_flight_q if nothing in this cl remains in flight. */
   int any_live = 0;
-  for (uint32_t i = 0; i < s->owner->n_slots; ++i)
-    if (s->owner->slots[i].live) { any_live = 1; break; }
+  struct _ze_slab_chunk *c;
+  DL_FOREACH(s->owner->chunks, c) {
+    for (uint32_t i = 0; i < c->n_used; ++i)
+      if (c->slots[i].live) { any_live = 1; break; }
+    if (any_live) break;
+  }
   if (!any_live) s->owner->in_flight_q = NULL;
   pthread_mutex_unlock(&s->owner->mtx);
 }
@@ -860,13 +932,16 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
   ze_context_handle_t ctx = _cl_cache_context(cl_data, command_list);
   ze_device_handle_t  dev = _cl_cache_device(cl_data, command_list);
   struct _ze_shadow_cl *sh = (ctx && dev) ? _get_shadow_cl(ctx, dev) : NULL;
-  for (uint32_t j = 0; j < cl_data->n_slots; ++j) {
-    struct _ze_slot *slot = &cl_data->slots[j];
-    if (!sh || !slot->inj || !slot->shadow_done) continue;
-    if (_shadow_append_query(sh, slot->inj->event, cl_data->slab,
-                             &slot->off, slot->shadow_done->event) != 0)
-      continue;  /* slot stays not-live this round; we miss this timing */
-    _slot_instantiate(cl_data, slot);
+  struct _ze_slab_chunk *c;
+  DL_FOREACH(cl_data->chunks, c) {
+    for (uint32_t j = 0; j < c->n_used; ++j) {
+      struct _ze_slot *slot = &c->slots[j];
+      if (!sh || !slot->inj || !slot->shadow_done) continue;
+      if (_shadow_append_query(sh, slot->inj->event, c->slab,
+                               &slot->off, slot->shadow_done->event) != 0)
+        continue;  /* slot stays not-live this round; we miss this timing */
+      _slot_instantiate(cl_data, slot);
+    }
   }
   cl_data->in_flight_q = hQueue;
 
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index b5c0d70a..10c263f9 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -155,9 +155,20 @@ def upper_snake_case(str)
   }
 EOF
 
-# Reset / Destroy hooks intentionally omitted: the user must have
-# synchronized before they reset or destroy the cmdlist, so all our
-# slots are already drained.
+# Reset hook intentionally omitted: the L0 spec
+# (https://oneapi-src.github.io/level-zero-spec/level-zero/latest/core/api.html#zecommandlistreset)
+# says the user must have synchronized first, so all our slots are
+# already drained.
+#
+# Destroy hook: the same spec rule applies for the GPU side (no in-flight
+# work on the cl), but we still need to clean up OUR host-side state —
+# slot/slab chunks, per-slot waits, and tracer-owned events that haven't
+# already gone back to the pool. Otherwise every cl create/destroy cycle
+# leaks all of the above.
+register_epilogue 'zeCommandListDestroy', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && hCommandList)
+    _on_destroy_command_list(hCommandList);
+EOF
 
 # Epilogue runs after L0's actual submission has returned. ALL the
 # tracer's bookkeeping for Execute happens here (no prologue) so that

From 567691167b19b4224fdc1a99a65e06fab0f5b503 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Thu, 4 Jun 2026 22:25:20 +0000
Subject: [PATCH 24/54] ze: tear down our context-bound state in
 zeContextDestroy prologue

The tracer was leaving its own L0 objects (the event pools/events that
back the recycle pool, and the shadow command lists keyed by
(ctx, device)) alive inside the user's context. When the user app
destroyed the context before destroying a command list that had our
injected events baked into it (the Intel libomptarget L0 plugin does
this at _dl_fini), the subsequent zeCommandListDestroy call segfaulted
inside libze_intel_gpu.so.

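Add a zeContextDestroy prologue that, while the context is still
valid, walks _ze_cls / _ze_shadow_cls / _ze_event_pools and tears down
everything we own that is keyed to the dying context: the per-cl
slot/chunk bookkeeping, the per-(ctx, device) shadow command lists, and
the pooled events together with their event pools. The slab host
allocations are deliberately not passed to zeMemFree; they are left for
the driver to reclaim along with the context.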
---
 backends/ze/tracer_ze_helpers.include.c | 78 +++++++++++++++++++++++++
 backends/ze/ze_model.rb                 | 10 ++++
 2 files changed, 88 insertions(+)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index b2fe9b27..874a6c46 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -868,6 +868,84 @@ static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
   free(cl_data);
 }
 
+/* zeContextDestroy prologue. The user contract is that the device is no
+ * longer referencing the context, so all cls/events bound to it are
+ * conceptually dead from the user's perspective. Our job here is solely
+ * to avoid leaking our own L0 objects that live inside this context:
+ *
+ *   1) cls registered against this ctx: free their slot/slab/chunk state
+ *      (drop tracer-owned events to L0 without re-pooling — the pool is
+ *      about to die anyway).
+ *   2) per-(ctx, device) shadow cls: zeCommandListDestroy them.
+ *   3) per-ctx event-pool freelist: zeEventDestroy + zeEventPoolDestroy
+ *      each wrapper, recycle the wrapper structs.
+ *
+ * Forwards no calls about the user's own cls/events to the driver — the
+ * user takes care of those (or accepts the contract). */
+static void _on_destroy_context(ze_context_handle_t hContext) {
+  /* 1) Drop cls bound to this ctx. */
+  pthread_mutex_lock(&_ze_cls_mutex);
+  struct _ze_command_list_obj_data *cl_data = NULL, *cl_tmp = NULL;
+  HASH_ITER(hh, _ze_cls, cl_data, cl_tmp) {
+    if (cl_data->cached_context != hContext) continue;
+    HASH_DEL(_ze_cls, cl_data);
+    pthread_mutex_lock(&cl_data->mtx);
+    struct _ze_slab_chunk *c, *ctmp;
+    DL_FOREACH_SAFE(cl_data->chunks, c, ctmp) {
+      for (uint32_t i = 0; i < c->n_used; ++i) {
+        struct _ze_slot *s = &c->slots[i];
+        /* Recycle our event wrappers but DON'T return them to the per-ctx
+         * pool — the pool entry will be wiped in step 3, and we want the
+         * underlying L0 event/pool destroyed there too, not here. The
+         * wrappers themselves are context-agnostic, so reuse them. */
+        if (s->inj)         PUT_ZE_EVENT_WRAPPER(s->inj);
+        if (s->shadow_done) PUT_ZE_EVENT_WRAPPER(s->shadow_done);
+        free(s->waits);
+        free(s->preds);
+        _latest_clear_if(s->attr, s);
+      }
+      DL_DELETE(cl_data->chunks, c);
+      /* Skip zeMemFree on the slab — the ctx is being destroyed; the
+       * driver will reclaim the device allocation. Calling zeMemFree
+       * on a doomed ctx is at best racy. */
+      free(c);
+    }
+    pthread_mutex_unlock(&cl_data->mtx);
+    pthread_mutex_destroy(&cl_data->mtx);
+    free(cl_data);
+  }
+  pthread_mutex_unlock(&_ze_cls_mutex);
+
+  /* 2) Shadow cls keyed by (ctx, device). */
+  pthread_mutex_lock(&_ze_shadow_cls_mutex);
+  struct _ze_shadow_cl *sh = NULL, *sh_tmp = NULL;
+  HASH_ITER(hh, _ze_shadow_cls, sh, sh_tmp) {
+    if (sh->key.context != hContext) continue;
+    HASH_DEL(_ze_shadow_cls, sh);
+    if (sh->cl) ZE_COMMAND_LIST_DESTROY_PTR(sh->cl);
+    pthread_mutex_destroy(&sh->mtx);
+    free(sh);
+  }
+  pthread_mutex_unlock(&_ze_shadow_cls_mutex);
+
+  /* 3) Per-ctx event pool freelist. */
+  pthread_mutex_lock(&_ze_event_pools_mutex);
+  struct _ze_event_pool_entry *pe = NULL;
+  HASH_FIND_PTR(_ze_event_pools, &hContext, pe);
+  if (pe) {
+    HASH_DEL(_ze_event_pools, pe);
+    struct _ze_event_h *w, *w_tmp;
+    DL_FOREACH_SAFE(pe->events, w, w_tmp) {
+      if (w->event)      ZE_EVENT_DESTROY_PTR(w->event);
+      if (w->event_pool) ZE_EVENT_POOL_DESTROY_PTR(w->event_pool);
+      DL_DELETE(pe->events, w);
+      PUT_ZE_EVENT_WRAPPER(w);
+    }
+    free(pe);
+  }
+  pthread_mutex_unlock(&_ze_event_pools_mutex);
+}
+
 /* Drain every cl whose in_flight_q matches. */
 static void _on_sync_drain_queue(ze_command_queue_handle_t hQueue) {
   pthread_mutex_lock(&_ze_cls_mutex);
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index 10c263f9..e219f608 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -170,6 +170,16 @@ def upper_snake_case(str)
     _on_destroy_command_list(hCommandList);
 EOF
 
+# zeContextDestroy prologue: tear down our own L0 objects that live
+# inside this context (shadow cls, per-ctx event pools/events) BEFORE the
+# user destroys the context. The L0 spec says the user has ensured the
+# device is no longer referencing the context, so all user-side cls/events
+# are already done — we just need to not leak our allocations.
+register_prologue 'zeContextDestroy', <<EOF
+  if (_do_profile && hContext)
+    _on_destroy_context(hContext);
+EOF
+
 # Epilogue runs after L0's actual submission has returned. ALL the
 # tracer's bookkeeping for Execute happens here (no prologue) so that
 # concurrent Executes / Syncs from other threads observe in_flight_q

From 88100226e2f9b94b2557f4f8e9bf2a478d0a910a Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Mon, 8 Jun 2026 18:47:36 +0000
Subject: [PATCH 25/54] ze: fix rubocop offenses in ze_model.rb

---
 backends/ze/ze_model.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index e219f608..d5eedf68 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -305,7 +305,7 @@ def upper_snake_case(str)
 EOF
 }
 
-profiling_epilogue = lambda { |event_name, waits_expr = "phWaitEvents", n_waits_expr = "numWaitEvents"|
+profiling_epilogue = lambda { |_event_name, waits_expr = 'phWaitEvents', n_waits_expr = 'numWaitEvents'|
   <<EOF
   if (_do_profile && _ewrapper) {
     if (_retval == ZE_RESULT_SUCCESS) {

From 48e384375134d53941e1b8c7f96d16dfdd356171 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Mon, 8 Jun 2026 18:49:04 +0000
Subject: [PATCH 26/54] ze: clang-format tracer_ze_helpers.include.c

---
 backends/ze/tracer_ze_helpers.include.c | 329 ++++++++++++++----------
 1 file changed, 196 insertions(+), 133 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 874a6c46..e8dafeda 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -77,26 +77,27 @@ struct _ze_slab_chunk;
  * after emit, the slot is dropped from the cl's list. */
 struct _ze_slot {
   struct _ze_command_list_obj_data *owner; /* cl_data this slot lives in */
-  struct _ze_slab_chunk *chunk;        /* chunk this slot lives in (==> .slab to read at drain) */
-  struct _ze_event_h *inj;             /* tracer-owned event the Query waits on */
-  struct _ze_event_h *shadow_done;     /* tracer-owned event the Query signals; drain host-syncs on this */
-  ze_event_handle_t   attr;            /* user's original signal event (NULL => inj->event) */
-  size_t              off;             /* byte offset within chunk->slab */
+  struct _ze_slab_chunk *chunk; /* chunk this slot lives in (==> .slab to read at drain) */
+  struct _ze_event_h *inj;      /* tracer-owned event the Query waits on */
+  struct _ze_event_h
+      *shadow_done;       /* tracer-owned event the Query signals; drain host-syncs on this */
+  ze_event_handle_t attr; /* user's original signal event (NULL => inj->event) */
+  size_t off;             /* byte offset within chunk->slab */
   /* User wait events copied at Append time (stable across rebuilds);
    * preds[] is computed at instantiate from waits[] by looking up
    * latest[w] for each w. */
-  ze_event_handle_t  *waits;
-  uint32_t            n_waits;
-  struct _ze_slot   **preds;           /* points at slots whose drain must come first (may be in another cl) */
-  uint32_t            n_preds;
-  unsigned char       live;            /* in-flight (instantiated, not drained) */
+  ze_event_handle_t *waits;
+  uint32_t n_waits;
+  struct _ze_slot **preds; /* points at slots whose drain must come first (may be in another cl) */
+  uint32_t n_preds;
+  unsigned char live; /* in-flight (instantiated, not drained) */
   /* Incoming pred edges: count of downstream slots whose preds[] points
    * here AND that have not yet been drained. Incremented at downstream
    * _slot_instantiate (one per pred edge), decremented at downstream
    * _slot_drain. Slot is reclaimable iff live==0 AND refs==0. Atomic
    * because increment/decrement happen across cl boundaries without
    * holding the slot's owner mtx. */
-  uint32_t            refs;
+  uint32_t refs;
 };
 
 #define _ZE_SLAB_CHUNK_SLOTS 64
@@ -111,19 +112,19 @@ struct _ze_slot {
  * chunk frees itself when n_held drops to 0 AND it is not the tail
  * (new Appends still want to land on the tail). */
 struct _ze_slab_chunk {
-  void                  *slab;          /* _ZE_SLAB_CHUNK_SLOTS * sizeof(ze_kernel_timestamp_result_t) */
-  ze_context_handle_t    slab_ctx;      /* context the slab was allocated against (zeMemFree target) */
-  uint32_t               n_used;        /* slots ever assigned in this chunk (monotonic until chunk free) */
-  uint32_t               n_held;        /* unreleased slots (n_used minus _slot_release calls) */
+  void *slab;                   /* _ZE_SLAB_CHUNK_SLOTS * sizeof(ze_kernel_timestamp_result_t) */
+  ze_context_handle_t slab_ctx; /* context the slab was allocated against (zeMemFree target) */
+  uint32_t n_used;              /* slots ever assigned in this chunk (monotonic until chunk free) */
+  uint32_t n_held;              /* unreleased slots (n_used minus _slot_release calls) */
   struct _ze_slab_chunk *next, *prev;
-  struct _ze_slot        slots[_ZE_SLAB_CHUNK_SLOTS];
+  struct _ze_slot slots[_ZE_SLAB_CHUNK_SLOTS];
 };
 
 struct _ze_command_list_obj_data {
   void *ptr;
   UT_hash_handle hh;
 
-  struct _ze_slab_chunk *chunks;        /* utlist DL_ head; tail = chunks->prev (circular) */
+  struct _ze_slab_chunk *chunks; /* utlist DL_ head; tail = chunks->prev (circular) */
 
   /* in_flight_q is the queue this cl was last Executed on AND not yet
    * drained. NULL means "not in flight" — safe to Execute without a
@@ -134,14 +135,14 @@ struct _ze_command_list_obj_data {
   /* Serializes the Execute prologue: if two threads race to Execute the
    * same closed cl on different queues, we need to force-sync the prior
    * one before letting the second run instantiate. */
-  pthread_mutex_t    mtx;
-  unsigned char      is_immediate;
-  unsigned char      is_in_order;
+  pthread_mutex_t mtx;
+  unsigned char is_immediate;
+  unsigned char is_in_order;
 
   /* Cached on first use: device handle and context handle for this cl.
    * Both are immutable for the life of the cl, so caching avoids the
    * per-Append/per-Execute ZE_*_GET_*_HANDLE_PTR roundtrips. */
-  ze_device_handle_t  cached_device;
+  ze_device_handle_t cached_device;
   ze_context_handle_t cached_context;
 };
 
@@ -178,9 +179,9 @@ pthread_mutex_t _ze_cls_mutex = PTHREAD_MUTEX_INITIALIZER;
  * compute group on this device" — treated as fatal at use sites. */
 struct _ze_compute_ord_entry {
   ze_device_handle_t device;
-  uint32_t           ordinal;
-  unsigned char      valid;
-  UT_hash_handle     hh;
+  uint32_t ordinal;
+  unsigned char valid;
+  UT_hash_handle hh;
 };
 static struct _ze_compute_ord_entry *_ze_compute_ords = NULL;
 static pthread_mutex_t _ze_compute_ords_mutex = PTHREAD_MUTEX_INITIALIZER;
@@ -200,20 +201,23 @@ static uint32_t _get_compute_ordinal(ze_device_handle_t device) {
 
   /* Slow path: scan queue groups outside the lock. */
   uint32_t n_groups = 0;
-  if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, NULL)
-      != ZE_RESULT_SUCCESS || n_groups == 0)
+  if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, NULL) !=
+          ZE_RESULT_SUCCESS ||
+      n_groups == 0)
     return (uint32_t)-1;
   ze_command_queue_group_properties_t *groups =
       (ze_command_queue_group_properties_t *)calloc(n_groups, sizeof(*groups));
-  if (!groups) return (uint32_t)-1;
+  if (!groups)
+    return (uint32_t)-1;
   for (uint32_t i = 0; i < n_groups; ++i)
     groups[i].stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES;
   uint32_t found = (uint32_t)-1;
-  if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, groups)
-      == ZE_RESULT_SUCCESS) {
+  if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, groups) ==
+      ZE_RESULT_SUCCESS) {
     for (uint32_t i = 0; i < n_groups; ++i)
       if (groups[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
-        found = i; break;
+        found = i;
+        break;
       }
   }
   free(groups);
@@ -224,9 +228,9 @@ static uint32_t _get_compute_ordinal(ze_device_handle_t device) {
   if (!e) {
     e = (struct _ze_compute_ord_entry *)calloc(1, sizeof(*e));
     if (e) {
-      e->device  = device;
+      e->device = device;
       e->ordinal = found;
-      e->valid   = (found != (uint32_t)-1) ? 1 : 0;
+      e->valid = (found != (uint32_t)-1) ? 1 : 0;
       HASH_ADD_PTR(_ze_compute_ords, device, e);
     }
   } else {
@@ -243,13 +247,13 @@ static uint32_t _get_compute_ordinal(ze_device_handle_t device) {
  * is identical regardless of user-cl kind. */
 struct _ze_shadow_key {
   ze_context_handle_t context;
-  ze_device_handle_t  device;
+  ze_device_handle_t device;
 };
 struct _ze_shadow_cl {
-  struct _ze_shadow_key    key;
+  struct _ze_shadow_key key;
   ze_command_list_handle_t cl;
-  pthread_mutex_t          mtx;
-  UT_hash_handle           hh;
+  pthread_mutex_t mtx;
+  UT_hash_handle hh;
 };
 static struct _ze_shadow_cl *_ze_shadow_cls = NULL;
 static pthread_mutex_t _ze_shadow_cls_mutex = PTHREAD_MUTEX_INITIALIZER;
@@ -259,19 +263,24 @@ static pthread_mutex_t _ze_shadow_cls_mutex = PTHREAD_MUTEX_INITIALIZER;
  * we log to stderr) or if creation fails. */
 static struct _ze_shadow_cl *_get_shadow_cl(ze_context_handle_t context,
                                             ze_device_handle_t device) {
-  struct _ze_shadow_key key = { context, device };
+  struct _ze_shadow_key key = {context, device};
   pthread_mutex_lock(&_ze_shadow_cls_mutex);
   struct _ze_shadow_cl *sh = NULL;
   HASH_FIND(hh, _ze_shadow_cls, &key, sizeof(key), sh);
-  if (sh) { pthread_mutex_unlock(&_ze_shadow_cls_mutex); return sh; }
+  if (sh) {
+    pthread_mutex_unlock(&_ze_shadow_cls_mutex);
+    return sh;
+  }
   pthread_mutex_unlock(&_ze_shadow_cls_mutex);
 
   /* Slow path: create outside the registry lock. */
   uint32_t ord = _get_compute_ordinal(device);
   if (ord == (uint32_t)-1) {
-    fprintf(stderr, "THAPI: device %p has no COMPUTE queue group; "
-                    "cannot create shadow cl. Profiling disabled for "
-                    "command lists on this device.\n", (void *)device);
+    fprintf(stderr,
+            "THAPI: device %p has no COMPUTE queue group; "
+            "cannot create shadow cl. Profiling disabled for "
+            "command lists on this device.\n",
+            (void *)device);
     return NULL;
   }
   /* ASYNCHRONOUS mode is critical: with SYNCHRONOUS (the DEFAULT),
@@ -282,13 +291,15 @@ static struct _ze_shadow_cl *_get_shadow_cl(ze_context_handle_t context,
    * Deadlock. ASYNCHRONOUS lets the Append return immediately and the
    * Query run device-side at its own pace. */
   ze_command_queue_desc_t qd = {
-      ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, NULL, ord, 0, 0,
-      ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL };
+      ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, NULL, ord, 0, 0, ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+      ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
   ze_command_list_handle_t new_cl = NULL;
-  if (ZE_COMMAND_LIST_CREATE_IMMEDIATE_PTR(context, device, &qd, &new_cl)
-        != ZE_RESULT_SUCCESS || !new_cl) {
-    fprintf(stderr, "THAPI: failed to create shadow cl for "
-                    "context=%p device=%p\n", (void *)context, (void *)device);
+  if (ZE_COMMAND_LIST_CREATE_IMMEDIATE_PTR(context, device, &qd, &new_cl) != ZE_RESULT_SUCCESS ||
+      !new_cl) {
+    fprintf(stderr,
+            "THAPI: failed to create shadow cl for "
+            "context=%p device=%p\n",
+            (void *)context, (void *)device);
     return NULL;
   }
 
@@ -307,7 +318,7 @@ static struct _ze_shadow_cl *_get_shadow_cl(ze_context_handle_t context,
     return NULL;
   }
   sh->key = key;
-  sh->cl  = new_cl;
+  sh->cl = new_cl;
   pthread_mutex_init(&sh->mtx, NULL);
   HASH_ADD(hh, _ze_shadow_cls, key, sizeof(sh->key), sh);
   pthread_mutex_unlock(&_ze_shadow_cls_mutex);
@@ -320,13 +331,14 @@ static struct _ze_shadow_cl *_get_shadow_cl(ze_context_handle_t context,
  * Returns 0 on success, -1 on failure. */
 static int _shadow_append_query(struct _ze_shadow_cl *sh,
                                 ze_event_handle_t inj_event,
-                                void *slab, size_t *off,
+                                void *slab,
+                                size_t *off,
                                 ze_event_handle_t shadow_done_event) {
   pthread_mutex_lock(&sh->mtx);
-  ze_result_t r = ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(
-      sh->cl, 1, &inj_event, slab, off,
-      /*hSignalEvent=*/ shadow_done_event,
-      /*numWaitEvents=*/ 1, &inj_event);
+  ze_result_t r =
+      ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(sh->cl, 1, &inj_event, slab, off,
+                                                         /*hSignalEvent=*/shadow_done_event,
+                                                         /*numWaitEvents=*/1, &inj_event);
   pthread_mutex_unlock(&sh->mtx);
   return (r == ZE_RESULT_SUCCESS) ? 0 : -1;
 }
@@ -336,7 +348,8 @@ static int _shadow_append_query(struct _ze_shadow_cl *sh,
  * roundtrip on every Append/Execute. Returns NULL on L0 error. */
 static ze_device_handle_t _cl_cache_device(struct _ze_command_list_obj_data *cl_data,
                                            ze_command_list_handle_t command_list) {
-  if (cl_data->cached_device) return cl_data->cached_device;
+  if (cl_data->cached_device)
+    return cl_data->cached_device;
   ze_device_handle_t d = NULL;
   if (ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(command_list, &d) == ZE_RESULT_SUCCESS)
     cl_data->cached_device = d;
@@ -345,15 +358,16 @@ static ze_device_handle_t _cl_cache_device(struct _ze_command_list_obj_data *cl_
 
 static ze_context_handle_t _cl_cache_context(struct _ze_command_list_obj_data *cl_data,
                                              ze_command_list_handle_t command_list) {
-  if (cl_data->cached_context) return cl_data->cached_context;
+  if (cl_data->cached_context)
+    return cl_data->cached_context;
   ze_context_handle_t c = NULL;
   if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &c) == ZE_RESULT_SUCCESS)
     cl_data->cached_context = c;
   return c;
 }
 
-static inline void _on_create_command_list(ze_command_list_handle_t command_list,
-                                            int immediate, int in_order) {
+static inline void
+_on_create_command_list(ze_command_list_handle_t command_list, int immediate, int in_order) {
   struct _ze_command_list_obj_data *cl_data = NULL;
 
   FIND_ZE_CL(&command_list, cl_data);
@@ -398,9 +412,9 @@ static pthread_mutex_t _ze_event_pools_mutex = PTHREAD_MUTEX_INITIALIZER;
  * the latest slot for ev as a pred. Updated at instantiate and cleared
  * at drain. */
 struct _ze_latest_entry {
-  ze_event_handle_t ev;     /* key */
-  struct _ze_slot  *slot;
-  UT_hash_handle    hh;
+  ze_event_handle_t ev; /* key */
+  struct _ze_slot *slot;
+  UT_hash_handle hh;
 };
 static struct _ze_latest_entry *_ze_latest = NULL;
 static pthread_mutex_t _ze_latest_mutex = PTHREAD_MUTEX_INITIALIZER;
@@ -415,13 +429,17 @@ static inline struct _ze_slot *_latest_get(ze_event_handle_t ev) {
 }
 
 static inline void _latest_set(ze_event_handle_t ev, struct _ze_slot *s) {
-  if (!ev) return;
+  if (!ev)
+    return;
   pthread_mutex_lock(&_ze_latest_mutex);
   struct _ze_latest_entry *e = NULL;
   HASH_FIND_PTR(_ze_latest, &ev, e);
   if (!e) {
     e = (struct _ze_latest_entry *)calloc(1, sizeof(*e));
-    if (!e) { pthread_mutex_unlock(&_ze_latest_mutex); return; }
+    if (!e) {
+      pthread_mutex_unlock(&_ze_latest_mutex);
+      return;
+    }
     e->ev = ev;
     HASH_ADD_PTR(_ze_latest, ev, e);
   }
@@ -433,7 +451,8 @@ static inline void _latest_set(ze_event_handle_t ev, struct _ze_slot *s) {
  * being drained — but if a newer Append already overwrote latest[ev],
  * don't clobber that). */
 static inline void _latest_clear_if(ze_event_handle_t ev, struct _ze_slot *s) {
-  if (!ev) return;
+  if (!ev)
+    return;
   pthread_mutex_lock(&_ze_latest_mutex);
   struct _ze_latest_entry *e = NULL;
   HASH_FIND_PTR(_ze_latest, &ev, e);
@@ -465,7 +484,7 @@ static inline void _latest_clear_if(ze_event_handle_t ev, struct _ze_slot *s) {
     if (!pool) {                                                                                   \
       pool = (struct _ze_event_pool_entry *)calloc(1, sizeof(struct _ze_event_pool_entry));        \
       if (!pool) {                                                                                 \
-        THAPI_DBGLOG("Failed to allocate memory");                                         \
+        THAPI_DBGLOG("Failed to allocate memory");                                                 \
         pthread_mutex_unlock(&_ze_event_pools_mutex);                                              \
         if (val->event_pool) {                                                                     \
           if (val->event)                                                                          \
@@ -555,11 +574,12 @@ static struct _ze_event_h *_get_profiling_event(ze_command_list_handle_t command
 static struct _ze_slab_chunk *_cl_chunk_alloc(struct _ze_command_list_obj_data *cl_data,
                                               ze_context_handle_t ctx) {
   struct _ze_slab_chunk *c = (struct _ze_slab_chunk *)calloc(1, sizeof(*c));
-  if (!c) return NULL;
+  if (!c)
+    return NULL;
   size_t bytes = (size_t)_ZE_SLAB_CHUNK_SLOTS * sizeof(ze_kernel_timestamp_result_t);
   ze_host_mem_alloc_desc_t hd = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, NULL, 0};
-  if (ZE_MEM_ALLOC_HOST_PTR(ctx, &hd, bytes, sizeof(uint64_t), &c->slab) != ZE_RESULT_SUCCESS
-      || !c->slab) {
+  if (ZE_MEM_ALLOC_HOST_PTR(ctx, &hd, bytes, sizeof(uint64_t), &c->slab) != ZE_RESULT_SUCCESS ||
+      !c->slab) {
     free(c);
     return NULL;
   }
@@ -582,19 +602,21 @@ static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_dat
                                         uint32_t n_waits) {
   struct _ze_slab_chunk *tail = cl_data->chunks ? cl_data->chunks->prev : NULL;
   if (!tail || tail->n_used >= _ZE_SLAB_CHUNK_SLOTS) {
-    if (tail && !cl_data->is_immediate) return NULL;
+    if (tail && !cl_data->is_immediate)
+      return NULL;
     tail = _cl_chunk_alloc(cl_data, ctx);
-    if (!tail) return NULL;
+    if (!tail)
+      return NULL;
   }
   uint32_t idx = tail->n_used;
   struct _ze_slot *s = &tail->slots[idx];
   /* Chunk memory is calloc'd, so all other slot fields are already zero. */
-  s->owner       = cl_data;
-  s->chunk       = tail;
-  s->inj         = inj;
+  s->owner = cl_data;
+  s->chunk = tail;
+  s->inj = inj;
   s->shadow_done = shadow_done;
-  s->attr        = attr;
-  s->off         = (size_t)idx * sizeof(ze_kernel_timestamp_result_t);
+  s->attr = attr;
+  s->off = (size_t)idx * sizeof(ze_kernel_timestamp_result_t);
   if (n_waits) {
     s->waits = (ze_event_handle_t *)malloc(n_waits * sizeof(ze_event_handle_t));
     if (s->waits) {
@@ -610,15 +632,15 @@ static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_dat
 /* Compute s->preds from s->waits via the global latest[] map, plus the
  * previous live slot on this cl if the cl is in-order. Marks s live and
  * publishes s as the new latest[attr]. */
-static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data,
-                              struct _ze_slot *s) {
+static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct _ze_slot *s) {
   s->live = 1;
   uint32_t cap = s->n_waits + 1; /* +1 for in-order prev */
   s->preds = (struct _ze_slot **)calloc(cap, sizeof(struct _ze_slot *));
   s->n_preds = 0;
   for (uint32_t i = 0; i < s->n_waits; ++i) {
     struct _ze_slot *p = _latest_get(s->waits[i]);
-    if (p && p->live) s->preds[s->n_preds++] = p;
+    if (p && p->live)
+      s->preds[s->n_preds++] = p;
   }
   if (cl_data->is_in_order) {
     /* Walk chunks newest-to-oldest, slots high-to-low, stop at the first
@@ -628,20 +650,25 @@ static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data,
      * live=0 here but the !=s guard is safe and clearer. */
     struct _ze_slab_chunk *c;
     struct _ze_slot *prev = NULL;
-    for (c = cl_data->chunks ? cl_data->chunks->prev : NULL;
-         c && !prev;
+    for (c = cl_data->chunks ? cl_data->chunks->prev : NULL; c && !prev;
          c = (c == cl_data->chunks) ? NULL : c->prev) {
       for (int32_t i = (int32_t)c->n_used - 1; i >= 0; --i) {
-        if (&c->slots[i] == s) continue;
-        if (c->slots[i].live) { prev = &c->slots[i]; break; }
+        if (&c->slots[i] == s)
+          continue;
+        if (c->slots[i].live) {
+          prev = &c->slots[i];
+          break;
+        }
       }
     }
-    if (prev) s->preds[s->n_preds++] = prev;
+    if (prev)
+      s->preds[s->n_preds++] = prev;
   }
   /* Each new pred edge holds a ref on its target. */
   for (uint32_t i = 0; i < s->n_preds; ++i)
     __atomic_fetch_add(&s->preds[i]->refs, 1, __ATOMIC_RELAXED);
-  if (s->attr) _latest_set(s->attr, s);
+  if (s->attr)
+    _latest_set(s->attr, s);
 }
 
 /* Append-time hook called from profiling_epilogue. Caller already
@@ -658,7 +685,8 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
                                      ze_event_handle_t user_signal,
                                      ze_event_handle_t *user_waits,
                                      uint32_t user_n_waits) {
-  if (!inj) return;
+  if (!inj)
+    return;
   struct _ze_event_h *shadow_done = NULL;
   struct _ze_command_list_obj_data *cl_data = NULL;
   struct _ze_slot *s = NULL;
@@ -673,8 +701,8 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
    * no chaining is needed. */
   if (user_signal) {
     ze_event_handle_t wait_ev = inj->event;
-    if (ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &wait_ev)
-        != ZE_RESULT_SUCCESS)
+    if (ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &wait_ev) !=
+        ZE_RESULT_SUCCESS)
       goto fail;
   }
 
@@ -688,16 +716,18 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
    * user sync on user_signal — required because the Query lives on a
    * separate shadow cl whose completion isn't implied by user-level sync. */
   shadow_done = _get_profiling_event(command_list);
-  if (!shadow_done) goto fail;
+  if (!shadow_done)
+    goto fail;
   shadow_done->context = ctx;
 
   FIND_AND_DEL_ZE_CL(&command_list, cl_data);
-  if (!cl_data) goto fail;
+  if (!cl_data)
+    goto fail;
   pthread_mutex_lock(&cl_data->mtx);
 
-  s = _cl_slot_append(cl_data, ctx, inj, shadow_done,
-                      user_signal, user_waits, user_n_waits);
-  if (!s) goto fail_locked;
+  s = _cl_slot_append(cl_data, ctx, inj, shadow_done, user_signal, user_waits, user_n_waits);
+  if (!s)
+    goto fail_locked;
 
   /* The Query Append now lives on the per-(context, device) shadow
    * compute cl rather than the user cl. This is what lets us profile
@@ -710,8 +740,8 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     cl_data->cached_context = ctx;
     ze_device_handle_t dev = _cl_cache_device(cl_data, command_list);
     struct _ze_shadow_cl *sh = dev ? _get_shadow_cl(ctx, dev) : NULL;
-    if (!sh || _shadow_append_query(sh, inj->event, s->chunk->slab, &s->off,
-                                     shadow_done->event) != 0)
+    if (!sh ||
+        _shadow_append_query(sh, inj->event, s->chunk->slab, &s->off, shadow_done->event) != 0)
       goto fail_locked;
     _slot_instantiate(cl_data, s);
   }
@@ -733,14 +763,16 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     memset(s, 0, sizeof(*s));
     if (c->n_used == 0) {
       DL_DELETE(cl_data->chunks, c);
-      if (c->slab) ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
+      if (c->slab)
+        ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
       free(c);
     }
   }
   pthread_mutex_unlock(&cl_data->mtx);
   ADD_ZE_CL(cl_data);
 fail:
-  if (shadow_done) PUT_ZE_EVENT(shadow_done);
+  if (shadow_done)
+    PUT_ZE_EVENT(shadow_done);
   PUT_ZE_EVENT(inj);
 }
 
@@ -749,20 +781,29 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
  * tail, unlink and free it. Regular cls are skipped (their inj is
  * baked into the cl body — reclaim happens at cl destroy instead). */
 static void _slot_release(struct _ze_slot *s) {
-  if (!s || !s->owner || !s->owner->is_immediate) return;
-  if (s->inj)         { PUT_ZE_EVENT(s->inj);         s->inj         = NULL; }
-  if (s->shadow_done) { PUT_ZE_EVENT(s->shadow_done); s->shadow_done = NULL; }
+  if (!s || !s->owner || !s->owner->is_immediate)
+    return;
+  if (s->inj) {
+    PUT_ZE_EVENT(s->inj);
+    s->inj = NULL;
+  }
+  if (s->shadow_done) {
+    PUT_ZE_EVENT(s->shadow_done);
+    s->shadow_done = NULL;
+  }
   free(s->waits);
-  s->waits   = NULL;
+  s->waits = NULL;
   s->n_waits = 0;
-  s->attr    = NULL;
+  s->attr = NULL;
 
   struct _ze_slab_chunk *c = s->chunk;
   struct _ze_command_list_obj_data *cl = s->owner;
-  if (!c) return;
+  if (!c)
+    return;
   if (--c->n_held == 0 && c != cl->chunks->prev) {
     DL_DELETE(cl->chunks, c);
-    if (c->slab) ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
+    if (c->slab)
+      ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
     free(c);
   }
 }
@@ -780,7 +821,8 @@ static void _slot_release(struct _ze_slot *s) {
  * BEFORE us). Forming a cycle would require user-declared mutual waits,
  * which L0 itself deadlocks on. */
 static void _slot_drain(struct _ze_slot *s) {
-  if (!s || !s->live) return;
+  if (!s || !s->live)
+    return;
   for (uint32_t i = 0; i < s->n_preds; ++i)
     _slot_drain(s->preds[i]);
   s->live = 0;
@@ -796,9 +838,8 @@ static void _slot_drain(struct _ze_slot *s) {
       tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results)) {
     ze_kernel_timestamp_result_t r =
         *(ze_kernel_timestamp_result_t *)((char *)s->chunk->slab + s->off);
-    do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, attr,
-                  ZE_RESULT_SUCCESS, ZE_RESULT_SUCCESS,
-                  r.global.kernelStart, r.global.kernelEnd,
+    do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, attr, ZE_RESULT_SUCCESS,
+                  ZE_RESULT_SUCCESS, r.global.kernelStart, r.global.kernelEnd,
                   r.context.kernelStart, r.context.kernelEnd);
   }
   _latest_clear_if(s->attr, s);
@@ -808,7 +849,9 @@ static void _slot_drain(struct _ze_slot *s) {
     if (__atomic_sub_fetch(&p->refs, 1, __ATOMIC_RELAXED) == 0 && !p->live)
       _slot_release(p);
   }
-  free(s->preds); s->preds = NULL; s->n_preds = 0;
+  free(s->preds);
+  s->preds = NULL;
+  s->n_preds = 0;
   if (__atomic_load_n(&s->refs, __ATOMIC_RELAXED) == 0)
     _slot_release(s);
 }
@@ -818,8 +861,8 @@ static void _slot_drain(struct _ze_slot *s) {
 static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
   struct _ze_slab_chunk *c, *tmp;
   DL_FOREACH_SAFE(cl_data->chunks, c, tmp)
-    for (uint32_t i = 0; i < c->n_used; ++i)
-      _slot_drain(&c->slots[i]);
+  for (uint32_t i = 0; i < c->n_used; ++i)
+    _slot_drain(&c->slots[i]);
   cl_data->in_flight_q = NULL;
 }
 
@@ -827,7 +870,8 @@ static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
 static void _on_sync_drain_cl(ze_command_list_handle_t command_list) {
   struct _ze_command_list_obj_data *cl_data = NULL;
   FIND_ZE_CL(&command_list, cl_data);
-  if (!cl_data) return;
+  if (!cl_data)
+    return;
   pthread_mutex_lock(&cl_data->mtx);
   _cl_drain(cl_data);
   pthread_mutex_unlock(&cl_data->mtx);
@@ -847,20 +891,24 @@ static void _on_sync_drain_cl(ze_command_list_handle_t command_list) {
 static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
   struct _ze_command_list_obj_data *cl_data = NULL;
   FIND_AND_DEL_ZE_CL(&command_list, cl_data);
-  if (!cl_data) return;
+  if (!cl_data)
+    return;
   pthread_mutex_lock(&cl_data->mtx);
   struct _ze_slab_chunk *c, *tmp;
   DL_FOREACH_SAFE(cl_data->chunks, c, tmp) {
     for (uint32_t i = 0; i < c->n_used; ++i) {
       struct _ze_slot *s = &c->slots[i];
-      if (s->inj)         PUT_ZE_EVENT(s->inj);
-      if (s->shadow_done) PUT_ZE_EVENT(s->shadow_done);
+      if (s->inj)
+        PUT_ZE_EVENT(s->inj);
+      if (s->shadow_done)
+        PUT_ZE_EVENT(s->shadow_done);
       free(s->waits);
       free(s->preds);
       _latest_clear_if(s->attr, s);
     }
     DL_DELETE(cl_data->chunks, c);
-    if (c->slab) ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
+    if (c->slab)
+      ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
     free(c);
   }
   pthread_mutex_unlock(&cl_data->mtx);
@@ -887,7 +935,8 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
   pthread_mutex_lock(&_ze_cls_mutex);
   struct _ze_command_list_obj_data *cl_data = NULL, *cl_tmp = NULL;
   HASH_ITER(hh, _ze_cls, cl_data, cl_tmp) {
-    if (cl_data->cached_context != hContext) continue;
+    if (cl_data->cached_context != hContext)
+      continue;
     HASH_DEL(_ze_cls, cl_data);
     pthread_mutex_lock(&cl_data->mtx);
     struct _ze_slab_chunk *c, *ctmp;
@@ -898,8 +947,10 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
          * pool — the pool entry will be wiped in step 3, and we want the
          * underlying L0 event/pool destroyed there too, not here. The
          * wrappers themselves are context-agnostic, so reuse them. */
-        if (s->inj)         PUT_ZE_EVENT_WRAPPER(s->inj);
-        if (s->shadow_done) PUT_ZE_EVENT_WRAPPER(s->shadow_done);
+        if (s->inj)
+          PUT_ZE_EVENT_WRAPPER(s->inj);
+        if (s->shadow_done)
+          PUT_ZE_EVENT_WRAPPER(s->shadow_done);
         free(s->waits);
         free(s->preds);
         _latest_clear_if(s->attr, s);
@@ -920,9 +971,11 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
   pthread_mutex_lock(&_ze_shadow_cls_mutex);
   struct _ze_shadow_cl *sh = NULL, *sh_tmp = NULL;
   HASH_ITER(hh, _ze_shadow_cls, sh, sh_tmp) {
-    if (sh->key.context != hContext) continue;
+    if (sh->key.context != hContext)
+      continue;
     HASH_DEL(_ze_shadow_cls, sh);
-    if (sh->cl) ZE_COMMAND_LIST_DESTROY_PTR(sh->cl);
+    if (sh->cl)
+      ZE_COMMAND_LIST_DESTROY_PTR(sh->cl);
     pthread_mutex_destroy(&sh->mtx);
     free(sh);
   }
@@ -936,8 +989,10 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
     HASH_DEL(_ze_event_pools, pe);
     struct _ze_event_h *w, *w_tmp;
     DL_FOREACH_SAFE(pe->events, w, w_tmp) {
-      if (w->event)      ZE_EVENT_DESTROY_PTR(w->event);
-      if (w->event_pool) ZE_EVENT_POOL_DESTROY_PTR(w->event_pool);
+      if (w->event)
+        ZE_EVENT_DESTROY_PTR(w->event);
+      if (w->event_pool)
+        ZE_EVENT_POOL_DESTROY_PTR(w->event_pool);
       DL_DELETE(pe->events, w);
       PUT_ZE_EVENT_WRAPPER(w);
     }
@@ -963,7 +1018,8 @@ static void _on_sync_drain_queue(ze_command_queue_handle_t hQueue) {
 /* Drain the slot that most recently signaled `ev` (recursing on preds). */
 static void _on_sync_drain_event(ze_event_handle_t ev) {
   struct _ze_slot *s = _latest_get(ev);
-  if (!s || !s->owner) return;
+  if (!s || !s->owner)
+    return;
   pthread_mutex_lock(&s->owner->mtx);
   _slot_drain(s);
   /* The drained slot may have left siblings live; only clear
@@ -972,10 +1028,15 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
   struct _ze_slab_chunk *c;
   DL_FOREACH(s->owner->chunks, c) {
     for (uint32_t i = 0; i < c->n_used; ++i)
-      if (c->slots[i].live) { any_live = 1; break; }
-    if (any_live) break;
+      if (c->slots[i].live) {
+        any_live = 1;
+        break;
+      }
+    if (any_live)
+      break;
   }
-  if (!any_live) s->owner->in_flight_q = NULL;
+  if (!any_live)
+    s->owner->in_flight_q = NULL;
   pthread_mutex_unlock(&s->owner->mtx);
 }
 
@@ -1000,7 +1061,8 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
                                ze_command_list_handle_t command_list) {
   struct _ze_command_list_obj_data *cl_data = NULL;
   FIND_ZE_CL(&command_list, cl_data);
-  if (!cl_data) return;
+  if (!cl_data)
+    return;
   pthread_mutex_lock(&cl_data->mtx);
 
   if (cl_data->in_flight_q) {
@@ -1008,16 +1070,17 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
     _cl_drain(cl_data);
   }
   ze_context_handle_t ctx = _cl_cache_context(cl_data, command_list);
-  ze_device_handle_t  dev = _cl_cache_device(cl_data, command_list);
+  ze_device_handle_t dev = _cl_cache_device(cl_data, command_list);
   struct _ze_shadow_cl *sh = (ctx && dev) ? _get_shadow_cl(ctx, dev) : NULL;
   struct _ze_slab_chunk *c;
   DL_FOREACH(cl_data->chunks, c) {
     for (uint32_t j = 0; j < c->n_used; ++j) {
       struct _ze_slot *slot = &c->slots[j];
-      if (!sh || !slot->inj || !slot->shadow_done) continue;
-      if (_shadow_append_query(sh, slot->inj->event, c->slab,
-                               &slot->off, slot->shadow_done->event) != 0)
-        continue;  /* slot stays not-live this round; we miss this timing */
+      if (!sh || !slot->inj || !slot->shadow_done)
+        continue;
+      if (_shadow_append_query(sh, slot->inj->event, c->slab, &slot->off,
+                               slot->shadow_done->event) != 0)
+        continue; /* slot stays not-live this round; we miss this timing */
       _slot_instantiate(cl_data, slot);
     }
   }
@@ -1027,8 +1090,8 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
 }
 
 static void _on_execute_command_lists_epilogue(ze_command_queue_handle_t hQueue,
-                                                uint32_t numCommandLists,
-                                                ze_command_list_handle_t *phCommandLists) {
+                                               uint32_t numCommandLists,
+                                               ze_command_list_handle_t *phCommandLists) {
   for (uint32_t i = 0; i < numCommandLists; ++i)
     _on_execute_one_cl(hQueue, phCommandLists[i]);
 }

From e3d6b6328bcd9a0c2e5c924ed1085d97b3d1fef2 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Mon, 8 Jun 2026 20:08:10 +0000
Subject: [PATCH 27/54] ze: fix UAF in _cl_drain when draining the chunk's last
 slot

_slot_drain on the last slot of a chunk calls _slot_release, which decrements
n_held to 0 and, when the chunk is not the list tail, frees it. The inner for
loop in _cl_drain then re-evaluates c->n_used (and, if that test passes,
accesses &c->slots[i]) on freed memory.

Pin the chunk with an extra n_held bump during the inner loop. Drop the ref
after the loop and free the chunk in _cl_drain itself if it was the last
holder. The chunk-free condition (n_held == 0 && c != tail) is unchanged.

Found by ASAN running ooo_imm_Event_07 (cross-cl chunk-mutation stress).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 backends/ze/tracer_ze_helpers.include.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index e8dafeda..f05cc8ba 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -860,9 +860,19 @@ static void _slot_drain(struct _ze_slot *s) {
  * low-to-high — natural time order for emission). */
 static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
   struct _ze_slab_chunk *c, *tmp;
-  DL_FOREACH_SAFE(cl_data->chunks, c, tmp)
-  for (uint32_t i = 0; i < c->n_used; ++i)
-    _slot_drain(&c->slots[i]);
+  DL_FOREACH_SAFE(cl_data->chunks, c, tmp) {
+    /* Bump refcount during traversal so the last _slot_drain doesn't
+     * free c out from under the inner loop. Drop after, free here. */
+    ++c->n_held;
+    for (uint32_t i = 0; i < c->n_used; ++i)
+      _slot_drain(&c->slots[i]);
+    if (--c->n_held == 0 && c != cl_data->chunks->prev) {
+      DL_DELETE(cl_data->chunks, c);
+      if (c->slab)
+        ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
+      free(c);
+    }
+  }
   cl_data->in_flight_q = NULL;
 }
 

From a46879d5a2782d6247e40c5b95d7c621fac989a8 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Mon, 8 Jun 2026 21:37:06 +0000
Subject: [PATCH 28/54] ze: Reset shadow cl on idle to reclaim L0 driver
 per-QKT storage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The L0 driver records each zeCommandListAppendQueryKernelTimestamps in
cl-internal storage (~10 KB per call) and only releases it on
zeCommandListReset/Destroy — never lazily, even after the device has
completed the op. The shadow cl is per-(context, device) and lives for
the context's lifetime, so for an app that profiles many Appends the
driver's memory use grows without bound.

Track live_queries (under sh->mtx): bumped in _shadow_append_query,
decremented in _slot_drain after the shadow_done host-sync. When the
counter hits 0 inside _slot_drain, the shadow cl is idle from our
perspective and we Reset it to reclaim the driver state. Cross-cl case
is handled: with cls A and B each holding a Query in flight, A.sync()
decrements 2→1 (no Reset), B.sync() decrements 1→0 (Reset).

The slot stores a pointer to its shadow cl so drain knows which counter
to decrement; the same sh pointer is recorded on both Query-Append paths
(Append time for immediate cls, Execute prologue for regular cls).

Drive-by: split two existing `if (--x == 0 && ...)` sites in
_slot_release and _cl_drain into two statements for consistent style.

Validated with mem_const_steady (added in tests repo): heap delta drops
from 1.93 MB/iter to 32 B/iter over 100 iters of (create cl + 200
Appends + sync + destroy cl). ASAN sweep across 46 correctness tests:
clean (no UAF/OOB/leaks attributable to the tracer). Full correctness
matrix: 46/46 pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 backends/ze/tracer_ze_helpers.include.c | 28 +++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index f05cc8ba..ac90d471 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -78,7 +78,9 @@ struct _ze_slab_chunk;
 struct _ze_slot {
   struct _ze_command_list_obj_data *owner; /* cl_data this slot lives in */
   struct _ze_slab_chunk *chunk; /* chunk this slot lives in (==> .slab to read at drain) */
-  struct _ze_event_h *inj;      /* tracer-owned event the Query waits on */
+  struct _ze_shadow_cl
+      *sh; /* shadow cl this slot's Query was Appended to (NULL until instantiated) */
+  struct _ze_event_h *inj; /* tracer-owned event the Query waits on */
   struct _ze_event_h
       *shadow_done;       /* tracer-owned event the Query signals; drain host-syncs on this */
   ze_event_handle_t attr; /* user's original signal event (NULL => inj->event) */
@@ -253,6 +255,7 @@ struct _ze_shadow_cl {
   struct _ze_shadow_key key;
   ze_command_list_handle_t cl;
   pthread_mutex_t mtx;
+  uint32_t live_queries; /* QKTs appended but not yet host-synced; protected by mtx */
   UT_hash_handle hh;
 };
 static struct _ze_shadow_cl *_ze_shadow_cls = NULL;
@@ -335,10 +338,13 @@ static int _shadow_append_query(struct _ze_shadow_cl *sh,
                                 size_t *off,
                                 ze_event_handle_t shadow_done_event) {
   pthread_mutex_lock(&sh->mtx);
+  sh->live_queries++;
   ze_result_t r =
       ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(sh->cl, 1, &inj_event, slab, off,
                                                          /*hSignalEvent=*/shadow_done_event,
                                                          /*numWaitEvents=*/1, &inj_event);
+  if (r != ZE_RESULT_SUCCESS)
+    sh->live_queries--;
   pthread_mutex_unlock(&sh->mtx);
   return (r == ZE_RESULT_SUCCESS) ? 0 : -1;
 }
@@ -743,6 +749,7 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     if (!sh ||
         _shadow_append_query(sh, inj->event, s->chunk->slab, &s->off, shadow_done->event) != 0)
       goto fail_locked;
+    s->sh = sh;
     _slot_instantiate(cl_data, s);
   }
   pthread_mutex_unlock(&cl_data->mtx);
@@ -800,7 +807,8 @@ static void _slot_release(struct _ze_slot *s) {
   struct _ze_command_list_obj_data *cl = s->owner;
   if (!c)
     return;
-  if (--c->n_held == 0 && c != cl->chunks->prev) {
+  c->n_held--;
+  if (c->n_held == 0 && c != cl->chunks->prev) {
     DL_DELETE(cl->chunks, c);
     if (c->slab)
       ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
@@ -832,6 +840,16 @@ static void _slot_drain(struct _ze_slot *s) {
   if (s->shadow_done && s->shadow_done->event) {
     ZE_EVENT_HOST_SYNCHRONIZE_PTR(s->shadow_done->event, UINT64_MAX);
     ZE_EVENT_HOST_RESET_PTR(s->shadow_done->event);
+    /* QKT completed device-side. Drop the live ref; if nothing else on
+     * this shadow cl is in flight, Reset it: the L0 driver leaks ~10 KB
+     * per AppendQueryKernelTimestamps and only reclaims at Reset/Destroy. */
+    if (s->sh) {
+      pthread_mutex_lock(&s->sh->mtx);
+      s->sh->live_queries--;
+      if (s->sh->live_queries == 0)
+        ZE_COMMAND_LIST_RESET_PTR(s->sh->cl);
+      pthread_mutex_unlock(&s->sh->mtx);
+    }
   }
   ze_event_handle_t attr = s->attr ? s->attr : (s->inj ? s->inj->event : NULL);
   if (s->chunk && s->chunk->slab && attr &&
@@ -863,10 +881,11 @@ static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
   DL_FOREACH_SAFE(cl_data->chunks, c, tmp) {
     /* Bump refcount during traversal so the last _slot_drain doesn't
      * free c out from under the inner loop. Drop after, free here. */
-    ++c->n_held;
+    c->n_held++;
     for (uint32_t i = 0; i < c->n_used; ++i)
       _slot_drain(&c->slots[i]);
-    if (--c->n_held == 0 && c != cl_data->chunks->prev) {
+    c->n_held--;
+    if (c->n_held == 0 && c != cl_data->chunks->prev) {
       DL_DELETE(cl_data->chunks, c);
       if (c->slab)
         ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
@@ -1091,6 +1110,7 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
       if (_shadow_append_query(sh, slot->inj->event, c->slab, &slot->off,
                                slot->shadow_done->event) != 0)
         continue; /* slot stays not-live this round; we miss this timing */
+      slot->sh = sh;
       _slot_instantiate(cl_data, slot);
     }
   }

From 7227a970b17e3c2e85a83c8b6110d2b0b686a6ef Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Tue, 9 Jun 2026 20:59:21 +0000
Subject: [PATCH 29/54] ze: inline QKT on compute user cls to skip the shadow
 cl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the user's command list is on a COMPUTE queue group, the
AppendQueryKernelTimestamps now lives in the user cl body itself,
signaling user_signal directly. This collapses two Appends per
profiled op (Barrier + shadow QKT) into one, removes the per-Append
shadow_done fence event allocation and its drain-time host-sync, and
drops the per-Execute shadow re-Append loop for regular compute cls.

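Detection runs at zeCommandListCreate{,Immediate}: the epilogue looks
up the desc's queue-group ordinal (desc->commandQueueGroupOrdinal for
zeCommandListCreate, altdesc->ordinal for zeCommandListCreateImmediate)
in a generalized per-device queue-group flag cache. Both epilogues now
require a non-NULL desc before registering the cl.
cl_data->is_compute=0 (the shadow path) is the safe fallback for
copy-only cls and any case where the group flags can't be determined.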

Also: add _ZE_MUST() and apply it to every tracer-issued operational
L0 call (Barrier, QKT Append, EventHostSync/Reset, CommandListReset,
QueueSync, pool event reset). These are calls the tracer adds on the
user's behalf where failure means either a user hang (Barrier chain)
or a non-self-consistent trace. Defensive: print + abort so the bug
surfaces under sanitizers/CI rather than shipping bad data. As a
consequence _shadow_append_query() now returns void: a failed shadow
QKT Append aborts instead of silently dropping that slot's timing.
Driver query calls (Get*Handle, GetCommandQueueGroupProperties) keep
their graceful fallbacks — they can fail transiently during teardown.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 backends/ze/tracer_ze_helpers.include.c | 320 +++++++++++++++---------
 backends/ze/ze_model.rb                 |  14 +-
 2 files changed, 213 insertions(+), 121 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index ac90d471..1c6fdf64 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -47,6 +47,22 @@
   } while (0)
 #endif
 
+/* Wrap a tracer-issued L0 call whose failure means we'd either hang the
+ * user (sync chain Barrier) or produce a non-self-consistent trace
+ * (QKT, event create, ...). Defensive: print + abort so the bug surfaces
+ * under sanitizers/CI rather than ship bad data. NOT for driver query
+ * calls (Get*Handle, GetCommandQueueGroupProperties) — those can fail
+ * transiently during teardown and have graceful fallbacks. */
+#define _ZE_MUST(call)                                                                             \
+  do {                                                                                             \
+    ze_result_t _r = (call);                                                                       \
+    if (_r != ZE_RESULT_SUCCESS) {                                                                 \
+      fprintf(stderr, "THAPI: tracer-issued L0 call failed: %s = 0x%x at %s:%d\n", #call, _r,      \
+              __FILE__, __LINE__);                                                                 \
+      abort();                                                                                     \
+    }                                                                                              \
+  } while (0)
+
 static int _do_profile = 0;
 static int _do_chained_structs = 0;
 static int _do_paranoid_drift = 0;
@@ -140,6 +156,12 @@ struct _ze_command_list_obj_data {
   pthread_mutex_t mtx;
   unsigned char is_immediate;
   unsigned char is_in_order;
+  /* 1 if this cl's queue group exposes COMPUTE — its body can host
+   * AppendQueryKernelTimestamps directly, so we skip the per-(ctx,device)
+   * shadow cl and bake QKT into the user cl itself. See the placement
+   * diagram at the top of this file. 0 for copy-only cls and for any cl
+   * whose group flags we couldn't determine. Set at create; immutable. */
+  unsigned char is_compute;
 
   /* Cached on first use: device handle and context handle for this cl.
    * Both are immutable for the life of the cl, so caching avoids the
@@ -175,71 +197,98 @@ pthread_mutex_t _ze_cls_mutex = PTHREAD_MUTEX_INITIALIZER;
     pthread_mutex_unlock(&_ze_cls_mutex);                                                          \
   } while (0)
 
-/* Per-device cache of the first COMPUTE queue group ordinal. The lookup
- * is read-mostly: scan zeDeviceGetCommandQueueGroupProperties once,
- * remember the answer. valid=0 means "we already checked and there's no
- * compute group on this device" — treated as fatal at use sites. */
-struct _ze_compute_ord_entry {
+/* Per-device cache of the queue-group flag bitmap. The lookup is
+ * read-mostly: scan zeDeviceGetCommandQueueGroupProperties once,
+ * remember the per-ordinal flags. flags==NULL means "we already checked
+ * and the device returned no groups". Used by two readers:
+ *   _get_compute_ordinal(dev)        -> first COMPUTE ord, or -1
+ *   _ordinal_is_compute(dev, ord)    -> 1 if ord is COMPUTE on dev */
+struct _ze_qgroup_cache_entry {
   ze_device_handle_t device;
-  uint32_t ordinal;
-  unsigned char valid;
+  ze_command_queue_group_property_flags_t *flags; /* owned; n_groups entries */
+  uint32_t n_groups;
   UT_hash_handle hh;
 };
-static struct _ze_compute_ord_entry *_ze_compute_ords = NULL;
-static pthread_mutex_t _ze_compute_ords_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-/* Returns the first COMPUTE queue group ordinal for device, or (uint32_t)-1
- * if the device exposes no compute group (fatal — caller should bail). */
-static uint32_t _get_compute_ordinal(ze_device_handle_t device) {
-  pthread_mutex_lock(&_ze_compute_ords_mutex);
-  struct _ze_compute_ord_entry *e = NULL;
-  HASH_FIND_PTR(_ze_compute_ords, &device, e);
-  if (e) {
-    uint32_t r = e->valid ? e->ordinal : (uint32_t)-1;
-    pthread_mutex_unlock(&_ze_compute_ords_mutex);
-    return r;
-  }
-  pthread_mutex_unlock(&_ze_compute_ords_mutex);
+static struct _ze_qgroup_cache_entry *_ze_qgroup_cache = NULL;
+static pthread_mutex_t _ze_qgroup_cache_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Populate (or return cached) flag bitmap for device. The cache lives
+ * for process lifetime — once published, the entry pointer and its flags
+ * array are immutable, so callers can dereference them without the
+ * mutex. Returns NULL on driver error / OOM. */
+static struct _ze_qgroup_cache_entry *_qgroup_cache_get(ze_device_handle_t device) {
+  pthread_mutex_lock(&_ze_qgroup_cache_mutex);
+  struct _ze_qgroup_cache_entry *e = NULL;
+  HASH_FIND_PTR(_ze_qgroup_cache, &device, e);
+  pthread_mutex_unlock(&_ze_qgroup_cache_mutex);
+  if (e)
+    return e;
 
   /* Slow path: scan queue groups outside the lock. */
   uint32_t n_groups = 0;
   if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, NULL) !=
           ZE_RESULT_SUCCESS ||
       n_groups == 0)
-    return (uint32_t)-1;
+    return NULL;
   ze_command_queue_group_properties_t *groups =
       (ze_command_queue_group_properties_t *)calloc(n_groups, sizeof(*groups));
   if (!groups)
-    return (uint32_t)-1;
+    return NULL;
   for (uint32_t i = 0; i < n_groups; ++i)
     groups[i].stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES;
-  uint32_t found = (uint32_t)-1;
-  if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, groups) ==
+  if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, groups) !=
       ZE_RESULT_SUCCESS) {
-    for (uint32_t i = 0; i < n_groups; ++i)
-      if (groups[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
-        found = i;
-        break;
-      }
+    free(groups);
+    return NULL;
+  }
+  ze_command_queue_group_property_flags_t *flags =
+      (ze_command_queue_group_property_flags_t *)calloc(n_groups, sizeof(*flags));
+  if (!flags) {
+    free(groups);
+    return NULL;
   }
+  for (uint32_t i = 0; i < n_groups; ++i)
+    flags[i] = groups[i].flags;
   free(groups);
 
-  pthread_mutex_lock(&_ze_compute_ords_mutex);
-  /* Re-check under the lock — another thread may have populated. */
-  HASH_FIND_PTR(_ze_compute_ords, &device, e);
+  pthread_mutex_lock(&_ze_qgroup_cache_mutex);
+  HASH_FIND_PTR(_ze_qgroup_cache, &device, e);
   if (!e) {
-    e = (struct _ze_compute_ord_entry *)calloc(1, sizeof(*e));
+    e = (struct _ze_qgroup_cache_entry *)calloc(1, sizeof(*e));
     if (e) {
       e->device = device;
-      e->ordinal = found;
-      e->valid = (found != (uint32_t)-1) ? 1 : 0;
-      HASH_ADD_PTR(_ze_compute_ords, device, e);
+      e->flags = flags;
+      e->n_groups = n_groups;
+      HASH_ADD_PTR(_ze_qgroup_cache, device, e);
     }
-  } else {
-    found = e->valid ? e->ordinal : (uint32_t)-1;
   }
-  pthread_mutex_unlock(&_ze_compute_ords_mutex);
-  return found;
+  pthread_mutex_unlock(&_ze_qgroup_cache_mutex);
+  if (!e || e->flags != flags)
+    free(flags);
+  return e;
+}
+
+/* Returns the first COMPUTE queue group ordinal for device, or (uint32_t)-1
+ * if the device exposes no compute group (fatal — caller should bail). */
+static uint32_t _get_compute_ordinal(ze_device_handle_t device) {
+  struct _ze_qgroup_cache_entry *e = _qgroup_cache_get(device);
+  if (!e)
+    return (uint32_t)-1;
+  for (uint32_t i = 0; i < e->n_groups; ++i)
+    if (e->flags[i] & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)
+      return i;
+  return (uint32_t)-1;
+}
+
+/* 1 iff `ordinal` on `device` is a COMPUTE queue group. Returns 0 on any
+ * uncertainty (unknown device, OOB ordinal, driver error) — callers
+ * should treat the cl as non-compute and use the shadow-cl QKT path. */
+static int _ordinal_is_compute(ze_device_handle_t device, uint32_t ordinal) {
+  if (!device)
+    return 0;
+  struct _ze_qgroup_cache_entry *e = _qgroup_cache_get(device);
+  return e && ordinal < e->n_groups &&
+         (e->flags[ordinal] & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) ? 1 : 0;
 }
 
 /* Per-(context, device) tracer-owned immediate OOO compute cl used to
@@ -331,22 +380,19 @@ static struct _ze_shadow_cl *_get_shadow_cl(ze_context_handle_t context,
 /* Append AppendQueryKernelTimestamps on the shadow cl: wait on inj,
  * signal shadow_done, write timestamps into slab[*off]. Serialized on
  * sh->mtx because L0 doesn't allow concurrent Appends to one cl.
- * Returns 0 on success, -1 on failure. */
-static int _shadow_append_query(struct _ze_shadow_cl *sh,
-                                ze_event_handle_t inj_event,
-                                void *slab,
-                                size_t *off,
-                                ze_event_handle_t shadow_done_event) {
+ * Aborts on L0 failure (defensive — a missing Query would silently
+ * drop this kernel's timing). */
+static void _shadow_append_query(struct _ze_shadow_cl *sh,
+                                 ze_event_handle_t inj_event,
+                                 void *slab,
+                                 size_t *off,
+                                 ze_event_handle_t shadow_done_event) {
   pthread_mutex_lock(&sh->mtx);
   sh->live_queries++;
-  ze_result_t r =
-      ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(sh->cl, 1, &inj_event, slab, off,
-                                                         /*hSignalEvent=*/shadow_done_event,
-                                                         /*numWaitEvents=*/1, &inj_event);
-  if (r != ZE_RESULT_SUCCESS)
-    sh->live_queries--;
+  _ZE_MUST(ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(sh->cl, 1, &inj_event, slab, off,
+                                                              /*hSignalEvent=*/shadow_done_event,
+                                                              /*numWaitEvents=*/1, &inj_event));
   pthread_mutex_unlock(&sh->mtx);
-  return (r == ZE_RESULT_SUCCESS) ? 0 : -1;
 }
 
 /* Return cl_data->cached_{device,context}, fetching from L0 on first call.
@@ -372,8 +418,9 @@ static ze_context_handle_t _cl_cache_context(struct _ze_command_list_obj_data *c
   return c;
 }
 
-static inline void
-_on_create_command_list(ze_command_list_handle_t command_list, int immediate, int in_order) {
+static inline void _on_create_command_list(ze_command_list_handle_t command_list,
+                                           ze_device_handle_t device,
+                                           uint32_t ordinal, int immediate, int in_order) {
   struct _ze_command_list_obj_data *cl_data = NULL;
 
   FIND_ZE_CL(&command_list, cl_data);
@@ -390,6 +437,8 @@ _on_create_command_list(ze_command_list_handle_t command_list, int immediate, in
   cl_data->ptr = (void *)command_list;
   cl_data->is_immediate = immediate ? 1 : 0;
   cl_data->is_in_order = in_order ? 1 : 0;
+  cl_data->is_compute = _ordinal_is_compute(device, ordinal) ? 1 : 0;
+  cl_data->cached_device = device;
   pthread_mutex_init(&cl_data->mtx, NULL);
   ADD_ZE_CL(cl_data);
 }
@@ -503,7 +552,7 @@ static inline void _latest_clear_if(ze_event_handle_t ev, struct _ze_slot *s) {
       pool->context = val->context;                                                                \
       HASH_ADD_PTR(_ze_event_pools, context, pool);                                                \
     }                                                                                              \
-    ZE_EVENT_HOST_RESET_PTR(val->event);                                                           \
+    _ZE_MUST(ZE_EVENT_HOST_RESET_PTR(val->event));                                                 \
     DL_PREPEND(pool->events, val);                                                                 \
     pthread_mutex_unlock(&_ze_event_pools_mutex);                                                  \
   } while (0)
@@ -682,10 +731,23 @@ static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct
  * ORIGINAL value (possibly NULL). user_waits is the user's wait list
  * (NULL,0 if none).
  *
- * Inserts a Query waiting on inj, signaling user_signal. For immediate
- * cls, instantiates the slot inline (immediate Appends fire as soon as
- * appended). For regular cls, the slot is created but not instantiated
- * until Execute. */
+ * Two placements for the QKT — picked from cl_data->is_compute:
+ *
+ *  INLINE (user cl is compute):
+ *      Kernel(sig=inj) → QKT(wait=inj, sig=user_signal)        [on user cl]
+ *    One Append. QKT signals user_signal directly, so user-level sync
+ *    on user_signal (or queue/cl sync) implies QKT done — no fence
+ *    event, no host-sync at drain. For regular cls the QKT is baked
+ *    into the cl body at Append; Execute only re-instantiates slots.
+ *
+ *  SHADOW (user cl is copy-only):
+ *                       ┌─> Barrier(wait=inj, sig=user_signal) [on user cl]
+ *      Kernel(sig=inj) ─┤
+ *                       └─> QKT(wait=inj, sig=shadow_done)     [on shadow cl]
+ *    Two Appends. shadow_done is host-synced at drain because the
+ *    shadow cl's completion isn't implied by any user-level sync. For
+ *    regular cls the shadow QKT is (re-)Appended in the Execute
+ *    epilogue rather than here. */
 static void _universal_record_append(ze_command_list_handle_t command_list,
                                      struct _ze_event_h *inj,
                                      ze_event_handle_t user_signal,
@@ -696,59 +758,66 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   struct _ze_event_h *shadow_done = NULL;
   struct _ze_command_list_obj_data *cl_data = NULL;
   struct _ze_slot *s = NULL;
-
-  /* Chain user_signal off inj BEFORE anything else that can fail. The
-   * prologue already swapped user's hSignalEvent for inj->event, so
-   * nothing else on this cl signals user_signal — if we bail later
-   * (slot full, shadow_done alloc fails, etc.) the user's
-   * Sync(user_signal) would hang forever. AppendBarrier (not
-   * AppendSignalEvent) because we need to both wait on inj and signal
-   * user_signal. NULL user_signal is the "user passed NULL" case where
-   * no chaining is needed. */
-  if (user_signal) {
-    ze_event_handle_t wait_ev = inj->event;
-    if (ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &wait_ev) !=
-        ZE_RESULT_SUCCESS)
-      goto fail;
-  }
+  int inline_path = 0;
+  int barrier_chained = 0;
 
   ze_context_handle_t ctx = NULL;
   if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &ctx) != ZE_RESULT_SUCCESS || !ctx)
     goto fail;
   inj->context = ctx;
 
-  /* Tracer-owned fence event: Query signals it, drain host-waits on it
-   * before reading the slab. Decouples drain-time correctness from any
-   * user sync on user_signal — required because the Query lives on a
-   * separate shadow cl whose completion isn't implied by user-level sync. */
-  shadow_done = _get_profiling_event(command_list);
-  if (!shadow_done)
-    goto fail;
-  shadow_done->context = ctx;
-
   FIND_AND_DEL_ZE_CL(&command_list, cl_data);
   if (!cl_data)
     goto fail;
+  inline_path = cl_data->is_compute;
+
+  /* Shadow path needs a fence event (Query lives on the shadow cl;
+   * drain host-syncs on it). Inline path uses user_signal as the fence
+   * via the dep graph, no extra event needed. */
+  if (!inline_path) {
+    shadow_done = _get_profiling_event(command_list);
+    if (!shadow_done)
+      goto fail_with_cl;
+    shadow_done->context = ctx;
+  }
+
   pthread_mutex_lock(&cl_data->mtx);
 
   s = _cl_slot_append(cl_data, ctx, inj, shadow_done, user_signal, user_waits, user_n_waits);
   if (!s)
     goto fail_locked;
 
-  /* The Query Append now lives on the per-(context, device) shadow
-   * compute cl rather than the user cl. This is what lets us profile
-   * copy-only user cls — copy engines reject AppendQueryKernelTimestamps
-   * but the shadow cl is always compute. For regular user cls we defer
-   * the Append to Execute prologue (the user cl hasn't run yet, so
-   * nothing is signaling inj — Appending the Query on an immediate
-   * shadow cl now would let it fire too early on a stale inj). */
+  if (inline_path) {
+    /* Bake the QKT into the user cl. wait=inj, sig=user_signal.
+     * Holds for both immediate (fires when Appended) and regular cls
+     * (fires on every Execute — the QKT is now part of the cl body). */
+    cl_data->cached_context = ctx;
+    ze_event_handle_t wait_ev = inj->event;
+    _ZE_MUST(ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(command_list, 1, &wait_ev,
+                                                                s->chunk->slab, &s->off,
+                                                                user_signal, 1, &wait_ev));
+    barrier_chained = 1; /* user_signal chained via the QKT itself */
+    _slot_instantiate(cl_data, s);
+    pthread_mutex_unlock(&cl_data->mtx);
+    ADD_ZE_CL(cl_data);
+    return;
+  }
+
+  /* Shadow path: chain user_signal off inj on the user cl, then place
+   * the Query on the shadow cl (immediate cls only — regular cls defer
+   * to the Execute epilogue, see _on_execute_one_cl). */
+  if (user_signal) {
+    ze_event_handle_t wait_ev = inj->event;
+    _ZE_MUST(ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &wait_ev));
+    barrier_chained = 1;
+  }
   if (cl_data->is_immediate) {
     cl_data->cached_context = ctx;
     ze_device_handle_t dev = _cl_cache_device(cl_data, command_list);
     struct _ze_shadow_cl *sh = dev ? _get_shadow_cl(ctx, dev) : NULL;
-    if (!sh ||
-        _shadow_append_query(sh, inj->event, s->chunk->slab, &s->off, shadow_done->event) != 0)
+    if (!sh)
       goto fail_locked;
+    _shadow_append_query(sh, inj->event, s->chunk->slab, &s->off, shadow_done->event);
     s->sh = sh;
     _slot_instantiate(cl_data, s);
   }
@@ -776,8 +845,17 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     }
   }
   pthread_mutex_unlock(&cl_data->mtx);
+fail_with_cl:
   ADD_ZE_CL(cl_data);
 fail:
+  /* If we never chained user_signal off inj, do it now. The prologue
+   * swapped user's sig for inj->event; without this Append the user's
+   * Sync(user_signal) would hang forever. Aborts on failure — we have
+   * no second-chance recovery and a silent hang is worse than a crash. */
+  if (user_signal && !barrier_chained) {
+    ze_event_handle_t wait_ev = inj->event;
+    _ZE_MUST(ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &wait_ev));
+  }
   if (shadow_done)
     PUT_ZE_EVENT(shadow_done);
   PUT_ZE_EVENT(inj);
@@ -834,12 +912,15 @@ static void _slot_drain(struct _ze_slot *s) {
   for (uint32_t i = 0; i < s->n_preds; ++i)
     _slot_drain(s->preds[i]);
   s->live = 0;
-  /* Block until the Query op has fired, then reset shadow_done so the
-   * next Execute round (regular cls) starts with a clean event. The
-   * user's own sync doesn't cover the Query — it runs on the shadow cl. */
+  /* Shadow-path only: block until the Query has fired, then reset
+   * shadow_done so the next Execute round starts with a clean event.
+   * The user's own sync doesn't cover the Query because it runs on the
+   * shadow cl. Inline-path slots have shadow_done==NULL — their QKT
+   * lives in the user cl body and the dep-graph walk that brought us
+   * here already implies it has run. */
   if (s->shadow_done && s->shadow_done->event) {
-    ZE_EVENT_HOST_SYNCHRONIZE_PTR(s->shadow_done->event, UINT64_MAX);
-    ZE_EVENT_HOST_RESET_PTR(s->shadow_done->event);
+    _ZE_MUST(ZE_EVENT_HOST_SYNCHRONIZE_PTR(s->shadow_done->event, UINT64_MAX));
+    _ZE_MUST(ZE_EVENT_HOST_RESET_PTR(s->shadow_done->event));
     /* QKT completed device-side. Drop the live ref; if nothing else on
      * this shadow cl is in flight, Reset it: the L0 driver leaks ~10 KB
      * per AppendQueryKernelTimestamps and only reclaims at Reset/Destroy. */
@@ -847,7 +928,7 @@ static void _slot_drain(struct _ze_slot *s) {
       pthread_mutex_lock(&s->sh->mtx);
       s->sh->live_queries--;
       if (s->sh->live_queries == 0)
-        ZE_COMMAND_LIST_RESET_PTR(s->sh->cl);
+        _ZE_MUST(ZE_COMMAND_LIST_RESET_PTR(s->sh->cl));
       pthread_mutex_unlock(&s->sh->mtx);
     }
   }
@@ -1079,11 +1160,13 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
  *      force-sync that queue and drain the slab before we overwrite it
  *      (regression test: inorder_reg_Event_11 — same cl on two queues
  *      from two threads, expect both rounds' timings).
- *   2) Append a fresh Query on the per-(ctx,device) shadow cl for each
- *      slot. Must run AFTER L0 Execute (not before) — Appending on the
- *      shadow cl before the user cl is in flight deadlocks when the
- *      shadow shares the engine with the user cl (see
- *      tests/bugs/query_on_separate_cl_regular_user_cl).
+ *   2) (SHADOW PATH ONLY) Append a fresh Query on the per-(ctx,device)
+ *      shadow cl for each slot. Must run AFTER L0 Execute (not before) —
+ *      Appending on the shadow cl before the user cl is in flight
+ *      deadlocks when the shadow shares the engine with the user cl
+ *      (see tests/bugs/query_on_separate_cl_regular_user_cl). Inline
+ *      (compute) cls have the QKT baked into the cl body at Append; it
+ *      re-fires automatically on every Execute, no work here.
  *   3) Stamp in_flight_q = hQueue and instantiate each slot, publishing
  *      it to the dep graph + as the "owner" of this queue. */
 static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
@@ -1095,22 +1178,29 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
   pthread_mutex_lock(&cl_data->mtx);
 
   if (cl_data->in_flight_q) {
-    ZE_COMMAND_QUEUE_SYNCHRONIZE_PTR(cl_data->in_flight_q, UINT64_MAX);
+    _ZE_MUST(ZE_COMMAND_QUEUE_SYNCHRONIZE_PTR(cl_data->in_flight_q, UINT64_MAX));
     _cl_drain(cl_data);
   }
-  ze_context_handle_t ctx = _cl_cache_context(cl_data, command_list);
-  ze_device_handle_t dev = _cl_cache_device(cl_data, command_list);
-  struct _ze_shadow_cl *sh = (ctx && dev) ? _get_shadow_cl(ctx, dev) : NULL;
+  /* Shadow-path setup is skipped entirely for compute cls — those slots
+   * have shadow_done==NULL and rely on the inline QKT in the cl body. */
+  struct _ze_shadow_cl *sh = NULL;
+  if (!cl_data->is_compute) {
+    ze_context_handle_t ctx = _cl_cache_context(cl_data, command_list);
+    ze_device_handle_t dev = _cl_cache_device(cl_data, command_list);
+    sh = (ctx && dev) ? _get_shadow_cl(ctx, dev) : NULL;
+  }
   struct _ze_slab_chunk *c;
   DL_FOREACH(cl_data->chunks, c) {
     for (uint32_t j = 0; j < c->n_used; ++j) {
       struct _ze_slot *slot = &c->slots[j];
-      if (!sh || !slot->inj || !slot->shadow_done)
+      if (!slot->inj)
         continue;
-      if (_shadow_append_query(sh, slot->inj->event, c->slab, &slot->off,
-                               slot->shadow_done->event) != 0)
-        continue; /* slot stays not-live this round; we miss this timing */
-      slot->sh = sh;
+      if (!cl_data->is_compute) {
+        if (!sh || !slot->shadow_done)
+          continue;
+        _shadow_append_query(sh, slot->inj->event, c->slab, &slot->off, slot->shadow_done->event);
+        slot->sh = sh;
+      }
       _slot_instantiate(cl_data, slot);
     }
   }
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index d5eedf68..3b7268b7 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -139,18 +139,20 @@ def upper_snake_case(str)
 
 register_epilogue 'zeCommandListCreate', <<EOF
   if (_do_state()) {
-    if (_retval == ZE_RESULT_SUCCESS && phCommandList && *phCommandList) {
-      int _io = desc && (desc->flags & ZE_COMMAND_LIST_FLAG_IN_ORDER);
-      _on_create_command_list(*phCommandList, /*immediate=*/0, _io);
+    if (_retval == ZE_RESULT_SUCCESS && phCommandList && *phCommandList && desc) {
+      int _io = (desc->flags & ZE_COMMAND_LIST_FLAG_IN_ORDER) ? 1 : 0;
+      _on_create_command_list(*phCommandList, hDevice, desc->commandQueueGroupOrdinal,
+                              /*immediate=*/0, _io);
     }
   }
 EOF
 
 register_epilogue 'zeCommandListCreateImmediate', <<EOF
   if (_do_state()) {
-    if (_retval == ZE_RESULT_SUCCESS && phCommandList && *phCommandList) {
-      int _io = altdesc && (altdesc->flags & ZE_COMMAND_QUEUE_FLAG_IN_ORDER);
-      _on_create_command_list(*phCommandList, /*immediate=*/1, _io);
+    if (_retval == ZE_RESULT_SUCCESS && phCommandList && *phCommandList && altdesc) {
+      int _io = (altdesc->flags & ZE_COMMAND_QUEUE_FLAG_IN_ORDER) ? 1 : 0;
+      _on_create_command_list(*phCommandList, hDevice, altdesc->ordinal,
+                              /*immediate=*/1, _io);
     }
   }
 EOF

From 69da77c6733d97c695c88feba3f1eccaff20142c Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Tue, 9 Jun 2026 21:03:33 +0000
Subject: [PATCH 30/54] ze: document QKT placement (INLINE vs SHADOW), sweep
 stale comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Top-of-file algorithm sketch now names the two QKT placements and
shows their ASCII shape. The previous claim that "we use the shadow
cl uniformly for all engines so the code path is identical" is gone —
that was true before the inline path and misleading afterwards.

- Algorithm section: split "place a Query" off; spell out per-path
  behavior at Execute (shadow re-Appends; inline already in cl body)
  and at drain (shadow host-syncs shadow_done; inline does not).
- New "QKT placement" section with the two diagrams, anchored from
  cl_data->is_compute.
- struct _ze_slot: sh and shadow_done documented as shadow-path only.
- struct _ze_shadow_cl preamble: re-cast as the copy-only path.
- _universal_record_append: redundant function-level diagram replaced
  with a pointer to the top-of-file section.
- Slab-chunk regular-cl note: mention the QKT is also baked into the
  closed cl body on the inline path (additional reason chunks can't
  grow after Close).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 backends/ze/tracer_ze_helpers.include.c | 86 ++++++++++++++++---------
 1 file changed, 56 insertions(+), 30 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 1c6fdf64..836e4e7f 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -3,7 +3,7 @@
  *
  * On profiled Append (cl, sig=user_sig, waits=user_waits):
  *   - allocate inj from per-context pool; swap user_sig -> inj
- *   - insert Query(wait=inj, sig=user_sig, slab[off])
+ *   - place a Query (see "QKT placement" below)
  *   - allocate a slot {inj, attr=user_sig, off, waits=copy(user_waits)}
  *   - immediate cl: instantiate(slot) inline
  *
@@ -15,6 +15,8 @@
  * On Execute(q, cl) prologue:
  *   - lock cl.mtx
  *   - if cl.in_flight_q: Synchronize(in_flight_q); drain_cl(cl)
+ *   - shadow-path slots: re-Append Query on shadow cl
+ *     inline-path slots: nothing (Query is baked into cl body)
  *   - instantiate every slot in cl
  *   - cl.in_flight_q = q; unlock
  *
@@ -25,11 +27,46 @@
  *
  * drain(s):
  *   - for p in s.preds: drain(p)
+ *   - shadow-path: host-sync on shadow_done, reset, decrement live_queries
  *   - read slab[s.off], emit tracepoint(s.attr or inj)
  *   - clear latest[s.attr] (if it still points at s)
  *   - clear s.live and s.preds
  *   (Build-time fields inj, attr, off, waits stay so the next Execute
  *    can re-instantiate without re-Appending.)
+ *
+ * QKT placement
+ * =============
+ *
+ * AppendQueryKernelTimestamps (the device-side timestamp read) lives
+ * in one of two places, picked at cl create from the queue group's
+ * COMPUTE flag and stored in cl_data->is_compute. Both paths share the
+ * slot/drain/dep-graph machinery; they only differ in where the QKT is
+ * Appended and how the drain knows it has fired.
+ *
+ *   INLINE (user cl is on a COMPUTE queue group):
+ *
+ *     Kernel(sig=inj) ──> QKT(wait=inj, sig=user_signal)   [on user cl]
+ *
+ *     One Append. user_signal IS the QKT-done edge — any user-level
+ *     sync (event/queue/cl) that covers user_signal also covers the
+ *     QKT. No tracer fence event, no host-sync at drain. For regular
+ *     cls the QKT is baked into the cl body once and re-fires on every
+ *     Execute.
+ *
+ *   SHADOW (user cl is copy-only, or queue group unknown):
+ *
+ *                       ┌─> Barrier(wait=inj, sig=user_signal) [on user cl]
+ *     Kernel(sig=inj) ──┤
+ *                       └─> QKT(wait=inj, sig=shadow_done)     [on shadow cl]
+ *
+ *     Two Appends. The shadow cl is a per-(context, device) tracer-owned
+ *     immediate compute cl; QKT goes there because copy queue groups
+ *     reject AppendQueryKernelTimestamps. shadow_done is a tracer-owned
+ *     fence event that drain host-syncs on — required because the
+ *     shadow cl's completion isn't implied by any user-level sync. For
+ *     regular cls the shadow QKT is (re-)Appended in the Execute
+ *     epilogue (the user cl is in flight by then, so Appending the
+ *     Query won't deadlock on a shared engine).
  */
 
 #ifdef THAPI_DEBUG
@@ -94,11 +131,14 @@ struct _ze_slab_chunk;
 struct _ze_slot {
   struct _ze_command_list_obj_data *owner; /* cl_data this slot lives in */
   struct _ze_slab_chunk *chunk; /* chunk this slot lives in (==> .slab to read at drain) */
-  struct _ze_shadow_cl
-      *sh; /* shadow cl this slot's Query was Appended to (NULL until instantiated) */
+  /* Shadow path only: shadow cl the Query was Appended to. Inline-path
+   * slots leave this NULL — their Query lives in the user cl body and
+   * the dep-graph walk that triggers drain already implies it has run. */
+  struct _ze_shadow_cl *sh;
   struct _ze_event_h *inj; /* tracer-owned event the Query waits on */
-  struct _ze_event_h
-      *shadow_done;       /* tracer-owned event the Query signals; drain host-syncs on this */
+  /* Shadow path only: tracer-owned fence event the Query signals; drain
+   * host-syncs on it. Inline-path slots leave this NULL. */
+  struct _ze_event_h *shadow_done;
   ze_event_handle_t attr; /* user's original signal event (NULL => inj->event) */
   size_t off;             /* byte offset within chunk->slab */
   /* User wait events copied at Append time (stable across rebuilds);
@@ -122,9 +162,9 @@ struct _ze_slot {
 
 /* Slot + slab storage in fixed-size chunks; cl_data->chunks is a utlist
  * DL of these. Imm cls allocate new chunks as needed (no cap); regular
- * cls stop at one chunk (the inj events are baked into the closed cl
- * body, so adding a chunk after Close would create slots the body
- * doesn't address).
+ * cls stop at one chunk — the inj events (and on the inline path, the
+ * QKT itself) are baked into the closed cl body, so adding a chunk
+ * after Close would create slots the body doesn't address.
  *
  * Within a chunk, slots[i].off is i * sizeof(timestamp) into slab. The
  * chunk frees itself when n_held drops to 0 AND it is not the tail
@@ -291,11 +331,12 @@ static int _ordinal_is_compute(ze_device_handle_t device, uint32_t ordinal) {
          (e->flags[ordinal] & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) ? 1 : 0;
 }
 
-/* Per-(context, device) tracer-owned immediate OOO compute cl used to
- * host the AppendQueryKernelTimestamps op. The Query can't live on the
- * user's cl when that cl is on a copy-only queue group (driver aborts),
- * and we use the shadow cl uniformly for all engines so the code path
- * is identical regardless of user-cl kind. */
+/* Per-(context, device) tracer-owned immediate OOO compute cl used by
+ * the SHADOW path to host AppendQueryKernelTimestamps. Copy queue
+ * groups reject QKT, so the shadow cl exists to give those user cls
+ * somewhere compute-capable to put their Query. Compute user cls take
+ * the INLINE path and never touch a shadow cl — see the QKT placement
+ * diagram at the top of this file. */
 struct _ze_shadow_key {
   ze_context_handle_t context;
   ze_device_handle_t device;
@@ -731,23 +772,8 @@ static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct
  * ORIGINAL value (possibly NULL). user_waits is the user's wait list
  * (NULL,0 if none).
  *
- * Two placements for the QKT — picked from cl_data->is_compute:
- *
- *  INLINE (user cl is compute):
- *      Kernel(sig=inj) → QKT(wait=inj, sig=user_signal)        [on user cl]
- *    One Append. QKT signals user_signal directly, so user-level sync
- *    on user_signal (or queue/cl sync) implies QKT done — no fence
- *    event, no host-sync at drain. For regular cls the QKT is baked
- *    into the cl body at Append; Execute only re-instantiates slots.
- *
- *  SHADOW (user cl is copy-only):
- *                       ┌─> Barrier(wait=inj, sig=user_signal) [on user cl]
- *      Kernel(sig=inj) ─┤
- *                       └─> QKT(wait=inj, sig=shadow_done)     [on shadow cl]
- *    Two Appends. shadow_done is host-synced at drain because the
- *    shadow cl's completion isn't implied by any user-level sync. For
- *    regular cls the shadow QKT is (re-)Appended in the Execute
- *    epilogue rather than here. */
+ * Forks on cl_data->is_compute to pick the QKT placement (INLINE vs
+ * SHADOW) — see the "QKT placement" diagram at the top of this file. */
 static void _universal_record_append(ze_command_list_handle_t command_list,
                                      struct _ze_event_h *inj,
                                      ze_event_handle_t user_signal,

From c83b59767d45bd8ce7d62e03e515ff8def4e15f6 Mon Sep 17 00:00:00 2001
From: tapplencourt <tapplencourt@anl.gov>
Date: Tue, 9 Jun 2026 21:06:41 +0000
Subject: [PATCH 31/54] ze: include <alloca.h>; rename inorder_reg_Event_11 ->
 inorder_reg_Event_multithreaded_01

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 backends/ze/gen_ze.rb                   | 1 +
 backends/ze/tracer_ze_helpers.include.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/backends/ze/gen_ze.rb b/backends/ze/gen_ze.rb
index 12dc75df..9e662b95 100644
--- a/backends/ze/gen_ze.rb
+++ b/backends/ze/gen_ze.rb
@@ -8,6 +8,7 @@
     #include <dlfcn.h>
     #include <stdio.h>
     #include <stdlib.h>
+    #include <alloca.h>
     #include <unistd.h>
     #include <string.h>
     #include <pthread.h>
diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 836e4e7f..e7acde94 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -1184,7 +1184,7 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
  *
  *   1) If in_flight_q is set from a prior Execute by *another* thread,
  *      force-sync that queue and drain the slab before we overwrite it
- *      (regression test: inorder_reg_Event_11 — same cl on two queues
+ *      (regression test: inorder_reg_Event_multithreaded_01 — same cl on two queues
  *      from two threads, expect both rounds' timings).
  *   2) (SHADOW PATH ONLY) Append a fresh Query on the per-(ctx,device)
  *      shadow cl for each slot. Must run AFTER L0 Execute (not before) —

From 0cb0f1684ae55f9cb8e9f51002d4de3b7be87452 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0002.head.cm.sunspot.alcf.anl.gov>
Date: Tue, 9 Jun 2026 22:11:01 +0000
Subject: [PATCH 32/54] ze: skip Execute-time _slot_instantiate when slot is
 already live

The inline QKT path (compute user cls) instantiates each slot at
Append time so the QKT can be baked into the cl body. The Execute
hook then re-instantiated the same slot unconditionally, re-running
the in-order pred walk and picking up later-appended live siblings
as predecessors. Two siblings on an in-order cl ended up mutually
referencing each other in preds[], so _slot_drain's recursive
walk over preds[] never terminated and crashed the user's process
with a SIGSEGV from stack overflow.

Gate the Execute-time _slot_instantiate on !slot->live. On the first
Execute, inline slots arrive live=1 from Append and are skipped; on
later rounds they have been drained (live=0) and are re-instantiated
as before. Shadow-path slots always arrive live=0 (shadow-path
regular cls aren't instantiated at Append at all; second+ Execute
rounds see slots reset to live=0 by the preceding _cl_drain), so the
guard is a no-op for them.

Tests added to thapi_ze_test that fail before / pass after:
  inorder_reg_Event_02 (2 Appends share one user signal event)
  inorder_reg_Event_05 (Event_02 + HostReset between 2 Executes)
  inorder_reg_Event_10 (Event_02 + cl resubmit N=2)
  inorder_reg_Event_11 (3 cls / 3 queues, 2 Appends each)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 backends/ze/tracer_ze_helpers.include.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index e7acde94..0582f672 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -1227,7 +1227,13 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
         _shadow_append_query(sh, slot->inj->event, c->slab, &slot->off, slot->shadow_done->event);
         slot->sh = sh;
       }
-      _slot_instantiate(cl_data, slot);
+      /* Inline-path slots are already instantiated at Append time; re-running
+       * _slot_instantiate would re-walk in-order preds and pick up later-
+       * appended live slots, forming cycles that infinite-loop _slot_drain.
+       * On second+ Execute rounds the slot is live=0 (drained) and we DO need
+       * to re-instantiate so it republishes in the dep graph for this round. */
+      if (!slot->live)
+        _slot_instantiate(cl_data, slot);
     }
   }
   cl_data->in_flight_q = hQueue;

From c93fb406f9657a26fb4aedba1984b040cd4cc5a3 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0002.head.cm.sunspot.alcf.anl.gov>
Date: Tue, 9 Jun 2026 22:22:54 +0000
Subject: [PATCH 33/54] ze: assert in _slot_instantiate, skip live slots
 earlier in Execute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Defensive cleanup on top of the previous double-instantiate fix. Three
related changes, one commit:

1. Introduce _THAPI_LOG (always-on; THAPI(func:line) prefix; flushes
   stderr so the line lands before abort) and _THAPI_ASSERT (logs +
   aborts unconditionally — not gated on NDEBUG, since silently
   dropping the check would let the bug ship bad data). _ZE_MUST is
   rewritten in terms of _THAPI_ASSERT, and the existing THAPI_DBGLOG
   now routes through _THAPI_LOG when THAPI_DEBUG is set.

2. _slot_instantiate asserts !s->live at entry. Re-instantiating a
   live slot leaks its preds[] and lets the in-order pred walk pick
   up later-appended siblings as predecessors, which infinite-loops
   _slot_drain. The prior fix added a guard at the one known caller;
   the assert turns the rule into an invariant of the function so the
   next caller can't trip the same trap silently.

3. Drop the !slot->live guard around _slot_instantiate in
   _on_execute_one_cl in favor of a same-condition `continue` at the
   top of the per-slot body. Already-live slots have nothing left to
   do this Execute (dep-graph entry still valid; inline-path QKT is
   baked into the cl body and re-fires automatically), so they
   shouldn't be running the shadow_append_query branch either. Cleaner
   and removes the if/assert contradiction at the call site.

All 51 correctness tests + 1 bench test pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 backends/ze/tracer_ze_helpers.include.c | 55 ++++++++++++++++---------
 1 file changed, 36 insertions(+), 19 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 0582f672..80caa5de 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -69,21 +69,37 @@
  *     Query won't deadlock on a shared engine).
  */
 
-#ifdef THAPI_DEBUG
-#define TAHPI_LOG stderr
-/* GCC's `, ##__VA_ARGS__` extension swallows the leading comma when the
- * variadic list is empty, so the same macro covers both no-arg and
- * with-args calls. Already used in utils/tracepoint_gen.rb. */
-#define THAPI_DBGLOG(fmt, ...)                                                                     \
+/* Always-on tracer log. Prefixes THAPI(func:line) so messages are
+ * grep-able across the bench/test harness which often interleaves
+ * tracer and user output. GCC's `, ##__VA_ARGS__` extension swallows
+ * the leading comma when the variadic list is empty. fflush so the
+ * line lands even if we abort() right after. */
+#define _THAPI_LOG(fmt, ...)                                                                       \
   do {                                                                                             \
-    fprintf(TAHPI_LOG, "THAPI(%s:%d): " fmt "\n", __func__, __LINE__, ##__VA_ARGS__);              \
+    fprintf(stderr, "THAPI(%s:%d): " fmt "\n", __func__, __LINE__, ##__VA_ARGS__);                 \
+    fflush(stderr);                                                                                \
   } while (0)
+
+#ifdef THAPI_DEBUG
+#define THAPI_DBGLOG(fmt, ...) _THAPI_LOG(fmt, ##__VA_ARGS__)
 #else
 #define THAPI_DBGLOG(...)                                                                          \
   do {                                                                                             \
   } while (0)
 #endif
 
+/* Tracer invariant check: print + abort. Unconditional (not gated on
+ * NDEBUG) — silently dropping the check would let the bug ship bad
+ * data instead of crashing. Use for "this can never happen" preconditions
+ * inside the tracer, not for user-input validation. */
+#define _THAPI_ASSERT(cond, fmt, ...)                                                              \
+  do {                                                                                             \
+    if (!(cond)) {                                                                                 \
+      _THAPI_LOG("assertion failed: %s — " fmt, #cond, ##__VA_ARGS__);                             \
+      abort();                                                                                     \
+    }                                                                                              \
+  } while (0)
+
 /* Wrap a tracer-issued L0 call whose failure means we'd either hang the
  * user (sync chain Barrier) or produce a non-self-consistent trace
  * (QKT, event create, ...). Defensive: print + abort so the bug surfaces
@@ -93,11 +109,7 @@
 #define _ZE_MUST(call)                                                                             \
   do {                                                                                             \
     ze_result_t _r = (call);                                                                       \
-    if (_r != ZE_RESULT_SUCCESS) {                                                                 \
-      fprintf(stderr, "THAPI: tracer-issued L0 call failed: %s = 0x%x at %s:%d\n", #call, _r,      \
-              __FILE__, __LINE__);                                                                 \
-      abort();                                                                                     \
-    }                                                                                              \
+    _THAPI_ASSERT(_r == ZE_RESULT_SUCCESS, "%s = 0x%x", #call, _r);                                \
   } while (0)
 
 static int _do_profile = 0;
@@ -729,6 +741,11 @@ static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_dat
  * previous live slot on this cl if the cl is in-order. Marks s live and
  * publishes s as the new latest[attr]. */
 static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct _ze_slot *s) {
+  /* Slot must be inert: live=0, preds NULL. Re-instantiating a live slot
+   * would overwrite preds[] (leaking the prior pred refs) and let the
+   * in-order pred walk pick up later-appended live siblings as predecessors,
+   * forming cycles that infinite-loop _slot_drain. */
+  _THAPI_ASSERT(!s->live, "slot %p already live (double _slot_instantiate)", (void *)s);
   s->live = 1;
   uint32_t cap = s->n_waits + 1; /* +1 for in-order prev */
   s->preds = (struct _ze_slot **)calloc(cap, sizeof(struct _ze_slot *));
@@ -1221,19 +1238,19 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
       struct _ze_slot *slot = &c->slots[j];
       if (!slot->inj)
         continue;
+      /* Already-live slots have nothing left to do this Execute: their
+       * dep-graph entry from Append-time _slot_instantiate is still valid,
+       * and (inline path) their QKT is baked into the cl body and re-fires
+       * automatically. Only fresh / drained slots need work here. */
+      if (slot->live)
+        continue;
       if (!cl_data->is_compute) {
         if (!sh || !slot->shadow_done)
           continue;
         _shadow_append_query(sh, slot->inj->event, c->slab, &slot->off, slot->shadow_done->event);
         slot->sh = sh;
       }
-      /* Inline-path slots are already instantiated at Append time; re-running
-       * _slot_instantiate would re-walk in-order preds and pick up later-
-       * appended live slots, forming cycles that infinite-loop _slot_drain.
-       * On second+ Execute rounds the slot is live=0 (drained) and we DO need
-       * to re-instantiate so it republishes in the dep graph for this round. */
-      if (!slot->live)
-        _slot_instantiate(cl_data, slot);
+      _slot_instantiate(cl_data, slot);
     }
   }
   cl_data->in_flight_q = hQueue;

From ecde3c520fa023103c838fe82b45f49843ae0494 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0002.head.cm.sunspot.alcf.anl.gov>
Date: Tue, 9 Jun 2026 23:01:01 +0000
Subject: [PATCH 34/54] ze: collapse is_compute branches via _slot_publish;
 drop cached_device + dead macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_slot_publish(cl_data, s, sh) routes on s->shadow_done — the slot data
itself is now the source of truth for "shadow vs inline". The Execute-time
is_compute branch and the per-slot is_compute branch in _on_execute_one_cl
collapse to one data-driven publish. _universal_record_append's shadow-
immediate tail collapses to the same call. The inline-vs-shadow fork at
Append remains: chaining user_signal via the QKT itself avoids an extra
zeCommandListAppendBarrier per kernel.

cached_device + _cl_cache_device go away — the only call sites are on the
shadow path (immediate-copy Append, regular-copy Execute), both cold; the
extra ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR call is acceptable for one
fewer field and one fewer cache to reason about. cached_context stays:
load-bearing for _on_destroy_context's per-cl sweep.

Delete _ZE_ERROR_MSG / _ZE_ERROR_MSG_NOTERMINATE / _ERROR_MSG: zero
callers, and _ERROR_MSG was syntactically broken (orphan do/while).

All 52 correctness + bench tests pass.
---
 backends/ze/tracer_ze_helpers.include.c | 96 +++++++++++--------------
 1 file changed, 40 insertions(+), 56 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 80caa5de..3b4d0e7e 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -215,10 +215,9 @@ struct _ze_command_list_obj_data {
    * whose group flags we couldn't determine. Set at create; immutable. */
   unsigned char is_compute;
 
-  /* Cached on first use: device handle and context handle for this cl.
-   * Both are immutable for the life of the cl, so caching avoids the
-   * per-Append/per-Execute ZE_*_GET_*_HANDLE_PTR roundtrips. */
-  ze_device_handle_t cached_device;
+  /* Cached on first use: context handle for this cl. Immutable for the
+   * cl's lifetime. Load-bearing for _on_destroy_context's sweep: lets it
+   * associate cls back to their ctx without an L0 roundtrip per cl. */
   ze_context_handle_t cached_context;
 };
 
@@ -448,19 +447,10 @@ static void _shadow_append_query(struct _ze_shadow_cl *sh,
   pthread_mutex_unlock(&sh->mtx);
 }
 
-/* Return cl_data->cached_{device,context}, fetching from L0 on first call.
- * Both fields are immutable for the cl's lifetime, so caching avoids the
- * roundtrip on every Append/Execute. Returns NULL on L0 error. */
-static ze_device_handle_t _cl_cache_device(struct _ze_command_list_obj_data *cl_data,
-                                           ze_command_list_handle_t command_list) {
-  if (cl_data->cached_device)
-    return cl_data->cached_device;
-  ze_device_handle_t d = NULL;
-  if (ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(command_list, &d) == ZE_RESULT_SUCCESS)
-    cl_data->cached_device = d;
-  return d;
-}
-
+/* Returns the cl's context, fetching from L0 on first call and caching
+ * for the cl's lifetime. The cache is load-bearing for
+ * _on_destroy_context, which scans cls by context. Returns NULL on L0
+ * error. */
 static ze_context_handle_t _cl_cache_context(struct _ze_command_list_obj_data *cl_data,
                                              ze_command_list_handle_t command_list) {
   if (cl_data->cached_context)
@@ -491,7 +481,6 @@ static inline void _on_create_command_list(ze_command_list_handle_t command_list
   cl_data->is_immediate = immediate ? 1 : 0;
   cl_data->is_in_order = in_order ? 1 : 0;
   cl_data->is_compute = _ordinal_is_compute(device, ordinal) ? 1 : 0;
-  cl_data->cached_device = device;
   pthread_mutex_init(&cl_data->mtx, NULL);
   ADD_ZE_CL(cl_data);
 }
@@ -784,6 +773,22 @@ static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct
     _latest_set(s->attr, s);
 }
 
+/* Publish a fresh slot: shadow path appends a Query on the per-(ctx,device)
+ * shadow cl; inline path is a no-op here (its QKT is baked into the user cl
+ * body at Append). Then instantiate in the dep graph. `s->shadow_done` is
+ * the single source of truth for "shadow vs inline" — no is_compute branch
+ * at the call site. Caller holds cl_data->mtx. */
+static void _slot_publish(struct _ze_command_list_obj_data *cl_data,
+                          struct _ze_slot *s,
+                          struct _ze_shadow_cl *sh) {
+  if (s->shadow_done) {
+    _THAPI_ASSERT(sh, "shadow-path slot needs a shadow cl");
+    _shadow_append_query(sh, s->inj->event, s->chunk->slab, &s->off, s->shadow_done->event);
+    s->sh = sh;
+  }
+  _slot_instantiate(cl_data, s);
+}
+
 /* Append-time hook called from profiling_epilogue. Caller already
  * swapped user's hSignalEvent for inj->event. user_signal is the
  * ORIGINAL value (possibly NULL). user_waits is the user's wait list
@@ -829,12 +834,12 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   s = _cl_slot_append(cl_data, ctx, inj, shadow_done, user_signal, user_waits, user_n_waits);
   if (!s)
     goto fail_locked;
+  cl_data->cached_context = ctx;
 
   if (inline_path) {
     /* Bake the QKT into the user cl. wait=inj, sig=user_signal.
      * Holds for both immediate (fires when Appended) and regular cls
      * (fires on every Execute — the QKT is now part of the cl body). */
-    cl_data->cached_context = ctx;
     ze_event_handle_t wait_ev = inj->event;
     _ZE_MUST(ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(command_list, 1, &wait_ev,
                                                                 s->chunk->slab, &s->off,
@@ -855,14 +860,12 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     barrier_chained = 1;
   }
   if (cl_data->is_immediate) {
-    cl_data->cached_context = ctx;
-    ze_device_handle_t dev = _cl_cache_device(cl_data, command_list);
-    struct _ze_shadow_cl *sh = dev ? _get_shadow_cl(ctx, dev) : NULL;
+    ze_device_handle_t dev = NULL;
+    _ZE_MUST(ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(command_list, &dev));
+    struct _ze_shadow_cl *sh = _get_shadow_cl(ctx, dev);
     if (!sh)
       goto fail_locked;
-    _shadow_append_query(sh, inj->event, s->chunk->slab, &s->off, shadow_done->event);
-    s->sh = sh;
-    _slot_instantiate(cl_data, s);
+    _slot_publish(cl_data, s, sh);
   }
   pthread_mutex_unlock(&cl_data->mtx);
   ADD_ZE_CL(cl_data);
@@ -1224,14 +1227,10 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
     _ZE_MUST(ZE_COMMAND_QUEUE_SYNCHRONIZE_PTR(cl_data->in_flight_q, UINT64_MAX));
     _cl_drain(cl_data);
   }
-  /* Shadow-path setup is skipped entirely for compute cls — those slots
-   * have shadow_done==NULL and rely on the inline QKT in the cl body. */
+  /* Shadow cl is resolved lazily on first shadow-path slot. Inline-only cls
+   * never trigger the lookup. */
   struct _ze_shadow_cl *sh = NULL;
-  if (!cl_data->is_compute) {
-    ze_context_handle_t ctx = _cl_cache_context(cl_data, command_list);
-    ze_device_handle_t dev = _cl_cache_device(cl_data, command_list);
-    sh = (ctx && dev) ? _get_shadow_cl(ctx, dev) : NULL;
-  }
+  int sh_resolved = 0;
   struct _ze_slab_chunk *c;
   DL_FOREACH(cl_data->chunks, c) {
     for (uint32_t j = 0; j < c->n_used; ++j) {
@@ -1244,13 +1243,16 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
        * automatically. Only fresh / drained slots need work here. */
       if (slot->live)
         continue;
-      if (!cl_data->is_compute) {
-        if (!sh || !slot->shadow_done)
-          continue;
-        _shadow_append_query(sh, slot->inj->event, c->slab, &slot->off, slot->shadow_done->event);
-        slot->sh = sh;
+      if (slot->shadow_done && !sh_resolved) {
+        ze_context_handle_t ctx = _cl_cache_context(cl_data, command_list);
+        ze_device_handle_t dev = NULL;
+        _ZE_MUST(ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(command_list, &dev));
+        sh = ctx ? _get_shadow_cl(ctx, dev) : NULL;
+        sh_resolved = 1;
       }
-      _slot_instantiate(cl_data, slot);
+      if (slot->shadow_done && !sh)
+        continue;
+      _slot_publish(cl_data, slot, sh);
     }
   }
   cl_data->in_flight_q = hQueue;
@@ -1414,24 +1416,6 @@ static inline void _dump_memory_info(ze_command_list_handle_t hCommandList, cons
     _dump_memory_info_ctx(hContext, ptr);
 }
 
-////////////////////////////////////////////
-#define _ZE_ERROR_MSG(NAME, RES)                                                                   \
-  do {                                                                                             \
-    fprintf(stderr, "%s() failed at %d(%s): res=%x\n", (NAME), __LINE__, __FILE__, (RES));         \
-  } while (0)
-#define _ZE_ERROR_MSG_NOTERMINATE(NAME, RES)                                                       \
-  do {                                                                                             \
-    fprintf(stderr, "%s() error at %d(%s): res=%x\n", (NAME), __LINE__, __FILE__, (RES));          \
-  } while (0)
-#define _ERROR_MSG(MSG)                                                                            \
-  {                                                                                                \
-    perror((MSG)) do {                                                                             \
-      {                                                                                            \
-        perror((MSG));                                                                             \
-        fprintf(stderr, "errno=%d at %d(%s)", errno, __LINE__, __FILE__);                          \
-      }                                                                                            \
-      while (0)
-
 static void _load_tracer(void) {
   char *s = NULL;
   void *handle = NULL;

From b6519929f37209f6dec8cde886ef8f5786d024a9 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0002.head.cm.sunspot.alcf.anl.gov>
Date: Tue, 9 Jun 2026 23:31:19 +0000
Subject: [PATCH 35/54] ze: rename _ze_latest -> _ze_event_latest_signaled

The map is "event -> the most recent slot that signaled it". The old
"latest" name didn't say what it was the latest OF. The new name does.

Mechanical rename across the global, struct, mutex, three helpers, and
the algorithm/struct/comment shorthand. No semantic change. All 53 tests
pass.
---
 backends/ze/tracer_ze_helpers.include.c | 88 ++++++++++++-------------
 1 file changed, 44 insertions(+), 44 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 3b4d0e7e..2c0bc261 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -8,9 +8,9 @@
  *   - immediate cl: instantiate(slot) inline
  *
  * instantiate(s):
- *   - s.preds = [latest[w] for w in s.waits if live]
+ *   - s.preds = [event_latest_signaled[w] for w in s.waits if live]
  *                + previous live slot in same cl (if cl is in-order)
- *   - s.live = true; latest[s.attr] = &s
+ *   - s.live = true; event_latest_signaled[s.attr] = &s
  *
  * On Execute(q, cl) prologue:
  *   - lock cl.mtx
@@ -21,7 +21,7 @@
  *   - cl.in_flight_q = q; unlock
  *
  * On Sync (the synced anchor tells us what to drain):
- *   - Sync(ev):  drain(latest[ev])
+ *   - Sync(ev):  drain(event_latest_signaled[ev])
  *   - Sync(q):   drain_cl(cl) for every cl whose in_flight_q == q
  *   - Sync(cl):  drain_cl(cl)
  *
@@ -29,7 +29,7 @@
  *   - for p in s.preds: drain(p)
  *   - shadow-path: host-sync on shadow_done, reset, decrement live_queries
  *   - read slab[s.off], emit tracepoint(s.attr or inj)
- *   - clear latest[s.attr] (if it still points at s)
+ *   - clear event_latest_signaled[s.attr] (if it still points at s)
  *   - clear s.live and s.preds
  *   (Build-time fields inj, attr, off, waits stay so the next Execute
  *    can re-instantiate without re-Appending.)
@@ -155,7 +155,7 @@ struct _ze_slot {
   size_t off;             /* byte offset within chunk->slab */
   /* User wait events copied at Append time (stable across rebuilds);
    * preds[] is computed at instantiate from waits[] by looking up
-   * latest[w] for each w. */
+   * event_latest_signaled[w] for each w. */
   ze_event_handle_t *waits;
   uint32_t n_waits;
   struct _ze_slot **preds; /* points at slots whose drain must come first (may be in another cl) */
@@ -504,60 +504,60 @@ struct _ze_event_pool_entry {
 struct _ze_event_pool_entry *_ze_event_pools = NULL;
 static pthread_mutex_t _ze_event_pools_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-/* latest[ev] -> the most recent slot whose attr==ev. Used to resolve
- * happens-before edges: when a new Append says "wait on ev", we record
- * the latest slot for ev as a pred. Updated at instantiate and cleared
- * at drain. */
-struct _ze_latest_entry {
+/* event_latest_signaled[ev] -> the most recent slot whose attr==ev.
+ * Used to resolve happens-before edges: when a new Append says "wait on
+ * ev", we record the latest slot for ev as a pred. Updated at
+ * instantiate and cleared at drain. */
+struct _ze_event_latest_signaled_entry {
   ze_event_handle_t ev; /* key */
   struct _ze_slot *slot;
   UT_hash_handle hh;
 };
-static struct _ze_latest_entry *_ze_latest = NULL;
-static pthread_mutex_t _ze_latest_mutex = PTHREAD_MUTEX_INITIALIZER;
+static struct _ze_event_latest_signaled_entry *_ze_event_latest_signaled = NULL;
+static pthread_mutex_t _ze_event_latest_signaled_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-static inline struct _ze_slot *_latest_get(ze_event_handle_t ev) {
-  struct _ze_latest_entry *e = NULL;
-  pthread_mutex_lock(&_ze_latest_mutex);
-  HASH_FIND_PTR(_ze_latest, &ev, e);
+static inline struct _ze_slot *_event_latest_signaled_get(ze_event_handle_t ev) {
+  struct _ze_event_latest_signaled_entry *e = NULL;
+  pthread_mutex_lock(&_ze_event_latest_signaled_mutex);
+  HASH_FIND_PTR(_ze_event_latest_signaled, &ev, e);
   struct _ze_slot *s = e ? e->slot : NULL;
-  pthread_mutex_unlock(&_ze_latest_mutex);
+  pthread_mutex_unlock(&_ze_event_latest_signaled_mutex);
   return s;
 }
 
-static inline void _latest_set(ze_event_handle_t ev, struct _ze_slot *s) {
+static inline void _event_latest_signaled_set(ze_event_handle_t ev, struct _ze_slot *s) {
   if (!ev)
     return;
-  pthread_mutex_lock(&_ze_latest_mutex);
-  struct _ze_latest_entry *e = NULL;
-  HASH_FIND_PTR(_ze_latest, &ev, e);
+  pthread_mutex_lock(&_ze_event_latest_signaled_mutex);
+  struct _ze_event_latest_signaled_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_latest_signaled, &ev, e);
   if (!e) {
-    e = (struct _ze_latest_entry *)calloc(1, sizeof(*e));
+    e = (struct _ze_event_latest_signaled_entry *)calloc(1, sizeof(*e));
     if (!e) {
-      pthread_mutex_unlock(&_ze_latest_mutex);
+      pthread_mutex_unlock(&_ze_event_latest_signaled_mutex);
       return;
     }
     e->ev = ev;
-    HASH_ADD_PTR(_ze_latest, ev, e);
+    HASH_ADD_PTR(_ze_event_latest_signaled, ev, e);
   }
   e->slot = s;
-  pthread_mutex_unlock(&_ze_latest_mutex);
+  pthread_mutex_unlock(&_ze_event_latest_signaled_mutex);
 }
 
-/* Remove latest[ev] only if it still points at slot s (the slot is
- * being drained — but if a newer Append already overwrote latest[ev],
- * don't clobber that). */
-static inline void _latest_clear_if(ze_event_handle_t ev, struct _ze_slot *s) {
+/* Remove event_latest_signaled[ev] only if it still points at slot s
+ * (the slot is being drained — but if a newer Append already overwrote
+ * the entry, don't clobber that). */
+static inline void _event_latest_signaled_clear_if(ze_event_handle_t ev, struct _ze_slot *s) {
   if (!ev)
     return;
-  pthread_mutex_lock(&_ze_latest_mutex);
-  struct _ze_latest_entry *e = NULL;
-  HASH_FIND_PTR(_ze_latest, &ev, e);
+  pthread_mutex_lock(&_ze_event_latest_signaled_mutex);
+  struct _ze_event_latest_signaled_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_latest_signaled, &ev, e);
   if (e && e->slot == s) {
-    HASH_DEL(_ze_latest, e);
+    HASH_DEL(_ze_event_latest_signaled, e);
     free(e);
   }
-  pthread_mutex_unlock(&_ze_latest_mutex);
+  pthread_mutex_unlock(&_ze_event_latest_signaled_mutex);
 }
 
 #define GET_ZE_EVENT(key, val)                                                                     \
@@ -726,9 +726,9 @@ static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_dat
   return s;
 }
 
-/* Compute s->preds from s->waits via the global latest[] map, plus the
- * previous live slot on this cl if the cl is in-order. Marks s live and
- * publishes s as the new latest[attr]. */
+/* Compute s->preds from s->waits via the global event_latest_signaled
+ * map, plus the previous live slot on this cl if the cl is in-order.
+ * Marks s live and publishes s as the new event_latest_signaled[attr]. */
 static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct _ze_slot *s) {
   /* Slot must be inert: live=0, preds NULL. Re-instantiating a live slot
    * would overwrite preds[] (leaking the prior pred refs) and let the
@@ -740,7 +740,7 @@ static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct
   s->preds = (struct _ze_slot **)calloc(cap, sizeof(struct _ze_slot *));
   s->n_preds = 0;
   for (uint32_t i = 0; i < s->n_waits; ++i) {
-    struct _ze_slot *p = _latest_get(s->waits[i]);
+    struct _ze_slot *p = _event_latest_signaled_get(s->waits[i]);
     if (p && p->live)
       s->preds[s->n_preds++] = p;
   }
@@ -770,7 +770,7 @@ static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct
   for (uint32_t i = 0; i < s->n_preds; ++i)
     __atomic_fetch_add(&s->preds[i]->refs, 1, __ATOMIC_RELAXED);
   if (s->attr)
-    _latest_set(s->attr, s);
+    _event_latest_signaled_set(s->attr, s);
 }
 
 /* Publish a fresh slot: shadow path appends a Query on the per-(ctx,device)
@@ -949,7 +949,7 @@ static void _slot_release(struct _ze_slot *s) {
  * can't use the caller's slab.
  *
  * No cycle guard: preds come from in-order prev (strictly earlier slot
- * in the same cl, DAG) and from latest[wait_event] (a slot published
+ * in the same cl, DAG) and from event_latest_signaled[wait_event] (a slot published
  * BEFORE us). Forming a cycle would require user-declared mutual waits,
  * which L0 itself deadlocks on. */
 static void _slot_drain(struct _ze_slot *s) {
@@ -987,7 +987,7 @@ static void _slot_drain(struct _ze_slot *s) {
                   ZE_RESULT_SUCCESS, r.global.kernelStart, r.global.kernelEnd,
                   r.context.kernelStart, r.context.kernelEnd);
   }
-  _latest_clear_if(s->attr, s);
+  _event_latest_signaled_clear_if(s->attr, s);
   /* Drop refs on preds; release any that hit 0 and are already drained. */
   for (uint32_t i = 0; i < s->n_preds; ++i) {
     struct _ze_slot *p = s->preds[i];
@@ -1060,7 +1060,7 @@ static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
         PUT_ZE_EVENT(s->shadow_done);
       free(s->waits);
       free(s->preds);
-      _latest_clear_if(s->attr, s);
+      _event_latest_signaled_clear_if(s->attr, s);
     }
     DL_DELETE(cl_data->chunks, c);
     if (c->slab)
@@ -1109,7 +1109,7 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
           PUT_ZE_EVENT_WRAPPER(s->shadow_done);
         free(s->waits);
         free(s->preds);
-        _latest_clear_if(s->attr, s);
+        _event_latest_signaled_clear_if(s->attr, s);
       }
       DL_DELETE(cl_data->chunks, c);
       /* Skip zeMemFree on the slab — the ctx is being destroyed; the
@@ -1173,7 +1173,7 @@ static void _on_sync_drain_queue(ze_command_queue_handle_t hQueue) {
 
 /* Drain the slot that most recently signaled `ev` (recursing on preds). */
 static void _on_sync_drain_event(ze_event_handle_t ev) {
-  struct _ze_slot *s = _latest_get(ev);
+  struct _ze_slot *s = _event_latest_signaled_get(ev);
   if (!s || !s->owner)
     return;
   pthread_mutex_lock(&s->owner->mtx);

From 584fd18f221cea7e5790ed6a6b44294c5811654c Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0001.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 10 Jun 2026 19:21:19 +0000
Subject: [PATCH 36/54] ze: drop FIND_AND_DEL+ADD on _ze_cls_mutex in
 _universal_record_append

cl_data was being yanked out of the global hash at the top of every
profiled Append and reinserted at every exit, taking _ze_cls_mutex
twice per Append: once for the DEL, then once for the ADD at whichever
exit the call took (either return path, or the cleanup ADD on
fail_with_cl). Replace with a single FIND_ZE_CL: cl_data stays in the
hash, and the global mutex is touched once per Append instead of twice.

Safe because L0 forbids racing Append against Destroy on the same cl
handle (both carry the not-thread-safe-per-cl-handle restriction), so
cl_data cannot be torn out by a concurrent _on_destroy_command_list.
cl_data->mtx still serializes us against another Append/Execute on
this same cl.

This was the single biggest contention point in the N-threads-N-CLs
workload (inorder_imm_Event_multithreaded_01 and friends).
---
 backends/ze/tracer_ze_helpers.include.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 2c0bc261..650dd731 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -814,7 +814,17 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     goto fail;
   inj->context = ctx;
 
-  FIND_AND_DEL_ZE_CL(&command_list, cl_data);
+  /* Plain FIND, no DEL: cl_data stays in the global hash while we work.
+   * The L0 spec forbids the user from racing Append against Destroy on
+   * the same cl handle (cl handle is a not-thread-safe restriction for
+   * both zeCommandListAppend* and zeCommandListDestroy), so cl_data
+   * cannot be torn out from under us by a concurrent _on_destroy_*.
+   * cl_data->mtx still serializes our work against another Append /
+   * Execute on this same cl. Per-Append cost on _ze_cls_mutex drops
+   * from three acquires (DEL + ADD + the cleanup ADD on the fail path)
+   * to one — which is the single biggest contention point in the
+   * many-threads-many-CLs case. */
+  FIND_ZE_CL(&command_list, cl_data);
   if (!cl_data)
     goto fail;
   inline_path = cl_data->is_compute;
@@ -825,7 +835,7 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   if (!inline_path) {
     shadow_done = _get_profiling_event(command_list);
     if (!shadow_done)
-      goto fail_with_cl;
+      goto fail;
     shadow_done->context = ctx;
   }
 
@@ -847,7 +857,6 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     barrier_chained = 1; /* user_signal chained via the QKT itself */
     _slot_instantiate(cl_data, s);
     pthread_mutex_unlock(&cl_data->mtx);
-    ADD_ZE_CL(cl_data);
     return;
   }
 
@@ -868,7 +877,6 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     _slot_publish(cl_data, s, sh);
   }
   pthread_mutex_unlock(&cl_data->mtx);
-  ADD_ZE_CL(cl_data);
   return;
 
 fail_locked:
@@ -891,8 +899,6 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     }
   }
   pthread_mutex_unlock(&cl_data->mtx);
-fail_with_cl:
-  ADD_ZE_CL(cl_data);
 fail:
   /* If we never chained user_signal off inj, do it now. The prologue
    * swapped user's sig for inj->event; without this Append the user's

From faa6a30e0ce08c217d52c07ff00991966dff660e Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0001.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 10 Jun 2026 19:26:33 +0000
Subject: [PATCH 37/54] ze: fetch context once per profiled Append; thread it
 through prologue->epilogue

Previously each profiled Append issued zeCommandListGetContextHandle three
times: once inside the generated prologue's _get_profiling_event, once at
the top of _universal_record_append, and once at the first Execute via
_cl_cache_context. Same handle every time.

Fetch it once in the generated prologue, pass it to _get_profiling_event,
also pass it to _universal_record_append. _universal_record_append now
publishes cached_context up front so the first _on_execute_one_cl hits
the cache too. Net: one driver call per Append instead of three.

The cached_context write is unlocked. Safe: every writer stores the same
value (the cl's true context). Of the two readers, _cl_cache_context
falls back to the L0 fetch when the field is still unset, and
_on_destroy_context's per-cl sweep is gated on a user contract forbidding
concurrent Append + DestroyContext on the same context.

All 51 correctness tests pass.
---
 backends/ze/tracer_ze_helpers.include.c | 39 +++++++++++++------------
 backends/ze/ze_model.rb                 | 15 +++++++---
 2 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 650dd731..c2d97beb 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -622,33 +622,27 @@ static pthread_mutex_t _ze_event_wrappers_mutex = PTHREAD_MUTEX_INITIALIZER;
     pthread_mutex_unlock(&_ze_event_wrappers_mutex);                                               \
   } while (0)
 
-static struct _ze_event_h *_get_profiling_event(ze_command_list_handle_t command_list) {
+/* Caller-supplied ctx: every call site already has it on the stack, so
+ * we'd otherwise issue an unnecessary zeCommandListGetContextHandle per
+ * profiled Append (caller did the same query moments earlier). */
+static struct _ze_event_h *_get_profiling_event(ze_context_handle_t context) {
   struct _ze_event_h *e_w;
-
-  ze_context_handle_t context = NULL;
-  ze_result_t res = ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &context);
-  if (res != ZE_RESULT_SUCCESS || !context) {
-    THAPI_DBGLOG("zeCommandListGetContextHandle failed with %d, for command list: %p", res,
-                 command_list);
-    return NULL;
-  }
   GET_ZE_EVENT(&context, e_w);
   if (e_w)
     return e_w;
 
   GET_ZE_EVENT_WRAPPER(e_w);
   if (!e_w) {
-    THAPI_DBGLOG("Could not create a new event wrapper for command list: %p", command_list);
+    THAPI_DBGLOG("Could not create a new event wrapper for context: %p", context);
     return NULL;
   }
 
   ze_event_pool_desc_t desc = {
       ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, NULL,
       ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP | ZE_EVENT_POOL_FLAG_HOST_VISIBLE, 1};
-  res = ZE_EVENT_POOL_CREATE_PTR(context, &desc, 0, NULL, &e_w->event_pool);
+  ze_result_t res = ZE_EVENT_POOL_CREATE_PTR(context, &desc, 0, NULL, &e_w->event_pool);
   if (res != ZE_RESULT_SUCCESS) {
-    THAPI_DBGLOG("zeEventPoolCreate failed with %d, for command list: %p, context: %p", res,
-                 command_list, context);
+    THAPI_DBGLOG("zeEventPoolCreate failed with %d, for context: %p", res, context);
     goto cleanup_wrapper;
   }
   ze_event_desc_t e_desc = {ZE_STRUCTURE_TYPE_EVENT_DESC, NULL, 0, ZE_EVENT_SCOPE_FLAG_HOST,
@@ -796,12 +790,16 @@ static void _slot_publish(struct _ze_command_list_obj_data *cl_data,
  *
  * Forks on cl_data->is_compute to pick the QKT placement (INLINE vs
  * SHADOW) — see the "QKT placement" diagram at the top of this file. */
+/* ctx is fetched once in the prologue (profiling_prologue in ze_model.rb) and
+ * threaded in here so _universal_record_append doesn't reissue the same
+ * zeCommandListGetContextHandle the prologue already made. */
 static void _universal_record_append(ze_command_list_handle_t command_list,
+                                     ze_context_handle_t ctx,
                                      struct _ze_event_h *inj,
                                      ze_event_handle_t user_signal,
                                      ze_event_handle_t *user_waits,
                                      uint32_t user_n_waits) {
-  if (!inj)
+  if (!inj || !ctx)
     return;
   struct _ze_event_h *shadow_done = NULL;
   struct _ze_command_list_obj_data *cl_data = NULL;
@@ -809,9 +807,6 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   int inline_path = 0;
   int barrier_chained = 0;
 
-  ze_context_handle_t ctx = NULL;
-  if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &ctx) != ZE_RESULT_SUCCESS || !ctx)
-    goto fail;
   inj->context = ctx;
 
   /* Plain FIND, no DEL: cl_data stays in the global hash while we work.
@@ -828,12 +823,19 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   if (!cl_data)
     goto fail;
   inline_path = cl_data->is_compute;
+  /* Publish the cl->ctx mapping up front so the first _on_execute_one_cl
+   * on this cl skips its zeCommandListGetContextHandle round-trip via the
+   * _cl_cache_context fast path. Race-safe: every writer stores the same
+   * value (the cl's true context), and _on_destroy_context's reader of
+   * this field is gated on a user contract that forbids concurrent
+   * Append + DestroyContext on the same context. */
+  cl_data->cached_context = ctx;
 
   /* Shadow path needs a fence event (Query lives on the shadow cl;
    * drain host-syncs on it). Inline path uses user_signal as the fence
    * via the dep graph, no extra event needed. */
   if (!inline_path) {
-    shadow_done = _get_profiling_event(command_list);
+    shadow_done = _get_profiling_event(ctx);
     if (!shadow_done)
       goto fail;
     shadow_done->context = ctx;
@@ -844,7 +846,6 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   s = _cl_slot_append(cl_data, ctx, inj, shadow_done, user_signal, user_waits, user_n_waits);
   if (!s)
     goto fail_locked;
-  cl_data->cached_context = ctx;
 
   if (inline_path) {
     /* Bake the QKT into the user cl. wait=inj, sig=user_signal.
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index 3b7268b7..89382233 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -297,10 +297,17 @@ def upper_snake_case(str)
   <<EOF
   ze_event_handle_t _user_signal = #{event_name};
   struct _ze_event_h * _ewrapper = NULL;
+  /* Fetched once per profiled Append and threaded to both
+   * _get_profiling_event (prologue) and _universal_record_append (epilogue)
+   * so the tracer issues exactly one zeCommandListGetContextHandle per
+   * Append instead of three. */
+  ze_context_handle_t _ctx = NULL;
   if (_do_profile) {
-    _ewrapper = _get_profiling_event(hCommandList);
-    if (_ewrapper)
-      #{event_name} = _ewrapper->event;
+    if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(hCommandList, &_ctx) == ZE_RESULT_SUCCESS && _ctx) {
+      _ewrapper = _get_profiling_event(_ctx);
+      if (_ewrapper)
+        #{event_name} = _ewrapper->event;
+    }
     /* If injection failed, fall through with the user's signal unchanged;
      * we won't be able to time this Append, but it still runs. */
   }
@@ -312,7 +319,7 @@ def upper_snake_case(str)
   if (_do_profile && _ewrapper) {
     if (_retval == ZE_RESULT_SUCCESS) {
       ze_event_handle_t _attr = _user_signal ? _user_signal : _ewrapper->event;
-      _universal_record_append(hCommandList, _ewrapper, _user_signal,
+      _universal_record_append(hCommandList, _ctx, _ewrapper, _user_signal,
                                #{waits_expr}, #{n_waits_expr});
       tracepoint(lttng_ust_ze_profiling, event_profiling, _attr);
     } else {

From 1f5a1b8ebf159d58adeadf7379d64622d9f7f5c2 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0001.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 10 Jun 2026 19:34:20 +0000
Subject: [PATCH 38/54] ze: inline cached_context read in _on_execute_one_cl;
 drop _cl_cache_context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since cached_context is now published at the top of _universal_record_append
(before any slot exists), the L0-fetch fallback inside _cl_cache_context is
provably dead — the helper's only call site, in _on_execute_one_cl, is gated
on slot->shadow_done, which can only be set by a prior shadow-path Append
that already populated cached_context.

Replace the call with a direct field read and delete the helper. One
fewer indirection, one fewer "what if the cache is empty here?" branch
to think about.

All 51 correctness tests pass.
---
 backends/ze/tracer_ze_helpers.include.c | 33 +++++++++----------------
 1 file changed, 12 insertions(+), 21 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index c2d97beb..429a39ed 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -447,20 +447,6 @@ static void _shadow_append_query(struct _ze_shadow_cl *sh,
   pthread_mutex_unlock(&sh->mtx);
 }
 
-/* Returns the cl's context, fetching from L0 on first call and caching
- * for the cl's lifetime. The cache is load-bearing for
- * _on_destroy_context, which scans cls by context. Returns NULL on L0
- * error. */
-static ze_context_handle_t _cl_cache_context(struct _ze_command_list_obj_data *cl_data,
-                                             ze_command_list_handle_t command_list) {
-  if (cl_data->cached_context)
-    return cl_data->cached_context;
-  ze_context_handle_t c = NULL;
-  if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &c) == ZE_RESULT_SUCCESS)
-    cl_data->cached_context = c;
-  return c;
-}
-
 static inline void _on_create_command_list(ze_command_list_handle_t command_list,
                                            ze_device_handle_t device,
                                            uint32_t ordinal, int immediate, int in_order) {
@@ -823,12 +809,14 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   if (!cl_data)
     goto fail;
   inline_path = cl_data->is_compute;
-  /* Publish the cl->ctx mapping up front so the first _on_execute_one_cl
-   * on this cl skips its zeCommandListGetContextHandle round-trip via the
-   * _cl_cache_context fast path. Race-safe: every writer stores the same
-   * value (the cl's true context), and _on_destroy_context's reader of
-   * this field is gated on a user contract that forbids concurrent
-   * Append + DestroyContext on the same context. */
+  /* Publish the cl->ctx mapping up front. _on_execute_one_cl reads it
+   * directly (no fallback fetch) when resolving the shadow cl, and
+   * _on_destroy_context's per-cl sweep matches against it. Doing this
+   * before any slot is appended guarantees both readers find a populated
+   * value. Race-safe unlocked: every writer stores the same value (the
+   * cl's true context), and _on_destroy_context's reader is gated on a
+   * user contract that forbids concurrent Append + DestroyContext on
+   * the same context. */
   cl_data->cached_context = ctx;
 
   /* Shadow path needs a fence event (Query lives on the shadow cl;
@@ -1251,7 +1239,10 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
       if (slot->live)
         continue;
       if (slot->shadow_done && !sh_resolved) {
-        ze_context_handle_t ctx = _cl_cache_context(cl_data, command_list);
+        /* cached_context was published by _universal_record_append before any
+         * shadow_done slot could exist, so it's always set here — no need
+         * for an L0 round-trip to recover it. */
+        ze_context_handle_t ctx = cl_data->cached_context;
         ze_device_handle_t dev = NULL;
         _ZE_MUST(ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(command_list, &dev));
         sh = ctx ? _get_shadow_cl(ctx, dev) : NULL;

From 618297a30f6e36a2e7b5cdce146b40dee31d1725 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0001.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 10 Jun 2026 19:49:41 +0000
Subject: [PATCH 39/54] ze: PUT_ZE_EVENT macro -> _put_ze_event function; reset
 + alloc outside pools mutex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two changes, both targeting work that was happening with global mutexes held:

1. PUT_ZE_EVENT was a 20+ line macro spanning hash lookup, allocation,
   error-path L0 destroys, HostReset, and prepend. Convert to a function
   (debugger can step in, callers no longer pay the macro-expansion
   readability tax). Rename all 7 call sites (helpers + ze_model.rb
   generator); no shim macro left behind.

2. Move two pieces of work out from under the locks they shouldn't be
   under:
   - _put_ze_event: HostReset is thread-safe — issue it before taking
     _ze_event_pools_mutex instead of serializing an L0 round-trip
     behind a global lock. Pre-allocate the bucket entry outside the
     lock too, freeing the unused copy if we lose the publish race
     (same pattern _qgroup_cache_get already uses).
   - _event_latest_signaled_set: calloc outside the lock for the same
     reason; same lose-the-race-free-our-copy handling.

All 51 correctness tests pass.
---
 backends/ze/tracer_ze_helpers.include.c | 91 ++++++++++++++++---------
 backends/ze/ze_model.rb                 |  2 +-
 2 files changed, 59 insertions(+), 34 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 429a39ed..88551edb 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -514,20 +514,29 @@ static inline struct _ze_slot *_event_latest_signaled_get(ze_event_handle_t ev)
 static inline void _event_latest_signaled_set(ze_event_handle_t ev, struct _ze_slot *s) {
   if (!ev)
     return;
+  /* Allocate the new entry outside the lock; same pattern as _qgroup_cache_get.
+   * If we lose the race to publish it, free our unused copy. Saves a heap
+   * call worth of contention on _ze_event_latest_signaled_mutex per Append
+   * that touches an event the tracer hasn't seen before. */
+  struct _ze_event_latest_signaled_entry *pre =
+      (struct _ze_event_latest_signaled_entry *)calloc(1, sizeof(*pre));
+
   pthread_mutex_lock(&_ze_event_latest_signaled_mutex);
   struct _ze_event_latest_signaled_entry *e = NULL;
   HASH_FIND_PTR(_ze_event_latest_signaled, &ev, e);
   if (!e) {
-    e = (struct _ze_event_latest_signaled_entry *)calloc(1, sizeof(*e));
-    if (!e) {
+    if (!pre) {
       pthread_mutex_unlock(&_ze_event_latest_signaled_mutex);
       return;
     }
+    e = pre;
+    pre = NULL;
     e->ev = ev;
     HASH_ADD_PTR(_ze_event_latest_signaled, ev, e);
   }
   e->slot = s;
   pthread_mutex_unlock(&_ze_event_latest_signaled_mutex);
+  free(pre); /* harmless if NULL; non-NULL only when we lost the entry race */
 }
 
 /* Remove event_latest_signaled[ev] only if it still points at slot s
@@ -559,31 +568,47 @@ static inline void _event_latest_signaled_clear_if(ze_event_handle_t ev, struct
     pthread_mutex_unlock(&_ze_event_pools_mutex);                                                  \
   } while (0)
 
-#define PUT_ZE_EVENT(val)                                                                          \
-  do {                                                                                             \
-    struct _ze_event_pool_entry *pool = NULL;                                                      \
-    pthread_mutex_lock(&_ze_event_pools_mutex);                                                    \
-    HASH_FIND_PTR(_ze_event_pools, &(val->context), pool);                                         \
-    if (!pool) {                                                                                   \
-      pool = (struct _ze_event_pool_entry *)calloc(1, sizeof(struct _ze_event_pool_entry));        \
-      if (!pool) {                                                                                 \
-        THAPI_DBGLOG("Failed to allocate memory");                                                 \
-        pthread_mutex_unlock(&_ze_event_pools_mutex);                                              \
-        if (val->event_pool) {                                                                     \
-          if (val->event)                                                                          \
-            ZE_EVENT_DESTROY_PTR(val->event);                                                      \
-          ZE_EVENT_POOL_DESTROY_PTR(val->event_pool);                                              \
-        }                                                                                          \
-        free(val);                                                                                 \
-        break;                                                                                     \
-      }                                                                                            \
-      pool->context = val->context;                                                                \
-      HASH_ADD_PTR(_ze_event_pools, context, pool);                                                \
-    }                                                                                              \
-    _ZE_MUST(ZE_EVENT_HOST_RESET_PTR(val->event));                                                 \
-    DL_PREPEND(pool->events, val);                                                                 \
-    pthread_mutex_unlock(&_ze_event_pools_mutex);                                                  \
-  } while (0)
+/* Return an event wrapper to its per-context freelist. Reset is issued
+ * BEFORE we take the lock — zeEventHostReset is thread-safe and resetting
+ * with the lock held would serialize an L0 driver round-trip behind the
+ * global pools mutex. Pre-allocate the new bucket entry outside the lock
+ * for the same reason; if we lose the race to create the bucket, free our
+ * unused copy.
+ *
+ * On total failure (couldn't allocate bucket AND there isn't one), there's
+ * nowhere to park the wrapper, so destroy its backing L0 objects and free
+ * the wrapper itself — we'd rather leak nothing than poison the freelist. */
+static void _put_ze_event(struct _ze_event_h *val) {
+  _ZE_MUST(ZE_EVENT_HOST_RESET_PTR(val->event));
+
+  struct _ze_event_pool_entry *pre =
+      (struct _ze_event_pool_entry *)calloc(1, sizeof(*pre));
+
+  pthread_mutex_lock(&_ze_event_pools_mutex);
+  struct _ze_event_pool_entry *pool = NULL;
+  HASH_FIND_PTR(_ze_event_pools, &val->context, pool);
+  if (!pool) {
+    if (!pre) {
+      pthread_mutex_unlock(&_ze_event_pools_mutex);
+      THAPI_DBGLOG("Failed to allocate memory");
+      if (val->event_pool) {
+        if (val->event)
+          ZE_EVENT_DESTROY_PTR(val->event);
+        ZE_EVENT_POOL_DESTROY_PTR(val->event_pool);
+      }
+      free(val);
+      return;
+    }
+    pool = pre;
+    pre = NULL;
+    pool->context = val->context;
+    HASH_ADD_PTR(_ze_event_pools, context, pool);
+  }
+  DL_PREPEND(pool->events, val);
+  pthread_mutex_unlock(&_ze_event_pools_mutex);
+  free(pre); /* harmless if NULL; non-NULL only when we lost the bucket race */
+}
+
 
 struct _ze_event_h *_ze_event_wrappers = NULL;
 static pthread_mutex_t _ze_event_wrappers_mutex = PTHREAD_MUTEX_INITIALIZER;
@@ -898,8 +923,8 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     _ZE_MUST(ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &wait_ev));
   }
   if (shadow_done)
-    PUT_ZE_EVENT(shadow_done);
-  PUT_ZE_EVENT(inj);
+    _put_ze_event(shadow_done);
+  _put_ze_event(inj);
 }
 
 /* Reclaim a slot: PUT events back to the per-context pool, free waits,
@@ -910,11 +935,11 @@ static void _slot_release(struct _ze_slot *s) {
   if (!s || !s->owner || !s->owner->is_immediate)
     return;
   if (s->inj) {
-    PUT_ZE_EVENT(s->inj);
+    _put_ze_event(s->inj);
     s->inj = NULL;
   }
   if (s->shadow_done) {
-    PUT_ZE_EVENT(s->shadow_done);
+    _put_ze_event(s->shadow_done);
     s->shadow_done = NULL;
   }
   free(s->waits);
@@ -1050,9 +1075,9 @@ static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
     for (uint32_t i = 0; i < c->n_used; ++i) {
       struct _ze_slot *s = &c->slots[i];
       if (s->inj)
-        PUT_ZE_EVENT(s->inj);
+        _put_ze_event(s->inj);
       if (s->shadow_done)
-        PUT_ZE_EVENT(s->shadow_done);
+        _put_ze_event(s->shadow_done);
       free(s->waits);
       free(s->preds);
       _event_latest_signaled_clear_if(s->attr, s);
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index 89382233..dd53212d 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -323,7 +323,7 @@ def upper_snake_case(str)
                                #{waits_expr}, #{n_waits_expr});
       tracepoint(lttng_ust_ze_profiling, event_profiling, _attr);
     } else {
-      PUT_ZE_EVENT(_ewrapper);
+      _put_ze_event(_ewrapper);
     }
   }
 EOF

From a08a931803562c7c6252968406cfcabbf94b9050 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0001.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 10 Jun 2026 19:59:22 +0000
Subject: [PATCH 40/54] ze: convert remaining macros to functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

FIND_ZE_CL / ADD_ZE_CL / FIND_AND_DEL_ZE_CL  -> _cl_find / _cl_add / _cl_find_and_del
GET_ZE_EVENT                                  -> _get_ze_event
GET_ZE_EVENT_WRAPPER / PUT_ZE_EVENT_WRAPPER  -> _get_ze_event_wrapper / _put_ze_event_wrapper

All six were thin lock-wrapped hash/list ops in macro form, with any
result threaded through an `out` parameter. Convert to ordinary
functions returning the value; debugger steps in, callers don't pay the
macro-expansion readability tax, and the lock-held region is easier to
audit when it's a function body instead of a macro you have to mentally
expand at each call site. No shims left behind — every call site updated.

While here: _get_ze_event_wrapper now allocates the fresh wrapper
outside the wrappers mutex: it pops from the recycle list under the
lock and only calls calloc after unlocking, when the list was empty.
Same goal as the [pre-alloc | lock | publish | free-our-copy-on-race]
pattern already established by _put_ze_event and
_event_latest_signaled_set: the freelist mutex never wraps a heap call.

All 51 correctness tests pass.
---
 backends/ze/tracer_ze_helpers.include.c | 151 +++++++++++++-----------
 1 file changed, 79 insertions(+), 72 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 88551edb..cd97d239 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -224,29 +224,29 @@ struct _ze_command_list_obj_data {
 struct _ze_command_list_obj_data *_ze_cls = NULL;
 pthread_mutex_t _ze_cls_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-#define FIND_ZE_CL(key, val)                                                                       \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_cls_mutex);                                                            \
-    HASH_FIND_PTR(_ze_cls, key, val);                                                              \
-    pthread_mutex_unlock(&_ze_cls_mutex);                                                          \
-  } while (0)
+static struct _ze_command_list_obj_data *_cl_find(ze_command_list_handle_t command_list) {
+  struct _ze_command_list_obj_data *cl = NULL;
+  pthread_mutex_lock(&_ze_cls_mutex);
+  HASH_FIND_PTR(_ze_cls, &command_list, cl);
+  pthread_mutex_unlock(&_ze_cls_mutex);
+  return cl;
+}
 
-#define ADD_ZE_CL(val)                                                                             \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_cls_mutex);                                                            \
-    HASH_ADD_PTR(_ze_cls, ptr, val);                                                               \
-    pthread_mutex_unlock(&_ze_cls_mutex);                                                          \
-  } while (0)
+static void _cl_add(struct _ze_command_list_obj_data *cl) {
+  pthread_mutex_lock(&_ze_cls_mutex);
+  HASH_ADD_PTR(_ze_cls, ptr, cl);
+  pthread_mutex_unlock(&_ze_cls_mutex);
+}
 
-#define FIND_AND_DEL_ZE_CL(key, val)                                                               \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_cls_mutex);                                                            \
-    HASH_FIND_PTR(_ze_cls, key, val);                                                              \
-    if (val) {                                                                                     \
-      HASH_DEL(_ze_cls, val);                                                                      \
-    }                                                                                              \
-    pthread_mutex_unlock(&_ze_cls_mutex);                                                          \
-  } while (0)
+static struct _ze_command_list_obj_data *_cl_find_and_del(ze_command_list_handle_t command_list) {
+  struct _ze_command_list_obj_data *cl = NULL;
+  pthread_mutex_lock(&_ze_cls_mutex);
+  HASH_FIND_PTR(_ze_cls, &command_list, cl);
+  if (cl)
+    HASH_DEL(_ze_cls, cl);
+  pthread_mutex_unlock(&_ze_cls_mutex);
+  return cl;
+}
 
 /* Per-device cache of the queue-group flag bitmap. The lookup is
  * read-mostly: scan zeDeviceGetCommandQueueGroupProperties once,
@@ -450,15 +450,13 @@ static void _shadow_append_query(struct _ze_shadow_cl *sh,
 static inline void _on_create_command_list(ze_command_list_handle_t command_list,
                                            ze_device_handle_t device,
                                            uint32_t ordinal, int immediate, int in_order) {
-  struct _ze_command_list_obj_data *cl_data = NULL;
-
-  FIND_ZE_CL(&command_list, cl_data);
-  if (cl_data) {
+  if (_cl_find(command_list)) {
     THAPI_DBGLOG("Command list already registered: %p", command_list);
     return;
   }
 
-  cl_data = (struct _ze_command_list_obj_data *)calloc(1, sizeof(*cl_data));
+  struct _ze_command_list_obj_data *cl_data =
+      (struct _ze_command_list_obj_data *)calloc(1, sizeof(*cl_data));
   if (!cl_data) {
     THAPI_DBGLOG("Failed to allocate memory");
     return;
@@ -468,7 +466,7 @@ static inline void _on_create_command_list(ze_command_list_handle_t command_list
   cl_data->is_in_order = in_order ? 1 : 0;
   cl_data->is_compute = _ordinal_is_compute(device, ordinal) ? 1 : 0;
   pthread_mutex_init(&cl_data->mtx, NULL);
-  ADD_ZE_CL(cl_data);
+  _cl_add(cl_data);
 }
 
 /* Wrapper around an injected event we own. Lives either in the per-context
@@ -555,18 +553,21 @@ static inline void _event_latest_signaled_clear_if(ze_event_handle_t ev, struct
   pthread_mutex_unlock(&_ze_event_latest_signaled_mutex);
 }
 
-#define GET_ZE_EVENT(key, val)                                                                     \
-  do {                                                                                             \
-    struct _ze_event_pool_entry *pool = NULL;                                                      \
-    pthread_mutex_lock(&_ze_event_pools_mutex);                                                    \
-    HASH_FIND_PTR(_ze_event_pools, key, pool);                                                     \
-    if (pool && pool->events) {                                                                    \
-      val = pool->events;                                                                          \
-      DL_DELETE(pool->events, val);                                                                \
-    } else                                                                                         \
-      val = NULL;                                                                                  \
-    pthread_mutex_unlock(&_ze_event_pools_mutex);                                                  \
-  } while (0)
+/* Pop one recycled event wrapper from the per-context freelist; NULL if
+ * none cached. Caller (today only _get_profiling_event) will fall back
+ * to creating a fresh L0 event pool + event. */
+static struct _ze_event_h *_get_ze_event(ze_context_handle_t context) {
+  struct _ze_event_h *e = NULL;
+  pthread_mutex_lock(&_ze_event_pools_mutex);
+  struct _ze_event_pool_entry *pool = NULL;
+  HASH_FIND_PTR(_ze_event_pools, &context, pool);
+  if (pool && pool->events) {
+    e = pool->events;
+    DL_DELETE(pool->events, e);
+  }
+  pthread_mutex_unlock(&_ze_event_pools_mutex);
+  return e;
+}
 
 /* Return an event wrapper to its per-context freelist. Reset is issued
  * BEFORE we take the lock — zeEventHostReset is thread-safe and resetting
@@ -613,36 +614,45 @@ static void _put_ze_event(struct _ze_event_h *val) {
 struct _ze_event_h *_ze_event_wrappers = NULL;
 static pthread_mutex_t _ze_event_wrappers_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-#define GET_ZE_EVENT_WRAPPER(val)                                                                  \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_event_wrappers_mutex);                                                 \
-    if (_ze_event_wrappers) {                                                                      \
-      val = _ze_event_wrappers;                                                                    \
-      DL_DELETE(_ze_event_wrappers, val);                                                          \
-    } else {                                                                                       \
-      val = calloc(1, sizeof(struct _ze_event_h));                                                 \
-    }                                                                                              \
-    pthread_mutex_unlock(&_ze_event_wrappers_mutex);                                               \
-  } while (0)
+/* Get a zeroed event wrapper struct: pop from the global recycle list if
+ * any, else calloc a fresh one. The wrapper is context-agnostic — only
+ * the backing L0 event + pool inside it bind to a specific ctx. */
+static struct _ze_event_h *_get_ze_event_wrapper(void) {
+  struct _ze_event_h *e = NULL;
+  pthread_mutex_lock(&_ze_event_wrappers_mutex);
+  if (_ze_event_wrappers) {
+    e = _ze_event_wrappers;
+    DL_DELETE(_ze_event_wrappers, e);
+  }
+  pthread_mutex_unlock(&_ze_event_wrappers_mutex);
+  if (!e)
+    e = (struct _ze_event_h *)calloc(1, sizeof(*e));
+  return e;
+}
 
-#define PUT_ZE_EVENT_WRAPPER(val)                                                                  \
-  do {                                                                                             \
-    memset(val, 0, sizeof(struct _ze_event_h));                                                    \
-    pthread_mutex_lock(&_ze_event_wrappers_mutex);                                                 \
-    DL_PREPEND(_ze_event_wrappers, val);                                                           \
-    pthread_mutex_unlock(&_ze_event_wrappers_mutex);                                               \
-  } while (0)
+/* Return a wrapper struct to the recycle list. Used in two situations:
+ *   1) wrapper construction failed, no L0 objects ever attached;
+ *   2) the wrapper's context is being destroyed — caller has already
+ *      arranged for the L0 event/pool inside to be released (or left
+ *      them to die with the context).
+ * We zero before publishing so a future _get_ze_event_wrapper returns
+ * something equivalent to a fresh calloc. */
+static void _put_ze_event_wrapper(struct _ze_event_h *val) {
+  memset(val, 0, sizeof(*val));
+  pthread_mutex_lock(&_ze_event_wrappers_mutex);
+  DL_PREPEND(_ze_event_wrappers, val);
+  pthread_mutex_unlock(&_ze_event_wrappers_mutex);
+}
 
 /* Caller-supplied ctx: every call site already has it on the stack, so
  * we'd otherwise issue an unnecessary zeCommandListGetContextHandle per
  * profiled Append (caller did the same query moments earlier). */
 static struct _ze_event_h *_get_profiling_event(ze_context_handle_t context) {
-  struct _ze_event_h *e_w;
-  GET_ZE_EVENT(&context, e_w);
+  struct _ze_event_h *e_w = _get_ze_event(context);
   if (e_w)
     return e_w;
 
-  GET_ZE_EVENT_WRAPPER(e_w);
+  e_w = _get_ze_event_wrapper();
   if (!e_w) {
     THAPI_DBGLOG("Could not create a new event wrapper for context: %p", context);
     return NULL;
@@ -668,7 +678,7 @@ static struct _ze_event_h *_get_profiling_event(ze_context_handle_t context) {
 cleanup_ep:
   ZE_EVENT_POOL_DESTROY_PTR(e_w->event_pool);
 cleanup_wrapper:
-  PUT_ZE_EVENT_WRAPPER(e_w);
+  _put_ze_event_wrapper(e_w);
   return NULL;
 }
 
@@ -820,7 +830,7 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
 
   inj->context = ctx;
 
-  /* Plain FIND, no DEL: cl_data stays in the global hash while we work.
+  /* Plain find, no delete: cl_data stays in the global hash while we work.
    * The L0 spec forbids the user from racing Append against Destroy on
    * the same cl handle (cl handle is a not-thread-safe restriction for
    * both zeCommandListAppend* and zeCommandListDestroy), so cl_data
@@ -830,7 +840,7 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
    * from three acquires (DEL + ADD + the cleanup ADD on the fail path)
    * to one — which is the single biggest contention point in the
    * many-threads-many-CLs case. */
-  FIND_ZE_CL(&command_list, cl_data);
+  cl_data = _cl_find(command_list);
   if (!cl_data)
     goto fail;
   inline_path = cl_data->is_compute;
@@ -1044,8 +1054,7 @@ static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
 
 /* Drain a single cl. */
 static void _on_sync_drain_cl(ze_command_list_handle_t command_list) {
-  struct _ze_command_list_obj_data *cl_data = NULL;
-  FIND_ZE_CL(&command_list, cl_data);
+  struct _ze_command_list_obj_data *cl_data = _cl_find(command_list);
   if (!cl_data)
     return;
   pthread_mutex_lock(&cl_data->mtx);
@@ -1065,8 +1074,7 @@ static void _on_sync_drain_cl(ze_command_list_handle_t command_list) {
  * L0; immediate cls' slots have likely already been released at drain
  * time but any stragglers get cleaned up too. */
 static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
-  struct _ze_command_list_obj_data *cl_data = NULL;
-  FIND_AND_DEL_ZE_CL(&command_list, cl_data);
+  struct _ze_command_list_obj_data *cl_data = _cl_find_and_del(command_list);
   if (!cl_data)
     return;
   pthread_mutex_lock(&cl_data->mtx);
@@ -1124,9 +1132,9 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
          * underlying L0 event/pool destroyed there too, not here. The
          * wrappers themselves are context-agnostic, so reuse them. */
         if (s->inj)
-          PUT_ZE_EVENT_WRAPPER(s->inj);
+          _put_ze_event_wrapper(s->inj);
         if (s->shadow_done)
-          PUT_ZE_EVENT_WRAPPER(s->shadow_done);
+          _put_ze_event_wrapper(s->shadow_done);
         free(s->waits);
         free(s->preds);
         _event_latest_signaled_clear_if(s->attr, s);
@@ -1170,7 +1178,7 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
       if (w->event_pool)
         ZE_EVENT_POOL_DESTROY_PTR(w->event_pool);
       DL_DELETE(pe->events, w);
-      PUT_ZE_EVENT_WRAPPER(w);
+      _put_ze_event_wrapper(w);
     }
     free(pe);
   }
@@ -1237,8 +1245,7 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
  *      it to the dep graph + as the "owner" of this queue. */
 static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
                                ze_command_list_handle_t command_list) {
-  struct _ze_command_list_obj_data *cl_data = NULL;
-  FIND_ZE_CL(&command_list, cl_data);
+  struct _ze_command_list_obj_data *cl_data = _cl_find(command_list);
   if (!cl_data)
     return;
   pthread_mutex_lock(&cl_data->mtx);

From 0f271d0909b5fcfe88d37eee5e900635c7066b8b Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0001.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 10 Jun 2026 20:04:05 +0000
Subject: [PATCH 41/54] ze: extract _cl_chunk_free helper

The "DL_DELETE the chunk + zeMemFree the slab (sometimes) + free the
chunk struct" body was open-coded in 5 places: _universal_record_append's
fail_locked rollback, _slot_release, _cl_drain, _on_destroy_command_list,
and _on_destroy_context step 1. They differed only in whether to issue
zeMemFree on the slab (skipped in _on_destroy_context because the ctx
is dying and the driver reclaims).

Extract _cl_chunk_free(cl_data, c, free_slab). Cleaner site-by-site,
and the `free_slab=0` call site documents the "ctx-dying skip-zeMemFree"
invariant in one place (the helper's comment) instead of relying on
readers to spot the missing zeMemFree at a single open-coded site.

No behavior change. All 51 correctness tests pass.
---
 backends/ze/tracer_ze_helpers.include.c | 50 ++++++++++++-------------
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index cd97d239..e7b67774 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -682,6 +682,21 @@ static struct _ze_event_h *_get_profiling_event(ze_context_handle_t context) {
   return NULL;
 }
 
+/* Unlink chunk c from cl_data->chunks and free its slab + struct.
+ * `free_slab` controls whether to issue zeMemFree on the slab — false when
+ * the chunk's context is being destroyed (driver reclaims; zeMemFree on a
+ * doomed ctx is at best racy). Slot-side cleanup (events, waits, preds)
+ * is the caller's responsibility — this helper only owns the chunk
+ * envelope and the slab. */
+static void _cl_chunk_free(struct _ze_command_list_obj_data *cl_data,
+                           struct _ze_slab_chunk *c,
+                           int free_slab) {
+  DL_DELETE(cl_data->chunks, c);
+  if (free_slab && c->slab)
+    ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
+  free(c);
+}
+
 /* Allocate a new chunk and append it to cl_data->chunks. */
 static struct _ze_slab_chunk *_cl_chunk_alloc(struct _ze_command_list_obj_data *cl_data,
                                               ze_context_handle_t ctx) {
@@ -915,12 +930,8 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     c->n_used--;
     c->n_held--;
     memset(s, 0, sizeof(*s));
-    if (c->n_used == 0) {
-      DL_DELETE(cl_data->chunks, c);
-      if (c->slab)
-        ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
-      free(c);
-    }
+    if (c->n_used == 0)
+      _cl_chunk_free(cl_data, c, /*free_slab=*/1);
   }
   pthread_mutex_unlock(&cl_data->mtx);
 fail:
@@ -962,12 +973,8 @@ static void _slot_release(struct _ze_slot *s) {
   if (!c)
     return;
   c->n_held--;
-  if (c->n_held == 0 && c != cl->chunks->prev) {
-    DL_DELETE(cl->chunks, c);
-    if (c->slab)
-      ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
-    free(c);
-  }
+  if (c->n_held == 0 && c != cl->chunks->prev)
+    _cl_chunk_free(cl, c, /*free_slab=*/1);
 }
 
 /* Drain one slot. Recurses on its preds, emits the slot's tracepoint,
@@ -1042,12 +1049,8 @@ static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
     for (uint32_t i = 0; i < c->n_used; ++i)
       _slot_drain(&c->slots[i]);
     c->n_held--;
-    if (c->n_held == 0 && c != cl_data->chunks->prev) {
-      DL_DELETE(cl_data->chunks, c);
-      if (c->slab)
-        ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
-      free(c);
-    }
+    if (c->n_held == 0 && c != cl_data->chunks->prev)
+      _cl_chunk_free(cl_data, c, /*free_slab=*/1);
   }
   cl_data->in_flight_q = NULL;
 }
@@ -1090,10 +1093,7 @@ static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
       free(s->preds);
       _event_latest_signaled_clear_if(s->attr, s);
     }
-    DL_DELETE(cl_data->chunks, c);
-    if (c->slab)
-      ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
-    free(c);
+    _cl_chunk_free(cl_data, c, /*free_slab=*/1);
   }
   pthread_mutex_unlock(&cl_data->mtx);
   pthread_mutex_destroy(&cl_data->mtx);
@@ -1139,11 +1139,7 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
         free(s->preds);
         _event_latest_signaled_clear_if(s->attr, s);
       }
-      DL_DELETE(cl_data->chunks, c);
-      /* Skip zeMemFree on the slab — the ctx is being destroyed; the
-       * driver will reclaim the device allocation. Calling zeMemFree
-       * on a doomed ctx is at best racy. */
-      free(c);
+      _cl_chunk_free(cl_data, c, /*free_slab=*/0);
     }
     pthread_mutex_unlock(&cl_data->mtx);
     pthread_mutex_destroy(&cl_data->mtx);

From 4003262020ec527b7945b8e73ccbbfa2244efe67 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0001.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 10 Jun 2026 20:07:03 +0000
Subject: [PATCH 42/54] ze: unify cl_data destroy paths via
 _cl_data_destroy(ctx_dying)

_on_destroy_command_list and _on_destroy_context step 1 were ~20 lines
of near-identical code each: walk all chunks, free per-slot events/waits/
preds, free the chunk, then free cl_data itself. They differed only in
two ctx-scoped resource decisions that move in lockstep:

  - event wrappers: recycle to the per-ctx pool (ctx alive) vs
    recycle wrapper struct only (ctx dying, pool is about to die too)
  - chunk slab: zeMemFree (ctx alive) vs skip (ctx dying)

Extract _cl_data_destroy(cl_data, int ctx_dying) that captures both
choices. _on_destroy_command_list passes 0; _on_destroy_context step 1
passes 1. The "why dying differs from alive" reasoning now lives in one
place (the helper's comment) instead of being split across two sites
where future readers had to diff them to spot the invariant.

Caller still owns hash removal (single-cl: _cl_find_and_del; per-ctx:
HASH_DEL inside the iter) since those are genuinely different. Lock
order unchanged.

All 51 correctness tests pass.
---
 backends/ze/tracer_ze_helpers.include.c | 74 ++++++++++++-------------
 1 file changed, 35 insertions(+), 39 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index e7b67774..4b2432ff 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -1065,41 +1065,58 @@ static void _on_sync_drain_cl(ze_command_list_handle_t command_list) {
   pthread_mutex_unlock(&cl_data->mtx);
 }
 
-/* zeCommandListDestroy epilogue. The L0 spec says the user must have
- * ensured the device is no longer referencing the cl, so we don't drain
- * (the GPU is already idle on this cl). We just release our state:
- * PUT every slot's tracer-owned events back to the per-context pool,
- * free per-slot allocations, free every chunk's slab + chunk struct,
- * remove cl_data from the registry, free cl_data itself.
+/* Release everything cl_data owns and free cl_data itself. Caller must
+ * have already removed cl_data from the global _ze_cls hash (single-cl:
+ * _cl_find_and_del; per-ctx sweep: HASH_DEL inside the iter), since the
+ * two callers find their cls differently.
  *
- * Works for both cl kinds: regular cls (inj baked into the cl body)
- * can recycle inj here because the cl body is about to be destroyed by
- * L0; immediate cls' slots have likely already been released at drain
- * time but any stragglers get cleaned up too. */
-static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
-  struct _ze_command_list_obj_data *cl_data = _cl_find_and_del(command_list);
-  if (!cl_data)
-    return;
+ * ctx_dying changes how we dispose of two ctx-scoped resources:
+ *
+ *   - per-slot event wrappers: when ctx is alive, recycle them to the
+ *     per-context event pool via _put_ze_event so a later cl on the same
+ *     ctx can grab them. When ctx is dying, the pool itself is about to
+ *     be wiped in _on_destroy_context step 3 (which will destroy the L0
+ *     event/pool too), so we only recycle the wrapper struct.
+ *
+ *   - chunk slabs: zeMemFree the slab when ctx is alive; skip when ctx
+ *     is dying — the driver reclaims, and zeMemFree on a doomed ctx is
+ *     at best racy. */
+static void _cl_data_destroy(struct _ze_command_list_obj_data *cl_data, int ctx_dying) {
   pthread_mutex_lock(&cl_data->mtx);
   struct _ze_slab_chunk *c, *tmp;
   DL_FOREACH_SAFE(cl_data->chunks, c, tmp) {
     for (uint32_t i = 0; i < c->n_used; ++i) {
       struct _ze_slot *s = &c->slots[i];
       if (s->inj)
-        _put_ze_event(s->inj);
+        ctx_dying ? _put_ze_event_wrapper(s->inj) : _put_ze_event(s->inj);
       if (s->shadow_done)
-        _put_ze_event(s->shadow_done);
+        ctx_dying ? _put_ze_event_wrapper(s->shadow_done) : _put_ze_event(s->shadow_done);
       free(s->waits);
       free(s->preds);
       _event_latest_signaled_clear_if(s->attr, s);
     }
-    _cl_chunk_free(cl_data, c, /*free_slab=*/1);
+    _cl_chunk_free(cl_data, c, /*free_slab=*/!ctx_dying);
   }
   pthread_mutex_unlock(&cl_data->mtx);
   pthread_mutex_destroy(&cl_data->mtx);
   free(cl_data);
 }
 
+/* zeCommandListDestroy epilogue. The L0 spec says the user must have
+ * ensured the device is no longer referencing the cl, so we don't drain
+ * (the GPU is already idle on this cl). We just release our state.
+ *
+ * Works for both cl kinds: regular cls (inj baked into the cl body)
+ * can recycle inj here because the cl body is about to be destroyed by
+ * L0; immediate cls' slots have likely already been released at drain
+ * time but any stragglers get cleaned up too. */
+static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
+  struct _ze_command_list_obj_data *cl_data = _cl_find_and_del(command_list);
+  if (!cl_data)
+    return;
+  _cl_data_destroy(cl_data, /*ctx_dying=*/0);
+}
+
 /* zeContextDestroy prologue. The user contract is that the device is no
  * longer referencing the context, so all cls/events bound to it are
  * conceptually dead from the user's perspective. Our job here is solely
@@ -1122,28 +1139,7 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
     if (cl_data->cached_context != hContext)
       continue;
     HASH_DEL(_ze_cls, cl_data);
-    pthread_mutex_lock(&cl_data->mtx);
-    struct _ze_slab_chunk *c, *ctmp;
-    DL_FOREACH_SAFE(cl_data->chunks, c, ctmp) {
-      for (uint32_t i = 0; i < c->n_used; ++i) {
-        struct _ze_slot *s = &c->slots[i];
-        /* Recycle our event wrappers but DON'T return them to the per-ctx
-         * pool — the pool entry will be wiped in step 3, and we want the
-         * underlying L0 event/pool destroyed there too, not here. The
-         * wrappers themselves are context-agnostic, so reuse them. */
-        if (s->inj)
-          _put_ze_event_wrapper(s->inj);
-        if (s->shadow_done)
-          _put_ze_event_wrapper(s->shadow_done);
-        free(s->waits);
-        free(s->preds);
-        _event_latest_signaled_clear_if(s->attr, s);
-      }
-      _cl_chunk_free(cl_data, c, /*free_slab=*/0);
-    }
-    pthread_mutex_unlock(&cl_data->mtx);
-    pthread_mutex_destroy(&cl_data->mtx);
-    free(cl_data);
+    _cl_data_destroy(cl_data, /*ctx_dying=*/1);
   }
   pthread_mutex_unlock(&_ze_cls_mutex);
 

From 4e49526cc5cb43f4a0fac65ad2138c851057d4f4 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0001.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 10 Jun 2026 22:18:39 +0000
Subject: [PATCH 43/54] ze: collapse every per-domain tracer mutex into one
 _ze_state_mutex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The dep-graph caches cross-cl slot pointers in s->preds[], so any drain
may mutate ANY cl's chunks. A per-cl mtx scheme requires cross-cl lock
acquisition with ordering rules. One mutex covering all tracer state
makes that go away — drain freely follows pred pointers; Append on
different cls serializes through the same lock.

Once we had a single mutex for cl_data state, every other per-domain
lock (event freelists, event-pool registry, qgroup cache, shadow-cl
registry, per-shadow-cl L0 Append serializer, latest-signaled map) was
guarding its own little island while every caller already held
_ze_state_mutex. Folded them all in.

Removed:
  cl_data->mtx          (per-cl)
  _ze_cls_mutex         (cl registry)
  _ze_event_wrappers_mutex
  _ze_event_pools_mutex
  _ze_event_latest_signaled_mutex
  _ze_qgroup_cache_mutex
  _ze_shadow_cls_mutex
  sh->mtx               (per shadow cl)

Kept:
  _ze_state_mutex       (the one)
  ze_closures_mutex     (separate domain — ffi closures)

Fixes two TSan-confirmed bugs in the process:

  R3: _on_destroy_context read cl_data->cached_context under
      _ze_cls_mutex while _universal_record_append wrote it without
      any lock. Different mutexes, observable race.

  Bug 2: _slot_drain recursed through s->preds[i] into another cl's
      slots and mutated chunk->n_held / slot->live while
      _cl_slot_append on that cl mutated the same bytes. Reproduced
      by ooo_imm_Event_multithreaded_01 / _04.

All the _cl_*, _get_*, _put_*, _event_latest_signaled_*, _qgroup_cache_get,
_get_shadow_cl, _shadow_append_query, _get_profiling_event helpers run
under the assumption that the caller holds _ze_state_mutex. Entry points
that used to call these without the lock (_on_create_command_list calling
_ordinal_is_compute, _on_destroy_context's three steps, the generated
profiling_prologue calling _get_profiling_event) take it now.

Other simplifications that fall out:
  - _Atomic on cached_context reverts to plain ze_context_handle_t.
  - __atomic_* on slot->refs reverts to plain ++/--.
  - The "allocate outside lock, race-publish inside" pattern in
    _event_latest_signaled_set / _put_ze_event / _get_ze_event_wrapper
    / _get_shadow_cl all collapse to straight HASH/DL ops.
  - sh->live_queries no longer needs its own mtx — manipulated under
    the state mutex like everything else.

Perf trade: Append on different cls and the various freelist accesses
serialize through one mutex. The L0 calls inside the critical section
(zeCommandListAppendBarrier or AppendQueryKernelTimestamps) are short —
the GPU just queues, doesn't execute. Worth it for the dramatic
simplification: one mutex, zero lock-ordering rules, zero atomics, zero
cross-cl acquisitions.

Net -81 lines on the helpers file. 10/10 multithreaded tests under TSan
report 0 races. 51/51 correctness tests pass; the
imm_Event_multithreaded_01 case that exposed an unlocked-prologue
regression mid-development passes 5/5 on loop.
---
 backends/ze/tracer_ze_helpers.include.c | 337 +++++++++---------------
 backends/ze/ze_model.rb                 |   2 +
 2 files changed, 129 insertions(+), 210 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 4b2432ff..92ee5842 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -164,9 +164,8 @@ struct _ze_slot {
   /* Incoming pred edges: count of downstream slots whose preds[] points
    * here AND that have not yet been drained. Incremented at downstream
    * _slot_instantiate (one per pred edge), decremented at downstream
-   * _slot_drain. Slot is reclaimable iff live==0 AND refs==0. Atomic
-   * because increment/decrement happen across cl boundaries without
-   * holding the slot's owner mtx. */
+   * _slot_drain. Slot is reclaimable iff live==0 AND refs==0. All
+   * accesses run under _ze_state_mutex, so plain increment/decrement. */
   uint32_t refs;
 };
 
@@ -202,10 +201,6 @@ struct _ze_command_list_obj_data {
    *
    * Held only for regular cls; immediate cls never Execute. */
   ze_command_queue_handle_t in_flight_q;
-  /* Serializes the Execute prologue: if two threads race to Execute the
-   * same closed cl on different queues, we need to force-sync the prior
-   * one before letting the second run instantiate. */
-  pthread_mutex_t mtx;
   unsigned char is_immediate;
   unsigned char is_in_order;
   /* 1 if this cl's queue group exposes COMPUTE — its body can host
@@ -217,34 +212,38 @@ struct _ze_command_list_obj_data {
 
   /* Cached on first use: context handle for this cl. Immutable for the
    * cl's lifetime. Load-bearing for _on_destroy_context's sweep: lets it
-   * associate cls back to their ctx without an L0 roundtrip per cl. */
+   * associate cls back to their ctx without an L0 roundtrip per cl.
+   * Protected (along with everything else here) by _ze_state_mutex. */
   ze_context_handle_t cached_context;
 };
 
 struct _ze_command_list_obj_data *_ze_cls = NULL;
-pthread_mutex_t _ze_cls_mutex = PTHREAD_MUTEX_INITIALIZER;
 
+/* Single mutex guarding ALL per-cl tracer state: the _ze_cls registry,
+ * cl_data fields, every cl's chunks/slots/preds. Append, Execute, Drain,
+ * and the destroy paths all take it. The dep-graph lets a single drain
+ * walk preds into other cls and mutate their chunks (n_held--, free
+ * chunk), so any per-cl locking scheme would need cross-cl acquisition
+ * with lock ordering — one global mutex avoids that entirely. Drain is
+ * host-blocking (zeEventHostSynchronize on shadow events) anyway, so
+ * the perf cost of serializing it across cls is bounded. */
+pthread_mutex_t _ze_state_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* The _cl_* helpers are pure HASH wrappers. Caller holds _ze_state_mutex. */
 static struct _ze_command_list_obj_data *_cl_find(ze_command_list_handle_t command_list) {
   struct _ze_command_list_obj_data *cl = NULL;
-  pthread_mutex_lock(&_ze_cls_mutex);
   HASH_FIND_PTR(_ze_cls, &command_list, cl);
-  pthread_mutex_unlock(&_ze_cls_mutex);
   return cl;
 }
 
 static void _cl_add(struct _ze_command_list_obj_data *cl) {
-  pthread_mutex_lock(&_ze_cls_mutex);
   HASH_ADD_PTR(_ze_cls, ptr, cl);
-  pthread_mutex_unlock(&_ze_cls_mutex);
 }
 
 static struct _ze_command_list_obj_data *_cl_find_and_del(ze_command_list_handle_t command_list) {
-  struct _ze_command_list_obj_data *cl = NULL;
-  pthread_mutex_lock(&_ze_cls_mutex);
-  HASH_FIND_PTR(_ze_cls, &command_list, cl);
+  struct _ze_command_list_obj_data *cl = _cl_find(command_list);
   if (cl)
     HASH_DEL(_ze_cls, cl);
-  pthread_mutex_unlock(&_ze_cls_mutex);
   return cl;
 }
 
@@ -261,21 +260,17 @@ struct _ze_qgroup_cache_entry {
   UT_hash_handle hh;
 };
 static struct _ze_qgroup_cache_entry *_ze_qgroup_cache = NULL;
-static pthread_mutex_t _ze_qgroup_cache_mutex = PTHREAD_MUTEX_INITIALIZER;
 
 /* Populate (or return cached) flag bitmap for device. The cache lives
- * for process lifetime — once published, the entry pointer and its flags
- * array are immutable, so callers can dereference them without the
- * mutex. Returns NULL on driver error / OOM. */
+ * for process lifetime. Caller holds _ze_state_mutex — first-touch L0
+ * queries (zeDeviceGetCommandQueueGroupProperties) happen under the
+ * state mutex; cost is bounded since lookups are once per device. */
 static struct _ze_qgroup_cache_entry *_qgroup_cache_get(ze_device_handle_t device) {
-  pthread_mutex_lock(&_ze_qgroup_cache_mutex);
   struct _ze_qgroup_cache_entry *e = NULL;
   HASH_FIND_PTR(_ze_qgroup_cache, &device, e);
-  pthread_mutex_unlock(&_ze_qgroup_cache_mutex);
   if (e)
     return e;
 
-  /* Slow path: scan queue groups outside the lock. */
   uint32_t n_groups = 0;
   if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, NULL) !=
           ZE_RESULT_SUCCESS ||
@@ -302,20 +297,15 @@ static struct _ze_qgroup_cache_entry *_qgroup_cache_get(ze_device_handle_t devic
     flags[i] = groups[i].flags;
   free(groups);
 
-  pthread_mutex_lock(&_ze_qgroup_cache_mutex);
-  HASH_FIND_PTR(_ze_qgroup_cache, &device, e);
+  e = (struct _ze_qgroup_cache_entry *)calloc(1, sizeof(*e));
   if (!e) {
-    e = (struct _ze_qgroup_cache_entry *)calloc(1, sizeof(*e));
-    if (e) {
-      e->device = device;
-      e->flags = flags;
-      e->n_groups = n_groups;
-      HASH_ADD_PTR(_ze_qgroup_cache, device, e);
-    }
-  }
-  pthread_mutex_unlock(&_ze_qgroup_cache_mutex);
-  if (!e || e->flags != flags)
     free(flags);
+    return NULL;
+  }
+  e->device = device;
+  e->flags = flags;
+  e->n_groups = n_groups;
+  HASH_ADD_PTR(_ze_qgroup_cache, device, e);
   return e;
 }
 
@@ -355,29 +345,25 @@ struct _ze_shadow_key {
 struct _ze_shadow_cl {
   struct _ze_shadow_key key;
   ze_command_list_handle_t cl;
-  pthread_mutex_t mtx;
-  uint32_t live_queries; /* QKTs appended but not yet host-synced; protected by mtx */
+  uint32_t live_queries; /* QKTs appended but not yet host-synced */
   UT_hash_handle hh;
 };
 static struct _ze_shadow_cl *_ze_shadow_cls = NULL;
-static pthread_mutex_t _ze_shadow_cls_mutex = PTHREAD_MUTEX_INITIALIZER;
 
 /* Returns the shadow cl for (context, device), creating it lazily on
  * first use. Returns NULL if the device has no compute group (fatal:
- * we log to stderr) or if creation fails. */
+ * we log to stderr) or if creation fails. Caller holds _ze_state_mutex
+ * (lookup, lazy create, and the L0 zeCommandListCreateImmediate call
+ * all happen under it; the create is first-touch per (ctx,device) so
+ * the cost is bounded). */
 static struct _ze_shadow_cl *_get_shadow_cl(ze_context_handle_t context,
                                             ze_device_handle_t device) {
   struct _ze_shadow_key key = {context, device};
-  pthread_mutex_lock(&_ze_shadow_cls_mutex);
   struct _ze_shadow_cl *sh = NULL;
   HASH_FIND(hh, _ze_shadow_cls, &key, sizeof(key), sh);
-  if (sh) {
-    pthread_mutex_unlock(&_ze_shadow_cls_mutex);
+  if (sh)
     return sh;
-  }
-  pthread_mutex_unlock(&_ze_shadow_cls_mutex);
 
-  /* Slow path: create outside the registry lock. */
   uint32_t ord = _get_compute_ordinal(device);
   if (ord == (uint32_t)-1) {
     fprintf(stderr,
@@ -406,55 +392,37 @@ static struct _ze_shadow_cl *_get_shadow_cl(ze_context_handle_t context,
             (void *)context, (void *)device);
     return NULL;
   }
-
-  pthread_mutex_lock(&_ze_shadow_cls_mutex);
-  HASH_FIND(hh, _ze_shadow_cls, &key, sizeof(key), sh);
-  if (sh) {
-    /* Lost the race; destroy ours and return the winner. */
-    pthread_mutex_unlock(&_ze_shadow_cls_mutex);
-    ZE_COMMAND_LIST_DESTROY_PTR(new_cl);
-    return sh;
-  }
   sh = (struct _ze_shadow_cl *)calloc(1, sizeof(*sh));
   if (!sh) {
-    pthread_mutex_unlock(&_ze_shadow_cls_mutex);
     ZE_COMMAND_LIST_DESTROY_PTR(new_cl);
     return NULL;
   }
   sh->key = key;
   sh->cl = new_cl;
-  pthread_mutex_init(&sh->mtx, NULL);
   HASH_ADD(hh, _ze_shadow_cls, key, sizeof(sh->key), sh);
-  pthread_mutex_unlock(&_ze_shadow_cls_mutex);
   return sh;
 }
 
 /* Append AppendQueryKernelTimestamps on the shadow cl: wait on inj,
- * signal shadow_done, write timestamps into slab[*off]. Serialized on
- * sh->mtx because L0 doesn't allow concurrent Appends to one cl.
- * Aborts on L0 failure (defensive — a missing Query would silently
- * drop this kernel's timing). */
+ * signal shadow_done, write timestamps into slab[*off]. Caller holds
+ * _ze_state_mutex, which also serializes the not-thread-safe-per-cl-
+ * handle L0 Append on the shared shadow cl. Aborts on L0 failure
+ * (defensive — a missing Query would silently drop this kernel's
+ * timing). */
 static void _shadow_append_query(struct _ze_shadow_cl *sh,
                                  ze_event_handle_t inj_event,
                                  void *slab,
                                  size_t *off,
                                  ze_event_handle_t shadow_done_event) {
-  pthread_mutex_lock(&sh->mtx);
   sh->live_queries++;
   _ZE_MUST(ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(sh->cl, 1, &inj_event, slab, off,
                                                               /*hSignalEvent=*/shadow_done_event,
                                                               /*numWaitEvents=*/1, &inj_event));
-  pthread_mutex_unlock(&sh->mtx);
 }
 
 static inline void _on_create_command_list(ze_command_list_handle_t command_list,
                                            ze_device_handle_t device,
                                            uint32_t ordinal, int immediate, int in_order) {
-  if (_cl_find(command_list)) {
-    THAPI_DBGLOG("Command list already registered: %p", command_list);
-    return;
-  }
-
   struct _ze_command_list_obj_data *cl_data =
       (struct _ze_command_list_obj_data *)calloc(1, sizeof(*cl_data));
   if (!cl_data) {
@@ -464,9 +432,18 @@ static inline void _on_create_command_list(ze_command_list_handle_t command_list
   cl_data->ptr = (void *)command_list;
   cl_data->is_immediate = immediate ? 1 : 0;
   cl_data->is_in_order = in_order ? 1 : 0;
+
+  pthread_mutex_lock(&_ze_state_mutex);
+  /* _ordinal_is_compute touches the qgroup cache (state-mutex-protected). */
   cl_data->is_compute = _ordinal_is_compute(device, ordinal) ? 1 : 0;
-  pthread_mutex_init(&cl_data->mtx, NULL);
+  if (_cl_find(command_list)) {
+    pthread_mutex_unlock(&_ze_state_mutex);
+    THAPI_DBGLOG("Command list already registered: %p", command_list);
+    free(cl_data);
+    return;
+  }
   _cl_add(cl_data);
+  pthread_mutex_unlock(&_ze_state_mutex);
 }
 
 /* Wrapper around an injected event we own. Lives either in the per-context
@@ -486,7 +463,6 @@ struct _ze_event_pool_entry {
 };
 
 struct _ze_event_pool_entry *_ze_event_pools = NULL;
-static pthread_mutex_t _ze_event_pools_mutex = PTHREAD_MUTEX_INITIALIZER;
 
 /* event_latest_signaled[ev] -> the most recent slot whose attr==ev.
  * Used to resolve happens-before edges: when a new Append says "wait on
@@ -498,43 +474,29 @@ struct _ze_event_latest_signaled_entry {
   UT_hash_handle hh;
 };
 static struct _ze_event_latest_signaled_entry *_ze_event_latest_signaled = NULL;
-static pthread_mutex_t _ze_event_latest_signaled_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* All three accessors below run under _ze_state_mutex (caller's
+ * responsibility). */
 
 static inline struct _ze_slot *_event_latest_signaled_get(ze_event_handle_t ev) {
   struct _ze_event_latest_signaled_entry *e = NULL;
-  pthread_mutex_lock(&_ze_event_latest_signaled_mutex);
   HASH_FIND_PTR(_ze_event_latest_signaled, &ev, e);
-  struct _ze_slot *s = e ? e->slot : NULL;
-  pthread_mutex_unlock(&_ze_event_latest_signaled_mutex);
-  return s;
+  return e ? e->slot : NULL;
 }
 
 static inline void _event_latest_signaled_set(ze_event_handle_t ev, struct _ze_slot *s) {
   if (!ev)
     return;
-  /* Allocate the new entry outside the lock; same pattern as _qgroup_cache_get.
-   * If we lose the race to publish it, free our unused copy. Saves a heap
-   * call worth of contention on _ze_event_latest_signaled_mutex per Append
-   * that touches an event the tracer hasn't seen before. */
-  struct _ze_event_latest_signaled_entry *pre =
-      (struct _ze_event_latest_signaled_entry *)calloc(1, sizeof(*pre));
-
-  pthread_mutex_lock(&_ze_event_latest_signaled_mutex);
   struct _ze_event_latest_signaled_entry *e = NULL;
   HASH_FIND_PTR(_ze_event_latest_signaled, &ev, e);
   if (!e) {
-    if (!pre) {
-      pthread_mutex_unlock(&_ze_event_latest_signaled_mutex);
+    e = (struct _ze_event_latest_signaled_entry *)calloc(1, sizeof(*e));
+    if (!e)
       return;
-    }
-    e = pre;
-    pre = NULL;
     e->ev = ev;
     HASH_ADD_PTR(_ze_event_latest_signaled, ev, e);
   }
   e->slot = s;
-  pthread_mutex_unlock(&_ze_event_latest_signaled_mutex);
-  free(pre); /* harmless if NULL; non-NULL only when we lost the entry race */
 }
 
 /* Remove event_latest_signaled[ev] only if it still points at slot s
@@ -543,54 +505,40 @@ static inline void _event_latest_signaled_set(ze_event_handle_t ev, struct _ze_s
 static inline void _event_latest_signaled_clear_if(ze_event_handle_t ev, struct _ze_slot *s) {
   if (!ev)
     return;
-  pthread_mutex_lock(&_ze_event_latest_signaled_mutex);
   struct _ze_event_latest_signaled_entry *e = NULL;
   HASH_FIND_PTR(_ze_event_latest_signaled, &ev, e);
   if (e && e->slot == s) {
     HASH_DEL(_ze_event_latest_signaled, e);
     free(e);
   }
-  pthread_mutex_unlock(&_ze_event_latest_signaled_mutex);
 }
 
+/* All four event helpers below run under _ze_state_mutex (caller's
+ * responsibility). */
+
 /* Pop one recycled event wrapper from the per-context freelist; NULL if
  * none cached. Caller (today only _get_profiling_event) will fall back
  * to creating a fresh L0 event pool + event. */
 static struct _ze_event_h *_get_ze_event(ze_context_handle_t context) {
-  struct _ze_event_h *e = NULL;
-  pthread_mutex_lock(&_ze_event_pools_mutex);
   struct _ze_event_pool_entry *pool = NULL;
   HASH_FIND_PTR(_ze_event_pools, &context, pool);
-  if (pool && pool->events) {
-    e = pool->events;
-    DL_DELETE(pool->events, e);
-  }
-  pthread_mutex_unlock(&_ze_event_pools_mutex);
+  if (!pool || !pool->events)
+    return NULL;
+  struct _ze_event_h *e = pool->events;
+  DL_DELETE(pool->events, e);
   return e;
 }
 
-/* Return an event wrapper to its per-context freelist. Reset is issued
- * BEFORE we take the lock — zeEventHostReset is thread-safe and resetting
- * with the lock held would serialize an L0 driver round-trip behind the
- * global pools mutex. Pre-allocate the new bucket entry outside the lock
- * for the same reason; if we lose the race to create the bucket, free our
- * unused copy.
- *
- * On total failure (couldn't allocate bucket AND there isn't one), there's
- * nowhere to park the wrapper, so destroy its backing L0 objects and free
- * the wrapper itself — we'd rather leak nothing than poison the freelist. */
+/* Return an event wrapper to its per-context freelist. On total failure
+ * (no bucket can be allocated), destroy the backing L0 objects and free
+ * the wrapper — we'd rather leak nothing than poison the freelist. */
 static void _put_ze_event(struct _ze_event_h *val) {
   _ZE_MUST(ZE_EVENT_HOST_RESET_PTR(val->event));
-
-  struct _ze_event_pool_entry *pre =
-      (struct _ze_event_pool_entry *)calloc(1, sizeof(*pre));
-
-  pthread_mutex_lock(&_ze_event_pools_mutex);
   struct _ze_event_pool_entry *pool = NULL;
   HASH_FIND_PTR(_ze_event_pools, &val->context, pool);
   if (!pool) {
-    if (!pre) {
-      pthread_mutex_unlock(&_ze_event_pools_mutex);
+    pool = (struct _ze_event_pool_entry *)calloc(1, sizeof(*pool));
+    if (!pool) {
       THAPI_DBGLOG("Failed to allocate memory");
       if (val->event_pool) {
         if (val->event)
@@ -600,32 +548,22 @@ static void _put_ze_event(struct _ze_event_h *val) {
       free(val);
       return;
     }
-    pool = pre;
-    pre = NULL;
     pool->context = val->context;
     HASH_ADD_PTR(_ze_event_pools, context, pool);
   }
   DL_PREPEND(pool->events, val);
-  pthread_mutex_unlock(&_ze_event_pools_mutex);
-  free(pre); /* harmless if NULL; non-NULL only when we lost the bucket race */
 }
 
-
 struct _ze_event_h *_ze_event_wrappers = NULL;
-static pthread_mutex_t _ze_event_wrappers_mutex = PTHREAD_MUTEX_INITIALIZER;
 
 /* Get a zeroed event wrapper struct: pop from the global recycle list if
  * any, else calloc a fresh one. The wrapper is context-agnostic — only
  * the backing L0 event + pool inside it bind to a specific ctx. */
 static struct _ze_event_h *_get_ze_event_wrapper(void) {
-  struct _ze_event_h *e = NULL;
-  pthread_mutex_lock(&_ze_event_wrappers_mutex);
-  if (_ze_event_wrappers) {
-    e = _ze_event_wrappers;
+  struct _ze_event_h *e = _ze_event_wrappers;
+  if (e)
     DL_DELETE(_ze_event_wrappers, e);
-  }
-  pthread_mutex_unlock(&_ze_event_wrappers_mutex);
-  if (!e)
+  else
     e = (struct _ze_event_h *)calloc(1, sizeof(*e));
   return e;
 }
@@ -639,19 +577,20 @@ static struct _ze_event_h *_get_ze_event_wrapper(void) {
  * something equivalent to a fresh calloc. */
 static void _put_ze_event_wrapper(struct _ze_event_h *val) {
   memset(val, 0, sizeof(*val));
-  pthread_mutex_lock(&_ze_event_wrappers_mutex);
   DL_PREPEND(_ze_event_wrappers, val);
-  pthread_mutex_unlock(&_ze_event_wrappers_mutex);
 }
 
 /* Caller-supplied ctx: every call site already has it on the stack, so
  * we'd otherwise issue an unnecessary zeCommandListGetContextHandle per
- * profiled Append (caller did the same query moments earlier). */
+ * profiled Append (caller did the same query moments earlier).
+ *
+ * Caller holds _ze_state_mutex. The L0 event/pool create calls run under
+ * the lock; they're cold (only when the freelist for this ctx is empty)
+ * so the cost is bounded. */
 static struct _ze_event_h *_get_profiling_event(ze_context_handle_t context) {
   struct _ze_event_h *e_w = _get_ze_event(context);
   if (e_w)
     return e_w;
-
   e_w = _get_ze_event_wrapper();
   if (!e_w) {
     THAPI_DBGLOG("Could not create a new event wrapper for context: %p", context);
@@ -798,7 +737,7 @@ static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct
   }
   /* Each new pred edge holds a ref on its target. */
   for (uint32_t i = 0; i < s->n_preds; ++i)
-    __atomic_fetch_add(&s->preds[i]->refs, 1, __ATOMIC_RELAXED);
+    s->preds[i]->refs++;
   if (s->attr)
     _event_latest_signaled_set(s->attr, s);
 }
@@ -807,7 +746,7 @@ static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct
  * shadow cl; inline path is a no-op here (its QKT is baked into the user cl
  * body at Append). Then instantiate in the dep graph. `s->shadow_done` is
  * the single source of truth for "shadow vs inline" — no is_compute branch
- * at the call site. Caller holds cl_data->mtx. */
+ * at the call site. Caller holds _ze_state_mutex. */
 static void _slot_publish(struct _ze_command_list_obj_data *cl_data,
                           struct _ze_slot *s,
                           struct _ze_shadow_cl *sh) {
@@ -822,13 +761,11 @@ static void _slot_publish(struct _ze_command_list_obj_data *cl_data,
 /* Append-time hook called from profiling_epilogue. Caller already
  * swapped user's hSignalEvent for inj->event. user_signal is the
  * ORIGINAL value (possibly NULL). user_waits is the user's wait list
- * (NULL,0 if none).
+ * (NULL,0 if none). ctx is fetched once in the prologue and threaded
+ * in to avoid a second zeCommandListGetContextHandle.
  *
  * Forks on cl_data->is_compute to pick the QKT placement (INLINE vs
  * SHADOW) — see the "QKT placement" diagram at the top of this file. */
-/* ctx is fetched once in the prologue (profiling_prologue in ze_model.rb) and
- * threaded in here so _universal_record_append doesn't reissue the same
- * zeCommandListGetContextHandle the prologue already made. */
 static void _universal_record_append(ze_command_list_handle_t command_list,
                                      ze_context_handle_t ctx,
                                      struct _ze_event_h *inj,
@@ -838,36 +775,16 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   if (!inj || !ctx)
     return;
   struct _ze_event_h *shadow_done = NULL;
-  struct _ze_command_list_obj_data *cl_data = NULL;
   struct _ze_slot *s = NULL;
-  int inline_path = 0;
   int barrier_chained = 0;
 
   inj->context = ctx;
 
-  /* Plain find, no delete: cl_data stays in the global hash while we work.
-   * The L0 spec forbids the user from racing Append against Destroy on
-   * the same cl handle (cl handle is a not-thread-safe restriction for
-   * both zeCommandListAppend* and zeCommandListDestroy), so cl_data
-   * cannot be torn out from under us by a concurrent _on_destroy_*.
-   * cl_data->mtx still serializes our work against another Append /
-   * Execute on this same cl. Per-Append cost on _ze_cls_mutex drops
-   * from three acquires (DEL + ADD + the cleanup ADD on the fail path)
-   * to one — which is the single biggest contention point in the
-   * many-threads-many-CLs case. */
-  cl_data = _cl_find(command_list);
+  pthread_mutex_lock(&_ze_state_mutex);
+  struct _ze_command_list_obj_data *cl_data = _cl_find(command_list);
   if (!cl_data)
-    goto fail;
-  inline_path = cl_data->is_compute;
-  /* Publish the cl->ctx mapping up front. _on_execute_one_cl reads it
-   * directly (no fallback fetch) when resolving the shadow cl, and
-   * _on_destroy_context's per-cl sweep matches against it. Doing this
-   * before any slot is appended guarantees both readers find a populated
-   * value. Race-safe unlocked: every writer stores the same value (the
-   * cl's true context), and _on_destroy_context's reader is gated on a
-   * user contract that forbids concurrent Append + DestroyContext on
-   * the same context. */
-  cl_data->cached_context = ctx;
+    goto fail_locked;
+  int inline_path = cl_data->is_compute;
 
   /* Shadow path needs a fence event (Query lives on the shadow cl;
    * drain host-syncs on it). Inline path uses user_signal as the fence
@@ -875,11 +792,14 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   if (!inline_path) {
     shadow_done = _get_profiling_event(ctx);
     if (!shadow_done)
-      goto fail;
+      goto fail_locked;
     shadow_done->context = ctx;
   }
 
-  pthread_mutex_lock(&cl_data->mtx);
+  /* Publish the cl->ctx mapping. _on_execute_one_cl reads it directly
+   * (no fallback fetch) when resolving the shadow cl, and
+   * _on_destroy_context's per-cl sweep matches against it. */
+  cl_data->cached_context = ctx;
 
   s = _cl_slot_append(cl_data, ctx, inj, shadow_done, user_signal, user_waits, user_n_waits);
   if (!s)
@@ -895,7 +815,7 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
                                                                 user_signal, 1, &wait_ev));
     barrier_chained = 1; /* user_signal chained via the QKT itself */
     _slot_instantiate(cl_data, s);
-    pthread_mutex_unlock(&cl_data->mtx);
+    pthread_mutex_unlock(&_ze_state_mutex);
     return;
   }
 
@@ -915,13 +835,13 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
       goto fail_locked;
     _slot_publish(cl_data, s, sh);
   }
-  pthread_mutex_unlock(&cl_data->mtx);
+  pthread_mutex_unlock(&_ze_state_mutex);
   return;
 
 fail_locked:
   if (s) {
     /* Roll back the slot we just appended. We were the very last to
-     * touch the tail chunk and we hold cl_data->mtx, so decrementing
+     * touch the tail chunk and we hold _ze_state_mutex, so decrementing
      * n_used/n_held and clearing the slot is safe. If the chunk
      * was freshly allocated only for this Append (n_used now 0), free
      * it back so we don't leak a chunk per slot-append failure. */
@@ -933,19 +853,20 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     if (c->n_used == 0)
       _cl_chunk_free(cl_data, c, /*free_slab=*/1);
   }
-  pthread_mutex_unlock(&cl_data->mtx);
-fail:
+  if (shadow_done)
+    _put_ze_event(shadow_done);
+  _put_ze_event(inj);
+  pthread_mutex_unlock(&_ze_state_mutex);
   /* If we never chained user_signal off inj, do it now. The prologue
    * swapped user's sig for inj->event; without this Append the user's
    * Sync(user_signal) would hang forever. Aborts on failure — we have
-   * no second-chance recovery and a silent hang is worse than a crash. */
+   * no second-chance recovery and a silent hang is worse than a crash.
+   * Outside the state mutex: barrier on the user's cl is L0-side and
+   * doesn't touch tracer state. */
   if (user_signal && !barrier_chained) {
     ze_event_handle_t wait_ev = inj->event;
     _ZE_MUST(ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &wait_ev));
   }
-  if (shadow_done)
-    _put_ze_event(shadow_done);
-  _put_ze_event(inj);
 }
 
 /* Reclaim a slot: PUT events back to the per-context pool, free waits,
@@ -985,6 +906,10 @@ static void _slot_release(struct _ze_slot *s) {
  * Slab read uses s->chunk->slab — preds may live in another cl, so we
  * can't use the caller's slab.
  *
+ * Caller holds _ze_state_mutex. That single mutex covers every cl's
+ * chunks/slots, so cross-cl pred recursion is safe with no further
+ * locking: Append on the pred's cl also takes _ze_state_mutex.
+ *
  * No cycle guard: preds come from in-order prev (strictly earlier slot
  * in the same cl, DAG) and from event_latest_signaled[wait_event] (a slot published
  * BEFORE us). Forming a cycle would require user-declared mutual waits,
@@ -1008,11 +933,9 @@ static void _slot_drain(struct _ze_slot *s) {
      * this shadow cl is in flight, Reset it: the L0 driver leaks ~10 KB
      * per AppendQueryKernelTimestamps and only reclaims at Reset/Destroy. */
     if (s->sh) {
-      pthread_mutex_lock(&s->sh->mtx);
       s->sh->live_queries--;
       if (s->sh->live_queries == 0)
         _ZE_MUST(ZE_COMMAND_LIST_RESET_PTR(s->sh->cl));
-      pthread_mutex_unlock(&s->sh->mtx);
     }
   }
   ze_event_handle_t attr = s->attr ? s->attr : (s->inj ? s->inj->event : NULL);
@@ -1028,13 +951,13 @@ static void _slot_drain(struct _ze_slot *s) {
   /* Drop refs on preds; release any that hit 0 and are already drained. */
   for (uint32_t i = 0; i < s->n_preds; ++i) {
     struct _ze_slot *p = s->preds[i];
-    if (__atomic_sub_fetch(&p->refs, 1, __ATOMIC_RELAXED) == 0 && !p->live)
+    if (--p->refs == 0 && !p->live)
       _slot_release(p);
   }
   free(s->preds);
   s->preds = NULL;
   s->n_preds = 0;
-  if (__atomic_load_n(&s->refs, __ATOMIC_RELAXED) == 0)
+  if (s->refs == 0)
     _slot_release(s);
 }
 
@@ -1057,12 +980,11 @@ static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
 
 /* Drain a single cl. */
 static void _on_sync_drain_cl(ze_command_list_handle_t command_list) {
+  pthread_mutex_lock(&_ze_state_mutex);
   struct _ze_command_list_obj_data *cl_data = _cl_find(command_list);
-  if (!cl_data)
-    return;
-  pthread_mutex_lock(&cl_data->mtx);
-  _cl_drain(cl_data);
-  pthread_mutex_unlock(&cl_data->mtx);
+  if (cl_data)
+    _cl_drain(cl_data);
+  pthread_mutex_unlock(&_ze_state_mutex);
 }
 
 /* Release everything cl_data owns and free cl_data itself. Caller must
@@ -1081,8 +1003,8 @@ static void _on_sync_drain_cl(ze_command_list_handle_t command_list) {
  *   - chunk slabs: zeMemFree the slab when ctx is alive; skip when ctx
  *     is dying — the driver reclaims, and zeMemFree on a doomed ctx is
  *     at best racy. */
+/* Caller holds _ze_state_mutex. */
 static void _cl_data_destroy(struct _ze_command_list_obj_data *cl_data, int ctx_dying) {
-  pthread_mutex_lock(&cl_data->mtx);
   struct _ze_slab_chunk *c, *tmp;
   DL_FOREACH_SAFE(cl_data->chunks, c, tmp) {
     for (uint32_t i = 0; i < c->n_used; ++i) {
@@ -1097,8 +1019,6 @@ static void _cl_data_destroy(struct _ze_command_list_obj_data *cl_data, int ctx_
     }
     _cl_chunk_free(cl_data, c, /*free_slab=*/!ctx_dying);
   }
-  pthread_mutex_unlock(&cl_data->mtx);
-  pthread_mutex_destroy(&cl_data->mtx);
   free(cl_data);
 }
 
@@ -1111,10 +1031,11 @@ static void _cl_data_destroy(struct _ze_command_list_obj_data *cl_data, int ctx_
  * L0; immediate cls' slots have likely already been released at drain
  * time but any stragglers get cleaned up too. */
 static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
+  pthread_mutex_lock(&_ze_state_mutex);
   struct _ze_command_list_obj_data *cl_data = _cl_find_and_del(command_list);
-  if (!cl_data)
-    return;
-  _cl_data_destroy(cl_data, /*ctx_dying=*/0);
+  if (cl_data)
+    _cl_data_destroy(cl_data, /*ctx_dying=*/0);
+  pthread_mutex_unlock(&_ze_state_mutex);
 }
 
 /* zeContextDestroy prologue. The user contract is that the device is no
@@ -1133,7 +1054,7 @@ static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
  * user takes care of those (or accepts the contract). */
 static void _on_destroy_context(ze_context_handle_t hContext) {
   /* 1) Drop cls bound to this ctx. */
-  pthread_mutex_lock(&_ze_cls_mutex);
+  pthread_mutex_lock(&_ze_state_mutex);
   struct _ze_command_list_obj_data *cl_data = NULL, *cl_tmp = NULL;
   HASH_ITER(hh, _ze_cls, cl_data, cl_tmp) {
     if (cl_data->cached_context != hContext)
@@ -1141,10 +1062,8 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
     HASH_DEL(_ze_cls, cl_data);
     _cl_data_destroy(cl_data, /*ctx_dying=*/1);
   }
-  pthread_mutex_unlock(&_ze_cls_mutex);
 
   /* 2) Shadow cls keyed by (ctx, device). */
-  pthread_mutex_lock(&_ze_shadow_cls_mutex);
   struct _ze_shadow_cl *sh = NULL, *sh_tmp = NULL;
   HASH_ITER(hh, _ze_shadow_cls, sh, sh_tmp) {
     if (sh->key.context != hContext)
@@ -1152,13 +1071,10 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
     HASH_DEL(_ze_shadow_cls, sh);
     if (sh->cl)
       ZE_COMMAND_LIST_DESTROY_PTR(sh->cl);
-    pthread_mutex_destroy(&sh->mtx);
     free(sh);
   }
-  pthread_mutex_unlock(&_ze_shadow_cls_mutex);
 
   /* 3) Per-ctx event pool freelist. */
-  pthread_mutex_lock(&_ze_event_pools_mutex);
   struct _ze_event_pool_entry *pe = NULL;
   HASH_FIND_PTR(_ze_event_pools, &hContext, pe);
   if (pe) {
@@ -1174,29 +1090,28 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
     }
     free(pe);
   }
-  pthread_mutex_unlock(&_ze_event_pools_mutex);
+  pthread_mutex_unlock(&_ze_state_mutex);
 }
 
 /* Drain every cl whose in_flight_q matches. */
 static void _on_sync_drain_queue(ze_command_queue_handle_t hQueue) {
-  pthread_mutex_lock(&_ze_cls_mutex);
+  pthread_mutex_lock(&_ze_state_mutex);
   struct _ze_command_list_obj_data *cl_data = NULL, *tmp = NULL;
   HASH_ITER(hh, _ze_cls, cl_data, tmp) {
-    if (cl_data->in_flight_q == hQueue) {
-      pthread_mutex_lock(&cl_data->mtx);
+    if (cl_data->in_flight_q == hQueue)
       _cl_drain(cl_data);
-      pthread_mutex_unlock(&cl_data->mtx);
-    }
   }
-  pthread_mutex_unlock(&_ze_cls_mutex);
+  pthread_mutex_unlock(&_ze_state_mutex);
 }
 
 /* Drain the slot that most recently signaled `ev` (recursing on preds). */
 static void _on_sync_drain_event(ze_event_handle_t ev) {
+  pthread_mutex_lock(&_ze_state_mutex);
   struct _ze_slot *s = _event_latest_signaled_get(ev);
-  if (!s || !s->owner)
+  if (!s || !s->owner) {
+    pthread_mutex_unlock(&_ze_state_mutex);
     return;
-  pthread_mutex_lock(&s->owner->mtx);
+  }
   _slot_drain(s);
   /* The drained slot may have left siblings live; only clear
    * in_flight_q if nothing in this cl remains in flight. */
@@ -1213,13 +1128,13 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
   }
   if (!any_live)
     s->owner->in_flight_q = NULL;
-  pthread_mutex_unlock(&s->owner->mtx);
+  pthread_mutex_unlock(&_ze_state_mutex);
 }
 
 /* Execute-epilogue handler for ONE cl. Runs AFTER L0's actual Execute
  * has returned, with the user cl in flight on its engine.
  *
- * Three phases, all under cl_data->mtx so a concurrent Execute or Sync
+ * Three phases, all under _ze_state_mutex so a concurrent Execute or Sync
  * on another thread sees them atomically:
  *
  *   1) If in_flight_q is set from a prior Execute by *another* thread,
@@ -1237,10 +1152,12 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
  *      it to the dep graph + as the "owner" of this queue. */
 static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
                                ze_command_list_handle_t command_list) {
+  pthread_mutex_lock(&_ze_state_mutex);
   struct _ze_command_list_obj_data *cl_data = _cl_find(command_list);
-  if (!cl_data)
+  if (!cl_data) {
+    pthread_mutex_unlock(&_ze_state_mutex);
     return;
-  pthread_mutex_lock(&cl_data->mtx);
+  }
 
   if (cl_data->in_flight_q) {
     _ZE_MUST(ZE_COMMAND_QUEUE_SYNCHRONIZE_PTR(cl_data->in_flight_q, UINT64_MAX));
@@ -1279,7 +1196,7 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
   }
   cl_data->in_flight_q = hQueue;
 
-  pthread_mutex_unlock(&cl_data->mtx);
+  pthread_mutex_unlock(&_ze_state_mutex);
 }
 
 static void _on_execute_command_lists_epilogue(ze_command_queue_handle_t hQueue,
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index dd53212d..2411c701 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -304,7 +304,9 @@ def upper_snake_case(str)
   ze_context_handle_t _ctx = NULL;
   if (_do_profile) {
     if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(hCommandList, &_ctx) == ZE_RESULT_SUCCESS && _ctx) {
+      pthread_mutex_lock(&_ze_state_mutex);
       _ewrapper = _get_profiling_event(_ctx);
+      pthread_mutex_unlock(&_ze_state_mutex);
       if (_ewrapper)
         #{event_name} = _ewrapper->event;
     }

From 272ade793fd50d43c809059bf163caebc4a7a6da Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0001.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 10 Jun 2026 22:39:44 +0000
Subject: [PATCH 44/54] =?UTF-8?q?ze:=20comment=20cleanup=20=E2=80=94=20add?=
 =?UTF-8?q?=20Concurrency=20header=20section,=20trim=20verbose=20docs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a "Concurrency" section to the algorithm header that explains why a
single global mutex covers all tracer state: cross-cl pred edges in the
dep graph would force any per-cl scheme into multi-mutex acquisition
with ordering rules, and one global mutex sidesteps that entirely. The
perf trade-off is bounded because the held region is just short L0 queue ops.

With that in place, scrub per-function "Caller holds _ze_state_mutex"
boilerplate that the section comment now covers, fix the one stale
"lock cl.mtx" line in the algorithm pseudocode, and tighten the longer
docstrings on _cl_data_destroy / _on_destroy_context / _on_execute_one_cl
/ _universal_record_append / shadow-cl helpers.

Pure comments diff: -44 lines, no code change. 52/52 correctness tests
still pass.
---
 backends/ze/tracer_ze_helpers.include.c | 200 +++++++++---------------
 1 file changed, 78 insertions(+), 122 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 92ee5842..31ddcd0d 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -13,12 +13,11 @@
  *   - s.live = true; event_latest_signaled[s.attr] = &s
  *
  * On Execute(q, cl) prologue:
- *   - lock cl.mtx
  *   - if cl.in_flight_q: Synchronize(in_flight_q); drain_cl(cl)
  *   - shadow-path slots: re-Append Query on shadow cl
  *     inline-path slots: nothing (Query is baked into cl body)
  *   - instantiate every slot in cl
- *   - cl.in_flight_q = q; unlock
+ *   - cl.in_flight_q = q
  *
  * On Sync (the synced anchor tells us what to drain):
  *   - Sync(ev):  drain(event_latest_signaled[ev])
@@ -34,6 +33,27 @@
  *   (Build-time fields inj, attr, off, waits stay so the next Execute
  *    can re-instantiate without re-Appending.)
  *
+ * Concurrency
+ * ===========
+ *
+ * One global mutex (_ze_state_mutex) covers all tracer state: the cl
+ * registry, every cl's chunks/slots/preds, the event freelist + pool
+ * registry, the latest-signaled map, the shadow cl registry, the
+ * qgroup cache. Append / Execute / Drain / Destroy all take it.
+ *
+ * Per-cl mutexes don't work because drain follows cross-cl pred edges
+ * (event_latest_signaled[ev] can point at a slot in any cl) and
+ * mutates the pred's chunk via _slot_release. Any per-cl scheme has
+ * to acquire multiple cl mutexes with cross-cl ordering rules. One
+ * global mutex sidesteps that entirely.
+ *
+ * Perf: Append on different cls and freelist accesses serialize
+ * through one lock. The L0 calls inside the critical section
+ * (AppendBarrier, AppendQueryKernelTimestamps) just queue work on
+ * the GPU — the GPU executes asynchronously, so the held region is
+ * short. Drain is host-blocking (zeEventHostSynchronize on shadow
+ * fence events) and was effectively serial anyway.
+ *
  * QKT placement
  * =============
  *
@@ -163,9 +183,8 @@ struct _ze_slot {
   unsigned char live; /* in-flight (instantiated, not drained) */
   /* Incoming pred edges: count of downstream slots whose preds[] points
    * here AND that have not yet been drained. Incremented at downstream
-   * _slot_instantiate (one per pred edge), decremented at downstream
-   * _slot_drain. Slot is reclaimable iff live==0 AND refs==0. All
-   * accesses run under _ze_state_mutex, so plain increment/decrement. */
+   * _slot_instantiate, decremented at downstream _slot_drain. Slot is
+   * reclaimable iff live==0 AND refs==0. */
   uint32_t refs;
 };
 
@@ -212,24 +231,18 @@ struct _ze_command_list_obj_data {
 
   /* Cached on first use: context handle for this cl. Immutable for the
    * cl's lifetime. Load-bearing for _on_destroy_context's sweep: lets it
-   * associate cls back to their ctx without an L0 roundtrip per cl.
-   * Protected (along with everything else here) by _ze_state_mutex. */
+   * associate cls back to their ctx without an L0 roundtrip per cl. */
   ze_context_handle_t cached_context;
 };
 
 struct _ze_command_list_obj_data *_ze_cls = NULL;
 
-/* Single mutex guarding ALL per-cl tracer state: the _ze_cls registry,
- * cl_data fields, every cl's chunks/slots/preds. Append, Execute, Drain,
- * and the destroy paths all take it. The dep-graph lets a single drain
- * walk preds into other cls and mutate their chunks (n_held--, free
- * chunk), so any per-cl locking scheme would need cross-cl acquisition
- * with lock ordering — one global mutex avoids that entirely. Drain is
- * host-blocking (zeEventHostSynchronize on shadow events) anyway, so
- * the perf cost of serializing it across cls is bounded. */
+/* The single mutex covering all tracer state — see the "Concurrency"
+ * section in the file header for rationale. Every static helper in this
+ * file that touches tracer state assumes the caller holds it. */
 pthread_mutex_t _ze_state_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-/* The _cl_* helpers are pure HASH wrappers. Caller holds _ze_state_mutex. */
+/* Pure HASH wrappers. */
 static struct _ze_command_list_obj_data *_cl_find(ze_command_list_handle_t command_list) {
   struct _ze_command_list_obj_data *cl = NULL;
   HASH_FIND_PTR(_ze_cls, &command_list, cl);
@@ -262,9 +275,8 @@ struct _ze_qgroup_cache_entry {
 static struct _ze_qgroup_cache_entry *_ze_qgroup_cache = NULL;
 
 /* Populate (or return cached) flag bitmap for device. The cache lives
- * for process lifetime. Caller holds _ze_state_mutex — first-touch L0
- * queries (zeDeviceGetCommandQueueGroupProperties) happen under the
- * state mutex; cost is bounded since lookups are once per device. */
+ * for process lifetime. First-touch L0 queries happen under the state
+ * mutex; cost is bounded since lookups are once per device. */
 static struct _ze_qgroup_cache_entry *_qgroup_cache_get(ze_device_handle_t device) {
   struct _ze_qgroup_cache_entry *e = NULL;
   HASH_FIND_PTR(_ze_qgroup_cache, &device, e);
@@ -351,11 +363,9 @@ struct _ze_shadow_cl {
 static struct _ze_shadow_cl *_ze_shadow_cls = NULL;
 
 /* Returns the shadow cl for (context, device), creating it lazily on
- * first use. Returns NULL if the device has no compute group (fatal:
- * we log to stderr) or if creation fails. Caller holds _ze_state_mutex
- * (lookup, lazy create, and the L0 zeCommandListCreateImmediate call
- * all happen under it; the create is first-touch per (ctx,device) so
- * the cost is bounded). */
+ * first use (first-touch L0 zeCommandListCreateImmediate runs under
+ * the state mutex; cost bounded). Returns NULL if the device has no
+ * compute group (fatal: log to stderr) or if creation fails. */
 static struct _ze_shadow_cl *_get_shadow_cl(ze_context_handle_t context,
                                             ze_device_handle_t device) {
   struct _ze_shadow_key key = {context, device};
@@ -404,11 +414,10 @@ static struct _ze_shadow_cl *_get_shadow_cl(ze_context_handle_t context,
 }
 
 /* Append AppendQueryKernelTimestamps on the shadow cl: wait on inj,
- * signal shadow_done, write timestamps into slab[*off]. Caller holds
- * _ze_state_mutex, which also serializes the not-thread-safe-per-cl-
- * handle L0 Append on the shared shadow cl. Aborts on L0 failure
- * (defensive — a missing Query would silently drop this kernel's
- * timing). */
+ * signal shadow_done, write timestamps into slab[*off]. The state
+ * mutex also serializes the not-thread-safe-per-cl-handle L0 Append
+ * on the shared shadow cl. Aborts on L0 failure (defensive — a missing
+ * Query would silently drop this kernel's timing). */
 static void _shadow_append_query(struct _ze_shadow_cl *sh,
                                  ze_event_handle_t inj_event,
                                  void *slab,
@@ -475,9 +484,6 @@ struct _ze_event_latest_signaled_entry {
 };
 static struct _ze_event_latest_signaled_entry *_ze_event_latest_signaled = NULL;
 
-/* All three accessors below run under _ze_state_mutex (caller's
- * responsibility). */
-
 static inline struct _ze_slot *_event_latest_signaled_get(ze_event_handle_t ev) {
   struct _ze_event_latest_signaled_entry *e = NULL;
   HASH_FIND_PTR(_ze_event_latest_signaled, &ev, e);
@@ -513,12 +519,8 @@ static inline void _event_latest_signaled_clear_if(ze_event_handle_t ev, struct
   }
 }
 
-/* All four event helpers below run under _ze_state_mutex (caller's
- * responsibility). */
-
-/* Pop one recycled event wrapper from the per-context freelist; NULL if
- * none cached. Caller (today only _get_profiling_event) will fall back
- * to creating a fresh L0 event pool + event. */
+/* Pop one recycled event wrapper from the per-context freelist; NULL
+ * if none cached (caller falls back to creating a fresh L0 event). */
 static struct _ze_event_h *_get_ze_event(ze_context_handle_t context) {
   struct _ze_event_pool_entry *pool = NULL;
   HASH_FIND_PTR(_ze_event_pools, &context, pool);
@@ -580,13 +582,9 @@ static void _put_ze_event_wrapper(struct _ze_event_h *val) {
   DL_PREPEND(_ze_event_wrappers, val);
 }
 
-/* Caller-supplied ctx: every call site already has it on the stack, so
- * we'd otherwise issue an unnecessary zeCommandListGetContextHandle per
- * profiled Append (caller did the same query moments earlier).
- *
- * Caller holds _ze_state_mutex. The L0 event/pool create calls run under
- * the lock; they're cold (only when the freelist for this ctx is empty)
- * so the cost is bounded. */
+/* Caller-supplied ctx avoids a redundant zeCommandListGetContextHandle
+ * (the prologue already fetched it). L0 event/pool create runs under
+ * the state mutex; cold path, bounded cost. */
 static struct _ze_event_h *_get_profiling_event(ze_context_handle_t context) {
   struct _ze_event_h *e_w = _get_ze_event(context);
   if (e_w)
@@ -746,7 +744,7 @@ static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct
  * shadow cl; inline path is a no-op here (its QKT is baked into the user cl
  * body at Append). Then instantiate in the dep graph. `s->shadow_done` is
  * the single source of truth for "shadow vs inline" — no is_compute branch
- * at the call site. Caller holds _ze_state_mutex. */
+ * at the call site. */
 static void _slot_publish(struct _ze_command_list_obj_data *cl_data,
                           struct _ze_slot *s,
                           struct _ze_shadow_cl *sh) {
@@ -758,14 +756,11 @@ static void _slot_publish(struct _ze_command_list_obj_data *cl_data,
   _slot_instantiate(cl_data, s);
 }
 
-/* Append-time hook called from profiling_epilogue. Caller already
- * swapped user's hSignalEvent for inj->event. user_signal is the
- * ORIGINAL value (possibly NULL). user_waits is the user's wait list
- * (NULL,0 if none). ctx is fetched once in the prologue and threaded
- * in to avoid a second zeCommandListGetContextHandle.
- *
- * Forks on cl_data->is_compute to pick the QKT placement (INLINE vs
- * SHADOW) — see the "QKT placement" diagram at the top of this file. */
+/* Append-time hook from profiling_epilogue. The prologue swapped user's
+ * hSignalEvent for inj->event; user_signal is the original (possibly NULL),
+ * user_waits is the user's wait list, ctx is the cl's context (fetched
+ * once in the prologue, threaded in). Forks on cl_data->is_compute to
+ * pick the QKT placement — see "QKT placement" in the file header. */
 static void _universal_record_append(ze_command_list_handle_t command_list,
                                      ze_context_handle_t ctx,
                                      struct _ze_event_h *inj,
@@ -901,19 +896,13 @@ static void _slot_release(struct _ze_slot *s) {
 /* Drain one slot. Recurses on its preds, emits the slot's tracepoint,
  * drops one ref on each pred (releasing fully-drained-and-unreferenced
  * preds), then releases s if its own refs hit 0. Safe to call on an
- * already-drained (live=0) slot.
- *
- * Slab read uses s->chunk->slab — preds may live in another cl, so we
- * can't use the caller's slab.
- *
- * Caller holds _ze_state_mutex. That single mutex covers every cl's
- * chunks/slots, so cross-cl pred recursion is safe with no further
- * locking: Append on the pred's cl also takes _ze_state_mutex.
+ * already-drained (live=0) slot. Slab read uses s->chunk->slab — preds
+ * may live in another cl, so we can't use the caller's slab.
  *
  * No cycle guard: preds come from in-order prev (strictly earlier slot
- * in the same cl, DAG) and from event_latest_signaled[wait_event] (a slot published
- * BEFORE us). Forming a cycle would require user-declared mutual waits,
- * which L0 itself deadlocks on. */
+ * in the same cl, DAG) and from event_latest_signaled[wait_event] (a
+ * slot published BEFORE us). A cycle would need user-declared mutual
+ * waits, which L0 itself deadlocks on. */
 static void _slot_drain(struct _ze_slot *s) {
   if (!s || !s->live)
     return;
@@ -987,23 +976,12 @@ static void _on_sync_drain_cl(ze_command_list_handle_t command_list) {
   pthread_mutex_unlock(&_ze_state_mutex);
 }
 
-/* Release everything cl_data owns and free cl_data itself. Caller must
- * have already removed cl_data from the global _ze_cls hash (single-cl:
- * _cl_find_and_del; per-ctx sweep: HASH_DEL inside the iter), since the
- * two callers find their cls differently.
- *
- * ctx_dying changes how we dispose of two ctx-scoped resources:
- *
- *   - per-slot event wrappers: when ctx is alive, recycle them to the
- *     per-context event pool via _put_ze_event so a later cl on the same
- *     ctx can grab them. When ctx is dying, the pool itself is about to
- *     be wiped in _on_destroy_context step 3 (which will destroy the L0
- *     event/pool too), so we only recycle the wrapper struct.
- *
- *   - chunk slabs: zeMemFree the slab when ctx is alive; skip when ctx
- *     is dying — the driver reclaims, and zeMemFree on a doomed ctx is
- *     at best racy. */
-/* Caller holds _ze_state_mutex. */
+/* Release everything cl_data owns and free cl_data itself. Caller has
+ * already removed cl_data from _ze_cls (single-cl: _cl_find_and_del;
+ * per-ctx sweep: HASH_DEL inside the iter). When ctx is dying we just
+ * recycle wrapper structs (the L0 event/pool will be destroyed in
+ * _on_destroy_context step 3) and skip zeMemFree on the slab (the
+ * driver reclaims, and zeMemFree on a doomed ctx is racy). */
 static void _cl_data_destroy(struct _ze_command_list_obj_data *cl_data, int ctx_dying) {
   struct _ze_slab_chunk *c, *tmp;
   DL_FOREACH_SAFE(cl_data->chunks, c, tmp) {
@@ -1022,14 +1000,10 @@ static void _cl_data_destroy(struct _ze_command_list_obj_data *cl_data, int ctx_
   free(cl_data);
 }
 
-/* zeCommandListDestroy epilogue. The L0 spec says the user must have
- * ensured the device is no longer referencing the cl, so we don't drain
- * (the GPU is already idle on this cl). We just release our state.
- *
- * Works for both cl kinds: regular cls (inj baked into the cl body)
- * can recycle inj here because the cl body is about to be destroyed by
- * L0; immediate cls' slots have likely already been released at drain
- * time but any stragglers get cleaned up too. */
+/* zeCommandListDestroy epilogue. Per L0 spec the device is no longer
+ * referencing the cl, so we don't drain — just release our state.
+ * Regular cls recycle inj here (cl body is about to die anyway);
+ * immediate cls' slots are typically already released at drain. */
 static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
   pthread_mutex_lock(&_ze_state_mutex);
   struct _ze_command_list_obj_data *cl_data = _cl_find_and_del(command_list);
@@ -1038,20 +1012,9 @@ static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
   pthread_mutex_unlock(&_ze_state_mutex);
 }
 
-/* zeContextDestroy prologue. The user contract is that the device is no
- * longer referencing the context, so all cls/events bound to it are
- * conceptually dead from the user's perspective. Our job here is solely
- * to avoid leaking our own L0 objects that live inside this context:
- *
- *   1) cls registered against this ctx: free their slot/slab/chunk state
- *      (drop tracer-owned events to L0 without re-pooling — the pool is
- *      about to die anyway).
- *   2) per-(ctx, device) shadow cls: zeCommandListDestroy them.
- *   3) per-ctx event-pool freelist: zeEventDestroy + zeEventPoolDestroy
- *      each wrapper, recycle the wrapper structs.
- *
- * Forwards no calls about the user's own cls/events to the driver — the
- * user takes care of those (or accepts the contract). */
+/* zeContextDestroy prologue. Three sweeps to drop our own L0 objects
+ * that live inside this ctx; the user's own cls/events are their
+ * responsibility per the L0 contract. */
 static void _on_destroy_context(ze_context_handle_t hContext) {
   /* 1) Drop cls bound to this ctx. */
   pthread_mutex_lock(&_ze_state_mutex);
@@ -1131,25 +1094,18 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
   pthread_mutex_unlock(&_ze_state_mutex);
 }
 
-/* Execute-epilogue handler for ONE cl. Runs AFTER L0's actual Execute
- * has returned, with the user cl in flight on its engine.
- *
- * Three phases, all under _ze_state_mutex so a concurrent Execute or Sync
- * on another thread sees them atomically:
+/* Execute-epilogue handler for ONE cl. Runs AFTER L0 Execute returned,
+ * with the user cl in flight. Three phases:
  *
- *   1) If in_flight_q is set from a prior Execute by *another* thread,
- *      force-sync that queue and drain the slab before we overwrite it
- *      (regression test: inorder_reg_Event_multithreaded_01 — same cl on two queues
- *      from two threads, expect both rounds' timings).
- *   2) (SHADOW PATH ONLY) Append a fresh Query on the per-(ctx,device)
- *      shadow cl for each slot. Must run AFTER L0 Execute (not before) —
- *      Appending on the shadow cl before the user cl is in flight
- *      deadlocks when the shadow shares the engine with the user cl
- *      (see tests/bugs/query_on_separate_cl_regular_user_cl). Inline
- *      (compute) cls have the QKT baked into the cl body at Append; it
- *      re-fires automatically on every Execute, no work here.
- *   3) Stamp in_flight_q = hQueue and instantiate each slot, publishing
- *      it to the dep graph + as the "owner" of this queue. */
+ *   1) If in_flight_q is set (prior Execute by another thread),
+ *      force-sync that queue and drain before we overwrite it.
+ *      Regression test: inorder_reg_Event_multithreaded_01.
+ *   2) Shadow-path slots: Append a fresh Query on the per-(ctx,device)
+ *      shadow cl. Must run AFTER L0 Execute — appending earlier
+ *      deadlocks if the shadow shares an engine with the user cl
+ *      (tests/bugs/query_on_separate_cl_regular_user_cl). Inline-path
+ *      cls bake the QKT into the cl body at Append, no work here.
+ *   3) Stamp in_flight_q = hQueue and instantiate each slot. */
 static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
                                ze_command_list_handle_t command_list) {
   pthread_mutex_lock(&_ze_state_mutex);

From 1b8d41e6619bab666965f432dc323358693e2004 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0002.head.cm.sunspot.alcf.anl.gov>
Date: Fri, 12 Jun 2026 16:13:21 +0000
Subject: [PATCH 45/54] ze: attribute device-profiling results from
 AppendSignalEvent correctly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

zeCommandListAppendSignalEvent's payload field is `hEvent`, not
`hSignalEvent`, so it never qualifies for the hSignalEvent_* matching
sets in btx_zematching_model.yaml and hSignalEvent_rest_entry_callback
never fires for it. threadToLastLaunchInfo therefore retains whatever
prior Append last populated it (typically MemoryFill / Kernel / Memcpy
on the same thread). The next event_profiling — which IS emitted for
AppendSignalEvent — pushes a ring entry tagged with that stale
commandName, and the downstream device tally counts the signal's
profiling result as one more MemoryFill/Kernel/etc.

Symptom in the wild: a trace with 12000 host AppendMemoryFill calls
reports 16002 device MemoryFill(D) — the extra 4002 are AppendSignalEvent
results mis-attributed to the prior fill on each of two reused events.

Add a dedicated zeCommandListAppendSignalEvent_entry_callback that
refreshes threadToLastLaunchInfo with the real command name and a new
btx_event_t::SIGNAL tag, then short-circuit event_profiling_result_callback
on SIGNAL so no device-tally record is emitted (AppendSignalEvent only
signals an event from the command list; it has no GPU work to time). The
ring entry itself is still pushed and the cursor still advances, so
subsequent profiling results for the same event handle land on the correct slot.

Verified against the full bats correctness suite (53/53) and a new
reproducer in thapi_ze_test (inorder_imm_Event_08).
---
 backends/ze/btx_zeinterval_callbacks.cpp | 37 ++++++++++++++++++++----
 backends/ze/btx_zeinterval_callbacks.hpp |  4 ++-
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/backends/ze/btx_zeinterval_callbacks.cpp b/backends/ze/btx_zeinterval_callbacks.cpp
index 2e2d9716..2a1980e8 100644
--- a/backends/ze/btx_zeinterval_callbacks.cpp
+++ b/backends/ze/btx_zeinterval_callbacks.cpp
@@ -559,6 +559,20 @@ static void hSignalEvent_rest_entry_callback(void *btx_handle,
       hCommandList, name, ts, btx_event_t::OTHER, {}};
 }
 
+static void zeCommandListAppendSignalEvent_entry_callback(void *btx_handle,
+                                                          void *usr_data,
+                                                          int64_t ts,
+                                                          const char *hostname,
+                                                          int64_t vpid,
+                                                          uint64_t vtid,
+                                                          ze_command_list_handle_t hCommandList,
+                                                          ze_event_handle_t hEvent) {
+  (void)hEvent;
+  auto *data = static_cast<data_t *>(usr_data);
+  data->threadToLastLaunchInfo[{hostname, vpid, vtid}] = {
+      hCommandList, "zeCommandListAppendSignalEvent", ts, btx_event_t::SIGNAL, {}};
+}
+
 /*
  *             _                              _                   _
  *     _   _  /   _  ._ _  ._ _   _. ._   _| / \      _       _  |_     _   _    _|_  _
@@ -835,11 +849,8 @@ static void event_profiling_callback(void *btx_handle,
     ring.entries.clear();
     ring.cursor = 0;
   }
-  ring.entries.push_back({vtid,         commandQueueDesc,
-                          hCommandList, hCommandListIsImmediate,
-                          hDevice,      commandName,
-                          ts_min,       clockLttngDevice,
-                          type,         ptr});
+  ring.entries.push_back({vtid, commandQueueDesc, hCommandList, hCommandListIsImmediate, hDevice,
+                          commandName, ts_min, clockLttngDevice, type, ptr});
   // Prepare job for non IMM
   if (!hCommandListIsImmediate)
     data->commandListToEvents[{hostname, vpid, hCommandList}].insert(hEvent);
@@ -896,7 +907,8 @@ static void event_profiling_result_callback(void *btx_handle,
   if (it_p == data->eventToBtxDesct.cend() || it_p->second.entries.empty())
     return;
   auto &ring = it_p->second;
-  if (ring.cursor >= ring.entries.size()) ring.cursor = 0;
+  if (ring.cursor >= ring.entries.size())
+    ring.cursor = 0;
   const auto &[vtid_submission, commandQueueDesc, hCommandList, hCommandListIsImmediate, device,
                commandName, lltngMin, clockLttngDevice, type, ptr] = ring.entries[ring.cursor];
   ring.cursor++;
@@ -913,6 +925,13 @@ static void event_profiling_result_callback(void *btx_handle,
   if (!hCommandListIsImmediate)
     data->commandListToEvents[{hostname, vpid, hCommandList}].erase(hEvent);
 
+  /* AppendSignalEvent only signals an event from the command list; it
+   * carries no GPU work to time. We pushed a ring entry to keep state
+   * consistent (so a future profiling_results lookup doesn't walk a
+   * stale prior entry), but suppress the device-side tally emission here. */
+  if (type == btx_event_t::SIGNAL)
+    return;
+
   if ((type == btx_event_t::TRAFFIC) && (status == ZE_RESULT_SUCCESS)) {
     auto &[ts, size] = std::get<btx_additional_info_traffic_t>(ptr);
     btx_push_message_lttng_traffic(btx_handle, hostname, vpid, vtid, ts, BACKEND_ZE,
@@ -1412,6 +1431,12 @@ void btx_register_usr_callbacks(void *btx_handle) {
   REGISTER_ASSOCIATED_CALLBACK(eventMemory_without_hSignalEvent_exit);
   REGISTER_ASSOCIATED_CALLBACK(hSignalEvent_rest_entry);
 
+  /* zeCommandListAppendSignalEvent doesn't match the hSignalEvent_* sets
+   * (payload is `hEvent`, not `hSignalEvent`), so it needs its own entry
+   * callback to keep threadToLastLaunchInfo from going stale. */
+  btx_register_callbacks_lttng_ust_ze_zeCommandListAppendSignalEvent_entry(
+      btx_handle, &zeCommandListAppendSignalEvent_entry_callback);
+
   /* Remove Memory */
   REGISTER_ASSOCIATED_CALLBACK(memFree_entry);
   REGISTER_ASSOCIATED_CALLBACK(memFree_exit);
diff --git a/backends/ze/btx_zeinterval_callbacks.hpp b/backends/ze/btx_zeinterval_callbacks.hpp
index 165e6c13..a6cdb0e0 100644
--- a/backends/ze/btx_zeinterval_callbacks.hpp
+++ b/backends/ze/btx_zeinterval_callbacks.hpp
@@ -55,7 +55,9 @@ using btx_kernel_group_size_t = std::tuple<uint32_t, uint32_t, uint32_t>;
 using btx_kernel_desct_t =
     std::tuple<std::string /*ze_kernel_desc_t*/, ze_kernel_properties_t, btx_kernel_group_size_t>;
 
-enum class btx_event_t { TRAFFIC, KERNEL, OTHER };
+// SIGNAL = zeCommandListAppendSignalEvent. Ring entry is created so state
+// stays consistent, but filtered out of the device tally (no GPU work).
+enum class btx_event_t { TRAFFIC, KERNEL, SIGNAL, OTHER };
 using btx_additional_info_traffic_t = std::tuple<int64_t /*ts*/, size_t /*size*/>;
 using btx_additional_info_kernel_t = std::string /*metadata*/;
 using btx_additional_info =

From a898759df31e4aaa88f79003071770fe239fa640 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0002.head.cm.sunspot.alcf.anl.gov>
Date: Fri, 12 Jun 2026 21:49:05 +0000
Subject: [PATCH 46/54] ze tests: document ring/cursor scenarios; drop
 unreachable _fast test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The four shared_event / resubmit BTX tests are synthetic inputs fed
straight into the interval filter — they don't include Create/Execute
because the test_wrapper converter can't synthesize struct values, and
that omission makes the trace read as physically impossible (4 results
from 2 Appends with no Execute in between). Add a short header to each
explaining what ring/cursor branch it exercises.

interval_profiling_fast assumed event_profiling_results could arrive
between an Append's _entry and _exit tracepoints. With the new tracer,
results are only emitted at sync time, so that interleaving is no longer
reachable — delete the test and drop it from TRACE_COMMON.

To allow header comments in the .thapi_text_pretty files, teach
thapi_log_to_bt_source_component.rb to skip blank/# lines.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 backends/ze/Makefile.am                                     | 1 -
 backends/ze/tests/interval_profiling_fast.bt_text_pretty    | 2 --
 backends/ze/tests/interval_profiling_fast.thapi_text_pretty | 4 ----
 .../interval_profiling_resubmit_event.thapi_text_pretty     | 3 +++
 .../tests/interval_profiling_shared_event.thapi_text_pretty | 2 ++
 ...terval_profiling_shared_event_resubmit.thapi_text_pretty | 3 +++
 ...interval_profiling_shared_event_xphase.thapi_text_pretty | 6 ++++++
 utils/thapi_log_to_bt_source_component.rb                   | 5 ++++-
 8 files changed, 18 insertions(+), 8 deletions(-)
 delete mode 100644 backends/ze/tests/interval_profiling_fast.bt_text_pretty
 delete mode 100644 backends/ze/tests/interval_profiling_fast.thapi_text_pretty

diff --git a/backends/ze/Makefile.am b/backends/ze/Makefile.am
index d0ecaf54..942c0947 100644
--- a/backends/ze/Makefile.am
+++ b/backends/ze/Makefile.am
@@ -278,7 +278,6 @@ TRACE_COMMON = \
 	tests/interval_profiling_normal.thapi_text_pretty \
 	tests/interval_profiling_multithread.thapi_text_pretty \
 	tests/interval_profiling_API_call.thapi_text_pretty \
-	tests/interval_profiling_fast.thapi_text_pretty \
 	tests/interval_profiling_interleave_process.thapi_text_pretty \
 	tests/interval_profiling_ignore.thapi_text_pretty \
 	tests/interval_profiling_shared_event.thapi_text_pretty \
diff --git a/backends/ze/tests/interval_profiling_fast.bt_text_pretty b/backends/ze/tests/interval_profiling_fast.bt_text_pretty
deleted file mode 100644
index 3403ebcd..00000000
--- a/backends/ze/tests/interval_profiling_fast.bt_text_pretty
+++ /dev/null
@@ -1,2 +0,0 @@
-lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
-lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, err = false }
diff --git a/backends/ze/tests/interval_profiling_fast.thapi_text_pretty b/backends/ze/tests/interval_profiling_fast.thapi_text_pretty
deleted file mode 100644
index fb6f10a7..00000000
--- a/backends/ze/tests/interval_profiling_fast.thapi_text_pretty
+++ /dev/null
@@ -1,4 +0,0 @@
-12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
-12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x1000000000000000 }
-12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x1000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 }
-12:00:00.030000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
diff --git a/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty b/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty
index b7898be0..a2b2fca2 100644
--- a/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty
+++ b/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty
@@ -1,3 +1,6 @@
+# 1 Append, but the underlying cl is Executed twice in a real run, so
+# 2 result tracepoints arrive for the same hEvent. The ring has 1 entry; cursor
+# walks 0,(wrap)0 so both results are attributed to that single entry.
 12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
 12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
 12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
diff --git a/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty
index 341d8d3b..e38b4f2d 100644
--- a/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty
+++ b/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty
@@ -1,3 +1,5 @@
+# 4 Appends share one hEvent, all results consumed in submission
+# order. Exercises ring growth without cursor wrap.
 12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
 12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
 12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
diff --git a/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty
index c6acc6e5..6abef300 100644
--- a/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty
+++ b/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty
@@ -1,3 +1,6 @@
+# 2 Appends share one hEvent, then the underlying cl is Executed
+# twice, so 4 result tracepoints arrive. ring.entries = [E1, E2]; cursor walks
+# 0,1,(wrap)0,1 to deal them out in submission order across both submissions.
 12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
 12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
 12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
diff --git a/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty
index c69025a0..287a7ca3 100644
--- a/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty
+++ b/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty
@@ -1,3 +1,9 @@
+# Two build phases on the same cl, both reusing the same hEvent.
+# Phase 1: 2 Appends, cl Executed twice -> 4 results, cursor wraps 0,1,0,1.
+# Phase 2: cl is Reset and 1 fresh Append happens. The next event_profiling
+# arrives with cursor > 0, which triggers the "clear+reset cursor" branch
+# (btx_zeinterval_callbacks.cpp:847-851) so the new Append starts a fresh ring
+# instead of appending to the stale phase-1 entries. 1 Execute -> 2 results.
 12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
 12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
 12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
diff --git a/utils/thapi_log_to_bt_source_component.rb b/utils/thapi_log_to_bt_source_component.rb
index 3e16753f..f27fe412 100755
--- a/utils/thapi_log_to_bt_source_component.rb
+++ b/utils/thapi_log_to_bt_source_component.rb
@@ -146,7 +146,10 @@ def parse_event(model, line, exclude_fields)
 
 def parse_log(model, input_path, exclude_fields)
   File.open(input_path, 'r') do |file|
-    file.each_line.map do |line|
+    file.each_line.filter_map do |line|
+      stripped = line.strip
+      next if stripped.empty? || stripped.start_with?('#')
+
       parse_event(model, line, exclude_fields)
     end
   end

From 72c2703a0a39da113b725f192f0ea771952f29df Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0002.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 17 Jun 2026 15:43:59 +0000
Subject: [PATCH 47/54] ze tests: rewrite ring fixture headers to describe
 observable behavior

The four shared_event / resubmit BTX fixture headers leaked consumer
internals (ring, cursor, entries, a clear+reset branch name, and a
hard-coded source line number that was already off-by-one). Rewrite each
to describe only what is observable: shared events, resubmissions, and
results attributed to their Appends in submission order. Also fix the
xphase header's result count (phase 2 is 3 results, not 2).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../interval_profiling_resubmit_event.thapi_text_pretty  | 4 ++--
 .../interval_profiling_shared_event.thapi_text_pretty    | 4 ++--
 ...val_profiling_shared_event_resubmit.thapi_text_pretty | 6 +++---
 ...erval_profiling_shared_event_xphase.thapi_text_pretty | 9 ++++-----
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty b/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty
index a2b2fca2..b4c3ca9b 100644
--- a/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty
+++ b/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty
@@ -1,6 +1,6 @@
 # 1 Append, but the underlying cl is Executed twice in a real run, so
-# 2 result tracepoints arrive for the same hEvent. The ring has 1 entry; cursor
-# wraps 0 -> 1 -> 0 -> 1 across the 2 results.
+# 2 results arrive for the same hEvent. Both are attributed to that one
+# Append.
 12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
 12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
 12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
diff --git a/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty
index e38b4f2d..64199d25 100644
--- a/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty
+++ b/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty
@@ -1,5 +1,5 @@
-# 4 Appends share one hEvent, all results consumed in submission
-# order. Exercises ring growth without cursor wrap.
+# 4 Appends share one hEvent. Each Append's result is attributed back to
+# its own Append, in submission order.
 12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
 12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
 12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
diff --git a/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty
index 6abef300..fb64b5d7 100644
--- a/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty
+++ b/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty
@@ -1,6 +1,6 @@
-# 2 Appends share one hEvent, then the underlying cl is Executed
-# twice, so 4 result tracepoints arrive. ring.entries = [E1, E2]; cursor walks
-# 0,1,(wrap)0,1 to deal them out in submission order across both submissions.
+# 2 Appends share one hEvent, then the underlying cl is Executed twice,
+# so 4 results arrive. Each submission's pair of results is attributed to
+# the two Appends in submission order.
 12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
 12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
 12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
diff --git a/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty
index 287a7ca3..e9d336d8 100644
--- a/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty
+++ b/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty
@@ -1,9 +1,8 @@
 # Two build phases on the same cl, both reusing the same hEvent.
-# Phase 1: 2 Appends, cl Executed twice -> 4 results, cursor wraps 0,1,0,1.
-# Phase 2: cl is Reset and 1 fresh Append happens. The next event_profiling
-# arrives with cursor > 0, which triggers the "clear+reset cursor" branch
-# (btx_zeinterval_callbacks.cpp:847-851) so the new Append starts a fresh ring
-# instead of appending to the stale phase-1 entries. 1 Execute -> 2 results.
+# Phase 1: 2 Appends, cl Executed twice -> 4 results.
+# Phase 2: cl is Reset, then 1 Append, cl Executed three times -> 3 results.
+# Results from a phase are attributed only to that phase's Appends; the
+# phase-1 Appends do not bleed into the phase-2 results.
 12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
 12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
 12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }

From a7dffd7009401e103d721c63f909ed37b75d6c42 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0002.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 17 Jun 2026 15:44:08 +0000
Subject: [PATCH 48/54] ze: drain profiling slots on fence-only sync

A program that Executed a regular cl with a fence and waited only via
zeFenceHostSynchronize lost all its profiling results: drain ran from the
queue / event / cl-host sync hooks but not from fence sync, so that anchor
never triggered a drain and the slots were freed un-emitted at teardown.

Stamp the Execute's fence onto each cl (in_flight_fence, alongside the
existing in_flight_q) and add an _on_sync_drain_fence hook on
zeFenceHostSynchronize that drains every cl whose fence matches. The fence
signals when all cls in its Execute complete, so it is a valid drain anchor
for exactly those cls. zeFenceQueryStatus is intentionally not hooked (a
non-blocking poll could race a still-building reuse). Cleared together with
in_flight_q wherever a cl is drained.

Measured on PVC (8 fills): before, fence-sync gave results=0; after,
results=8, matching the queue-sync control.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 backends/ze/tracer_ze_helpers.include.c | 47 ++++++++++++++++++++-----
 backends/ze/ze_model.rb                 | 14 ++++++--
 2 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 31ddcd0d..86e62376 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -220,6 +220,12 @@ struct _ze_command_list_obj_data {
    *
    * Held only for regular cls; immediate cls never Execute. */
   ze_command_queue_handle_t in_flight_q;
+  /* The fence (if any) passed to that same Execute. NULL when the user
+   * Executed without a fence. Lets a fence-only sync find which cls to
+   * drain — the fence signals when all cls in its Execute complete, so
+   * zeFenceHostSynchronize(f) drains every cl whose in_flight_fence == f.
+   * Set on Execute alongside in_flight_q, cleared together on drain. */
+  ze_fence_handle_t in_flight_fence;
   unsigned char is_immediate;
   unsigned char is_in_order;
   /* 1 if this cl's queue group exposes COMPUTE — its body can host
@@ -965,6 +971,7 @@ static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
       _cl_chunk_free(cl_data, c, /*free_slab=*/1);
   }
   cl_data->in_flight_q = NULL;
+  cl_data->in_flight_fence = NULL;
 }
 
 /* Drain a single cl. */
@@ -1067,6 +1074,20 @@ static void _on_sync_drain_queue(ze_command_queue_handle_t hQueue) {
   pthread_mutex_unlock(&_ze_state_mutex);
 }
 
+/* Drain every cl whose in_flight_fence matches. A fence signals when all
+ * cls submitted in its Execute have completed, so waiting on the fence is
+ * a valid drain anchor for exactly those cls — the same role hQueue plays
+ * for zeCommandQueueSynchronize. */
+static void _on_sync_drain_fence(ze_fence_handle_t hFence) {
+  pthread_mutex_lock(&_ze_state_mutex);
+  struct _ze_command_list_obj_data *cl_data = NULL, *tmp = NULL;
+  HASH_ITER(hh, _ze_cls, cl_data, tmp) {
+    if (cl_data->in_flight_fence == hFence)
+      _cl_drain(cl_data);
+  }
+  pthread_mutex_unlock(&_ze_state_mutex);
+}
+
 /* Drain the slot that most recently signaled `ev` (recursing on preds). */
 static void _on_sync_drain_event(ze_event_handle_t ev) {
   pthread_mutex_lock(&_ze_state_mutex);
@@ -1077,7 +1098,7 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
   }
   _slot_drain(s);
   /* The drained slot may have left siblings live; only clear
-   * in_flight_q if nothing in this cl remains in flight. */
+   * in_flight_q / in_flight_fence if nothing in this cl remains in flight. */
   int any_live = 0;
   struct _ze_slab_chunk *c;
   DL_FOREACH(s->owner->chunks, c) {
@@ -1089,8 +1110,10 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
     if (any_live)
       break;
   }
-  if (!any_live)
+  if (!any_live) {
     s->owner->in_flight_q = NULL;
+    s->owner->in_flight_fence = NULL;
+  }
   pthread_mutex_unlock(&_ze_state_mutex);
 }
 
@@ -1100,13 +1123,17 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
  *   1) If in_flight_q is set (prior Execute by another thread),
  *      force-sync that queue and drain before we overwrite it.
  *      Regression test: inorder_reg_Event_multithreaded_01.
- *   2) Shadow-path slots: Append a fresh Query on the per-(ctx,device)
- *      shadow cl. Must run AFTER L0 Execute — appending earlier
- *      deadlocks if the shadow shares an engine with the user cl
- *      (tests/bugs/query_on_separate_cl_regular_user_cl). Inline-path
- *      cls bake the QKT into the cl body at Append, no work here.
- *   3) Stamp in_flight_q = hQueue and instantiate each slot. */
+ *   2) Publish each not-yet-live slot (_slot_publish): shadow-path slots
+ *      Append a fresh Query on the per-(ctx,device) shadow cl, then every
+ *      slot is instantiated into the dep graph. The Append must run AFTER
+ *      L0 Execute — appending earlier deadlocks if the shadow shares an
+ *      engine with the user cl (tests/bugs/query_on_separate_cl_regular_user_cl).
+ *      Inline-path cls bake the QKT into the cl body at Append, so their
+ *      publish is instantiate-only.
+ *   3) Stamp in_flight_q = hQueue and in_flight_fence = hFence (the fence
+ *      the user passed to this Execute, or NULL). */
 static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
+                               ze_fence_handle_t hFence,
                                ze_command_list_handle_t command_list) {
   pthread_mutex_lock(&_ze_state_mutex);
   struct _ze_command_list_obj_data *cl_data = _cl_find(command_list);
@@ -1151,15 +1178,17 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
     }
   }
   cl_data->in_flight_q = hQueue;
+  cl_data->in_flight_fence = hFence;
 
   pthread_mutex_unlock(&_ze_state_mutex);
 }
 
 static void _on_execute_command_lists_epilogue(ze_command_queue_handle_t hQueue,
+                                               ze_fence_handle_t hFence,
                                                uint32_t numCommandLists,
                                                ze_command_list_handle_t *phCommandLists) {
   for (uint32_t i = 0; i < numCommandLists; ++i)
-    _on_execute_one_cl(hQueue, phCommandLists[i]);
+    _on_execute_one_cl(hQueue, hFence, phCommandLists[i]);
 }
 
 static pthread_once_t _init = PTHREAD_ONCE_INIT;
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index 2411c701..b168d5dc 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -193,7 +193,7 @@ def upper_snake_case(str)
 # shadow Query op holds the engine, deadlocking the user cl.
 register_epilogue 'zeCommandQueueExecuteCommandLists', <<EOF
   if (_do_profile && _retval == ZE_RESULT_SUCCESS && numCommandLists > 0 && phCommandLists)
-    _on_execute_command_lists_epilogue(hCommandQueue, numCommandLists, phCommandLists);
+    _on_execute_command_lists_epilogue(hCommandQueue, hFence, numCommandLists, phCommandLists);
 EOF
 
 # Sync hooks: walk dependency edges from the synced anchor and drain
@@ -213,8 +213,16 @@ def upper_snake_case(str)
     _on_sync_drain_cl(hCommandList);
 EOF
 
-# Fence sync: deferred (would need a fence->queue map). The tests using
-# fences (m_fence_sync) don't exist in the new matrix yet.
+# Fence sync: the fence the user passed to Execute is stamped onto each cl
+# (in_flight_fence), so a fence wait drains exactly the cls that Execute
+# submitted. zeFenceQueryStatus is NOT hooked: it's a non-blocking poll, so
+# a SUCCESS return means the work is done but we can't assume the user is
+# finished issuing — draining there could race a still-building reuse. The
+# blocking zeFenceHostSynchronize is the safe anchor.
+register_epilogue 'zeFenceHostSynchronize', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && hFence)
+    _on_sync_drain_fence(hFence);
+EOF
 
 register_prologue 'zeEventPoolCreate', <<EOF
   ze_event_pool_desc_t _new_desc;

From 8a48098d52437e2f39c59625df7927683034d844 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0002.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 17 Jun 2026 18:47:55 +0000
Subject: [PATCH 49/54] ze: warn once when a regular cl exceeds its
 profiled-Append cap

Regular cls store profiling slots in a single 64-slot chunk (inj events are
baked into the closed cl body, so storage can't move). Appends past the cap
were dropped silently: the kernel ran and the issued tracepoint fired, but
no result tracepoint ever followed. Warn once (guarded, under the state
mutex) so the data loss is visible instead of silent. A full fix needs
multi-chunk regular-cl storage and is left as follow-up.

Confirmed on PVC: 100 Appends in one build -> 64 results; warning now fires.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 backends/ze/tracer_ze_helpers.include.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 86e62376..5c6072a5 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -672,8 +672,20 @@ static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_dat
                                         uint32_t n_waits) {
   struct _ze_slab_chunk *tail = cl_data->chunks ? cl_data->chunks->prev : NULL;
   if (!tail || tail->n_used >= _ZE_SLAB_CHUNK_SLOTS) {
-    if (tail && !cl_data->is_immediate)
+    if (tail && !cl_data->is_immediate) {
+      /* Regular cl is capped at one chunk (inj events are baked into the
+       * closed cl body, so storage can't move). Past the cap the Append's
+       * profiling is dropped — warn once per process (the guard is static,
+       * not per cl). Called under _ze_state_mutex, so the guard is safe. */
+      static int warned = 0;
+      if (!warned) {
+        warned = 1;
+        _THAPI_LOG("warning: regular command list %p exceeded %d profiled "
+                   "Appends in one build; further Appends will not be timed",
+                   (void *)cl_data->ptr, _ZE_SLAB_CHUNK_SLOTS);
+      }
       return NULL;
+    }
     tail = _cl_chunk_alloc(cl_data, ctx);
     if (!tail)
       return NULL;

From 3a8a41b46d4a16dbab0433148f2e5573a6d54d39 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0002.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 17 Jun 2026 21:00:22 +0000
Subject: [PATCH 50/54] ze: reclaim slot state on cl reset/reuse; fix cross-cl
 UAF; cap driver QKT leak
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three related fixes for command-list reuse:

1. zeCommandListReset hook (_on_reset_command_list). Regular cls never
   reclaim slots at drain (_slot_release is a no-op for them — their inj is
   baked into the cl body for cross-Execute reuse). Without a Reset hook the
   stale slots were re-published on the next Execute: a regular cl reused
   across Reset over-counted massively (80 rounds -> 3104 results). The hook
   drains defensively then reclaims, keeping cl_data registered and empty.

2. Cross-cl use-after-free in teardown (reset AND pre-existing destroy). A
   drained slot may still be a pred of a LIVE slot in another cl (refs>0);
   freeing its chunk dangled that preds[] pointer. Deterministically
   reproduced (quarantine probe, 3/3) on reset and on zeCommandListDestroy.
   Fix: _cl_chunk_reclaim frees a chunk only when no slot has refs>0; else it
   DETACHES the chunk (unlink from cl, null each slot's owner, free slab,
   set n_pinned) and the downstream drain frees the bare struct when the last
   ref drops (detached branch in _slot_release). ctx-dying destroy still
   frees wholesale (no slot outlives the ctx).

3. Driver per-QKT storage growth on a long-lived reused IMMEDIATE cl
   (~795 KB/round). The driver only reclaims QKT storage at Reset/Destroy of
   the cl. _imm_reset_if_drained raw-resets a fully-drained immediate cl on
   sync (untraced; safe — all work complete) and reclaims our bookkeeping.

Verified on PVC: reg_reset_reappend 80/80; cross-cl UAF probes clean post-fix;
mem_persistent_cl 159 MB -> 96 bytes (flat); correctness suite 57/57.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 backends/ze/tracer_ze_helpers.include.c | 141 +++++++++++++++++++++++-
 backends/ze/ze_model.rb                 |  16 ++-
 2 files changed, 147 insertions(+), 10 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 5c6072a5..ed36284d 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -204,6 +204,14 @@ struct _ze_slab_chunk {
   ze_context_handle_t slab_ctx; /* context the slab was allocated against (zeMemFree target) */
   uint32_t n_used;              /* slots ever assigned in this chunk (monotonic until chunk free) */
   uint32_t n_held;              /* unreleased slots (n_used minus _slot_release calls) */
+  /* Nonzero only on a DETACHED chunk: one whose owning cl was torn down
+   * (reset/destroy) while >=1 slot was still referenced as a pred by a live
+   * slot in ANOTHER cl. The chunk is removed from cl_data->chunks, its slots'
+   * resources are already released and owner==NULL — only the struct survives
+   * so the referrers' preds[] pointers stay valid. n_pinned counts those
+   * surviving referenced slots; the downstream drain that drops the last ref
+   * frees the struct. 0 for normal attached chunks. */
+  uint32_t n_pinned;
   struct _ze_slab_chunk *next, *prev;
   struct _ze_slot slots[_ZE_SLAB_CHUNK_SLOTS];
 };
@@ -887,7 +895,20 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
  * tail, unlink and free it. Regular cls are skipped (their inj is
  * baked into the cl body — reclaim happens at cl destroy instead). */
 static void _slot_release(struct _ze_slot *s) {
-  if (!s || !s->owner || !s->owner->is_immediate)
+  if (!s)
+    return;
+  /* Detached slot: its owning cl was torn down (reset/destroy) while this
+   * slot was still a pred of a live slot elsewhere. Its resources were freed
+   * at reclaim and owner was nulled; the chunk struct was kept alive only to
+   * keep this slot's refs addressable. We are the downstream drain dropping
+   * the last ref — drop the chunk's pin and free the bare struct at zero. */
+  if (!s->owner && s->chunk && s->chunk->n_pinned) {
+    struct _ze_slab_chunk *c = s->chunk;
+    if (--c->n_pinned == 0)
+      free(c);
+    return;
+  }
+  if (!s->owner || !s->owner->is_immediate)
     return;
   if (s->inj) {
     _put_ze_event(s->inj);
@@ -986,35 +1007,124 @@ static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
   cl_data->in_flight_fence = NULL;
 }
 
+static void _cl_data_reset(struct _ze_command_list_obj_data *cl_data); /* fwd */
+
+/* Immediate cls only: once every slot in the cl is drained, raw-Reset the
+ * user's cl so the L0 driver reclaims its per-QKT storage (it accumulates
+ * otherwise on a long-lived reused immediate cl — see bench/mem_persistent_cl),
+ * then reclaim our own slot bookkeeping (the baked state is gone after the
+ * driver reset, exactly like a user zeCommandListReset on a regular cl).
+ * Raw *_PTR = untraced; safe only when no slot is still live (no in-flight
+ * work). Called at the tail of every sync-drain path that can touch an imm cl. */
+static void _imm_reset_if_drained(struct _ze_command_list_obj_data *cl_data) {
+  if (!cl_data || !cl_data->is_immediate)
+    return;
+  struct _ze_slab_chunk *c;
+  DL_FOREACH(cl_data->chunks, c)
+    for (uint32_t i = 0; i < c->n_used; ++i)
+      if (c->slots[i].live)
+        return; /* still in-flight work — unsafe to reset */
+  ZE_COMMAND_LIST_RESET_PTR((ze_command_list_handle_t)cl_data->ptr);
+  _cl_data_reset(cl_data);
+}
+
 /* Drain a single cl. */
 static void _on_sync_drain_cl(ze_command_list_handle_t command_list) {
   pthread_mutex_lock(&_ze_state_mutex);
   struct _ze_command_list_obj_data *cl_data = _cl_find(command_list);
-  if (cl_data)
+  if (cl_data) {
     _cl_drain(cl_data);
+    _imm_reset_if_drained(cl_data);
+  }
   pthread_mutex_unlock(&_ze_state_mutex);
 }
 
+/* Reclaim one chunk during cl teardown (reset or single-cl destroy, ctx
+ * alive). Releases every slot's resources (events to pool, waits, preds,
+ * clears latest-signaled), then either frees the chunk or — if any slot is
+ * still referenced as a pred by a live slot in ANOTHER cl (refs>0) — DETACHES
+ * it: unlink from cl_data->chunks, null each slot's owner, and keep the bare
+ * struct alive with n_pinned = #referenced slots. The downstream drains that
+ * drop those refs free the struct (see _slot_release's detached branch).
+ * Without this, freeing the chunk here would dangle the referrers' preds[]. */
+static void _cl_chunk_reclaim(struct _ze_command_list_obj_data *cl_data,
+                              struct _ze_slab_chunk *c) {
+  uint32_t pinned = 0;
+  for (uint32_t i = 0; i < c->n_used; ++i) {
+    struct _ze_slot *s = &c->slots[i];
+    if (s->inj) {
+      _put_ze_event(s->inj);
+      s->inj = NULL;
+    }
+    if (s->shadow_done) {
+      _put_ze_event(s->shadow_done);
+      s->shadow_done = NULL;
+    }
+    free(s->waits);
+    s->waits = NULL;
+    s->n_waits = 0;
+    free(s->preds);
+    s->preds = NULL;
+    s->n_preds = 0;
+    _event_latest_signaled_clear_if(s->attr, s);
+    s->attr = NULL;
+    if (s->refs)
+      pinned++;
+  }
+  if (pinned == 0) {
+    _cl_chunk_free(cl_data, c, /*free_slab=*/1);
+    return;
+  }
+  /* Detach: keep the struct alive for the surviving referenced slots. */
+  DL_DELETE(cl_data->chunks, c);
+  if (c->slab) {
+    ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
+    c->slab = NULL;
+  }
+  for (uint32_t i = 0; i < c->n_used; ++i)
+    c->slots[i].owner = NULL;
+  c->n_pinned = pinned;
+}
+
+/* Reclaim all of a cl's slot state, keeping cl_data registered and empty for
+ * reuse. Used by the zeCommandListReset hook and by _imm_reset_if_drained. */
+static void _cl_data_reset(struct _ze_command_list_obj_data *cl_data) {
+  struct _ze_slab_chunk *c, *tmp;
+  DL_FOREACH_SAFE(cl_data->chunks, c, tmp)
+    _cl_chunk_reclaim(cl_data, c);
+  cl_data->in_flight_q = NULL;
+  cl_data->in_flight_fence = NULL;
+}
+
 /* Release everything cl_data owns and free cl_data itself. Caller has
  * already removed cl_data from _ze_cls (single-cl: _cl_find_and_del;
  * per-ctx sweep: HASH_DEL inside the iter). When ctx is dying we just
  * recycle wrapper structs (the L0 event/pool will be destroyed in
  * _on_destroy_context step 3) and skip zeMemFree on the slab (the
- * driver reclaims, and zeMemFree on a doomed ctx is racy). */
+ * driver reclaims, and zeMemFree on a doomed ctx is racy); no slot can
+ * outlive the ctx, so no detach is needed. When the ctx is alive a slot
+ * may still be referenced cross-cl, so we reclaim per-chunk (detaching
+ * referenced chunks) just like reset. */
 static void _cl_data_destroy(struct _ze_command_list_obj_data *cl_data, int ctx_dying) {
   struct _ze_slab_chunk *c, *tmp;
+  if (!ctx_dying) {
+    DL_FOREACH_SAFE(cl_data->chunks, c, tmp)
+      _cl_chunk_reclaim(cl_data, c);
+    free(cl_data);
+    return;
+  }
   DL_FOREACH_SAFE(cl_data->chunks, c, tmp) {
     for (uint32_t i = 0; i < c->n_used; ++i) {
       struct _ze_slot *s = &c->slots[i];
       if (s->inj)
-        ctx_dying ? _put_ze_event_wrapper(s->inj) : _put_ze_event(s->inj);
+        _put_ze_event_wrapper(s->inj);
       if (s->shadow_done)
-        ctx_dying ? _put_ze_event_wrapper(s->shadow_done) : _put_ze_event(s->shadow_done);
+        _put_ze_event_wrapper(s->shadow_done);
       free(s->waits);
       free(s->preds);
       _event_latest_signaled_clear_if(s->attr, s);
     }
-    _cl_chunk_free(cl_data, c, /*free_slab=*/!ctx_dying);
+    _cl_chunk_free(cl_data, c, /*free_slab=*/0);
   }
   free(cl_data);
 }
@@ -1031,6 +1141,24 @@ static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
   pthread_mutex_unlock(&_ze_state_mutex);
 }
 
+/* zeCommandListReset epilogue. The L0 spec requires the user to have
+ * synchronized before Reset, so our slots are drained — but for a REGULAR cl
+ * "drained" is not "reclaimed": _slot_release is a no-op for regular cls
+ * (their inj is baked into the cl body, kept for reuse across Executes), so
+ * the slots linger. Reset wipes that body, so we must reclaim now; otherwise
+ * the stale slots are re-published on the next Execute (massive over-count)
+ * and their chunks accumulate (leak). We drain defensively first in case the
+ * user under-synced, then reclaim. The cl stays registered, empty for reuse. */
+static void _on_reset_command_list(ze_command_list_handle_t command_list) {
+  pthread_mutex_lock(&_ze_state_mutex);
+  struct _ze_command_list_obj_data *cl_data = _cl_find(command_list);
+  if (cl_data) {
+    _cl_drain(cl_data);
+    _cl_data_reset(cl_data);
+  }
+  pthread_mutex_unlock(&_ze_state_mutex);
+}
+
 /* zeContextDestroy prologue. Three sweeps to drop our own L0 objects
  * that live inside this ctx; the user's own cls/events are their
  * responsibility per the L0 contract. */
@@ -1125,6 +1253,7 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
   if (!any_live) {
     s->owner->in_flight_q = NULL;
     s->owner->in_flight_fence = NULL;
+    _imm_reset_if_drained(s->owner);
   }
   pthread_mutex_unlock(&_ze_state_mutex);
 }
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index b168d5dc..45188d98 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -157,11 +157,19 @@ def upper_snake_case(str)
   }
 EOF
 
-# Reset hook intentionally omitted: the L0 spec
+# Reset hook: the L0 spec
 # (https://oneapi-src.github.io/level-zero-spec/level-zero/latest/core/api.html#zecommandlistreset)
-# says the user must have synchronized first, so all our slots are
-# already drained.
-#
+# says the user must have synchronized first, so our slots are drained — but
+# for a REGULAR cl "drained" is not "reclaimed" (_slot_release is a no-op for
+# regular cls; their inj is baked into the cl body for reuse across Executes).
+# Reset wipes that body, so we reclaim the slots/chunks/events now. Without it
+# the stale slots are re-published on the next Execute (over-count) and chunks
+# leak. The cl stays registered, empty for reuse.
+register_epilogue 'zeCommandListReset', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && hCommandList)
+    _on_reset_command_list(hCommandList);
+EOF
+
 # Destroy hook: the same spec rule applies for the GPU side (no in-flight
 # work on the cl), but we still need to clean up OUR host-side state —
 # slot/slab chunks, per-slot waits, and tracer-owned events that haven't

From 221c02be372ed285775062dbf28ec09f0ddd48ba Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0002.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 17 Jun 2026 21:31:43 +0000
Subject: [PATCH 51/54] ze: serve kernel timestamps back to the user's own
 zeEventQueryKernelTimestamp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Append prologue swaps the user's signal event for our injected event, so
the user's event ends up carrying the QKT/barrier op timing instead of the
kernel's. A tracer-unaware program that signals a KERNEL_TIMESTAMP event from
a kernel and then calls zeEventQueryKernelTimestamp on it reads the wrong
(op-scale) duration — measured ratio 0.77x vs the true 2x.

Stash the kernel result (already read from the slab at drain) keyed by the
user's event, and add a zeEventQueryKernelTimestamp epilogue that overwrites
*dstptr with it. Re-signaling overwrites the entry with the latest result.

Verified on PVC: user_kts_query_ratio 0.77 -> 2.00; correctness suite 58/58
(incl. tests where the user appends their own AppendQueryKernelTimestamps).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 backends/ze/tracer_ze_helpers.include.c | 70 +++++++++++++++++++++++--
 backends/ze/ze_model.rb                 | 10 ++++
 2 files changed, 75 insertions(+), 5 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index ed36284d..4c0e3417 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -533,6 +533,45 @@ static inline void _event_latest_signaled_clear_if(ze_event_handle_t ev, struct
   }
 }
 
+/* event -> last kernel-timestamp result we drained for it. The Append
+ * prologue swaps the user's signal event for our inj, so the user's event
+ * ends up carrying the QKT/barrier op timing, not the kernel's — a user who
+ * calls zeEventQueryKernelTimestamp on their own event would read garbage.
+ * At drain we already read the real kernel result from the slab; stash it
+ * here keyed by the user's event so the query hook can serve it back.
+ * Re-signaling an event just overwrites the entry with the latest result. */
+struct _ze_event_kts_entry {
+  ze_event_handle_t ev; /* key */
+  ze_kernel_timestamp_result_t result;
+  UT_hash_handle hh;
+};
+static struct _ze_event_kts_entry *_ze_event_kts = NULL;
+
+static inline void _event_kts_set(ze_event_handle_t ev, ze_kernel_timestamp_result_t r) {
+  if (!ev)
+    return;
+  struct _ze_event_kts_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_kts, &ev, e);
+  if (!e) {
+    e = (struct _ze_event_kts_entry *)calloc(1, sizeof(*e));
+    if (!e)
+      return;
+    e->ev = ev;
+    HASH_ADD_PTR(_ze_event_kts, ev, e);
+  }
+  e->result = r;
+}
+
+/* Copy the stashed kernel result for ev into *out; 1 if found, 0 otherwise. */
+static inline int _event_kts_get(ze_event_handle_t ev, ze_kernel_timestamp_result_t *out) {
+  struct _ze_event_kts_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_kts, &ev, e);
+  if (!e)
+    return 0;
+  *out = e->result;
+  return 1;
+}
+
 /* Pop one recycled event wrapper from the per-context freelist; NULL
  * if none cached (caller falls back to creating a fresh L0 event). */
 static struct _ze_event_h *_get_ze_event(ze_context_handle_t context) {
@@ -967,13 +1006,19 @@ static void _slot_drain(struct _ze_slot *s) {
     }
   }
   ze_event_handle_t attr = s->attr ? s->attr : (s->inj ? s->inj->event : NULL);
-  if (s->chunk && s->chunk->slab && attr &&
-      tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results)) {
+  if (s->chunk && s->chunk->slab && attr) {
     ze_kernel_timestamp_result_t r =
         *(ze_kernel_timestamp_result_t *)((char *)s->chunk->slab + s->off);
-    do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, attr, ZE_RESULT_SUCCESS,
-                  ZE_RESULT_SUCCESS, r.global.kernelStart, r.global.kernelEnd,
-                  r.context.kernelStart, r.context.kernelEnd);
+    /* Stash the kernel result under the user's own event so the user's
+     * zeEventQueryKernelTimestamp returns kernel timing, not the QKT/barrier
+     * op timing their event actually carries (we swapped it for inj). Only
+     * when the user supplied an event (s->attr); inj is ours, not queryable. */
+    if (s->attr)
+      _event_kts_set(s->attr, r);
+    if (tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results))
+      do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, attr, ZE_RESULT_SUCCESS,
+                    ZE_RESULT_SUCCESS, r.global.kernelStart, r.global.kernelEnd,
+                    r.context.kernelStart, r.context.kernelEnd);
   }
   _event_latest_signaled_clear_if(s->attr, s);
   /* Drop refs on preds; release any that hit 0 and are already drained. */
@@ -1258,6 +1303,21 @@ static void _on_sync_drain_event(ze_event_handle_t ev) {
   pthread_mutex_unlock(&_ze_state_mutex);
 }
 
+/* zeEventQueryKernelTimestamp epilogue. If we drained a kernel result for
+ * this user event, overwrite *dstptr with it: the user's event carries the
+ * QKT/barrier op timing (we swapped their signal for inj at Append), but the
+ * caller wants the KERNEL timing, which we stashed at drain. Returns 1 if it
+ * served a stashed result. */
+static int _on_query_kernel_timestamp(ze_event_handle_t hEvent,
+                                      ze_kernel_timestamp_result_t *dstptr) {
+  if (!hEvent || !dstptr)
+    return 0;
+  pthread_mutex_lock(&_ze_state_mutex);
+  int found = _event_kts_get(hEvent, dstptr);
+  pthread_mutex_unlock(&_ze_state_mutex);
+  return found;
+}
+
 /* Execute-epilogue handler for ONE cl. Runs AFTER L0 Execute returned,
  * with the user cl in flight. Three phases:
  *
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index 45188d98..c5871202 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -221,6 +221,16 @@ def upper_snake_case(str)
     _on_sync_drain_cl(hCommandList);
 EOF
 
+# The Append prologue swaps the user's signal event for our injected event, so
+# the user's own event ends up carrying the QKT/barrier op timing, not the
+# kernel's. If the user queries their event's kernel timestamp themselves,
+# serve back the kernel result we stashed at drain so they see kernel timing.
+register_epilogue 'zeEventQueryKernelTimestamp', <<EOF
+  if (_do_profile && hEvent && dstptr &&
+      _on_query_kernel_timestamp(hEvent, dstptr))
+    _retval = ZE_RESULT_SUCCESS;
+EOF
+
 # Fence sync: the fence the user passed to Execute is stamped onto each cl
 # (in_flight_fence), so a fence wait drains exactly the cls that Execute
 # submitted. zeFenceQueryStatus is NOT hooked: it's a non-blocking poll, so

From 0614c2c95b287c70a87937a9a32efd8c2a0cc992 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0001.head.cm.sunspot.alcf.anl.gov>
Date: Wed, 17 Jun 2026 23:04:07 +0000
Subject: [PATCH 52/54] =?UTF-8?q?ze:=20consolidate=20tracer=20engine=20?=
 =?UTF-8?q?=E2=80=94=20unified=20sync,=20shared=20slot=20helpers?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Behavior-preserving cleanup of the ZE profiling engine. No change to the
QKT placement paths, the single state mutex, the cross-cl pred-ref/detach
UAF fix, or the imm/shadow reset logic; verified 63/63 on the correctness
and bench suite.

- Collapse the four sync entry points (queue/fence/event/cl) into one
  _on_sync(enum _ze_sync_kind, void *h); ze_model.rb hooks pass the kind.
  The QUEUE/FENCE in-flight match and the EVENT/CL direct lookups are now
  one switch instead of four near-parallel lock/find/drain/unlock funcs.
- Add _ZE_FOREACH_SLOT to replace the open-coded "walk every used slot"
  loop (8 sites), and _cl_any_live to fold the two identical in-flight
  scans into one predicate.
- Replace the three divergent per-slot teardown copies with one
  _slot_dispose_resources(s, mode) primitive (POOL vs WRAPPER disposal).
- Stamp the two event-keyed maps (latest_signaled, kts) from one
  _ZE_EVENT_MAP_DEFINE[_NOCLEAR] macro instead of hand-written accessors.
- Linearize _universal_record_append: extract _append_inline_query,
  _chain_user_signal, _slot_append_rollback; preserve the is_immediate
  gate around _slot_publish and the out-of-lock failure-path barrier.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 backends/ze/tracer_ze_helpers.include.c | 472 +++++++++++++-----------
 backends/ze/ze_model.rb                 |   8 +-
 2 files changed, 257 insertions(+), 223 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 4c0e3417..555458f2 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -216,6 +216,14 @@ struct _ze_slab_chunk {
   struct _ze_slot slots[_ZE_SLAB_CHUNK_SLOTS];
 };
 
+/* Iterate every used slot in a cl, oldest-to-newest (chunk DL order, then
+ * slot order within a chunk) — the natural time order. Binds `s` to each
+ * `struct _ze_slot *`. Only for read/dispose passes that do NOT free chunks
+ * mid-walk; the drain path bumps n_held by hand and uses DL_FOREACH_SAFE. */
+#define _ZE_FOREACH_SLOT(cl_data, s)                                                                \
+  for (struct _ze_slab_chunk *_c = (cl_data)->chunks; _c; _c = _c->next)                            \
+    for (struct _ze_slot *s = _c->slots, *_se = _c->slots + _c->n_used; s < _se; ++s)
+
 struct _ze_command_list_obj_data {
   void *ptr;
   UT_hash_handle hh;
@@ -487,90 +495,103 @@ struct _ze_event_pool_entry {
 
 struct _ze_event_pool_entry *_ze_event_pools = NULL;
 
-/* event_latest_signaled[ev] -> the most recent slot whose attr==ev.
- * Used to resolve happens-before edges: when a new Append says "wait on
- * ev", we record the latest slot for ev as a pred. Updated at
- * instantiate and cleared at drain. */
-struct _ze_event_latest_signaled_entry {
-  ze_event_handle_t ev; /* key */
-  struct _ze_slot *slot;
-  UT_hash_handle hh;
-};
-static struct _ze_event_latest_signaled_entry *_ze_event_latest_signaled = NULL;
-
-static inline struct _ze_slot *_event_latest_signaled_get(ze_event_handle_t ev) {
-  struct _ze_event_latest_signaled_entry *e = NULL;
-  HASH_FIND_PTR(_ze_event_latest_signaled, &ev, e);
-  return e ? e->slot : NULL;
-}
-
-static inline void _event_latest_signaled_set(ze_event_handle_t ev, struct _ze_slot *s) {
-  if (!ev)
-    return;
-  struct _ze_event_latest_signaled_entry *e = NULL;
-  HASH_FIND_PTR(_ze_event_latest_signaled, &ev, e);
-  if (!e) {
-    e = (struct _ze_event_latest_signaled_entry *)calloc(1, sizeof(*e));
-    if (!e)
-      return;
-    e->ev = ev;
-    HASH_ADD_PTR(_ze_event_latest_signaled, ev, e);
-  }
-  e->slot = s;
-}
-
-/* Remove event_latest_signaled[ev] only if it still points at slot s
- * (the slot is being drained — but if a newer Append already overwrote
- * the entry, don't clobber that). */
-static inline void _event_latest_signaled_clear_if(ze_event_handle_t ev, struct _ze_slot *s) {
-  if (!ev)
-    return;
-  struct _ze_event_latest_signaled_entry *e = NULL;
-  HASH_FIND_PTR(_ze_event_latest_signaled, &ev, e);
-  if (e && e->slot == s) {
-    HASH_DEL(_ze_event_latest_signaled, e);
-    free(e);
+/* Two tracer-state maps share the same "ze_event_handle_t key -> value,
+ * uthash ptr-keyed, calloc-on-miss" shape. Stamp the boilerplate once per
+ * (NAME, VALTYPE) instead of hand-writing each set of accessors.
+ *
+ * _ZE_EVENT_MAP_DEFINE emits, for a POINTER value type:
+ *   static <map global>;
+ *   VALTYPE _event_##NAME##_get(ev)          -> value or NULL
+ *   void    _event_##NAME##_set(ev, val)     -> no-op on ev==NULL
+ *   void    _event_##NAME##_clear_if(ev,val) -> delete iff stored value==val
+ *
+ * _ZE_EVENT_MAP_DEFINE_NOCLEAR emits the struct/global/set plus an
+ * out-param get (int _event_##NAME##_get(ev, VALTYPE *out) -> found?),
+ * for a struct value where a NULL sentinel and a _clear_if make no sense.
+ *
+ * Macro-stamped (not a void* container) so the value stays inline in the
+ * uthash entry — no per-set heap box, and the comparator stays a plain ==. */
+#define _ZE_EVENT_MAP_DEFINE(NAME, VALTYPE)                                                        \
+  struct _ze_event_##NAME##_entry {                                                                \
+    ze_event_handle_t ev; /* key */                                                                \
+    VALTYPE v;                                                                                     \
+    UT_hash_handle hh;                                                                             \
+  };                                                                                               \
+  static struct _ze_event_##NAME##_entry *_ze_event_##NAME = NULL;                                 \
+  static inline VALTYPE _event_##NAME##_get(ze_event_handle_t ev) {                                \
+    struct _ze_event_##NAME##_entry *e = NULL;                                                     \
+    HASH_FIND_PTR(_ze_event_##NAME, &ev, e);                                                       \
+    return e ? e->v : NULL;                                                                        \
+  }                                                                                                \
+  static inline void _event_##NAME##_set(ze_event_handle_t ev, VALTYPE val) {                      \
+    if (!ev)                                                                                       \
+      return;                                                                                      \
+    struct _ze_event_##NAME##_entry *e = NULL;                                                     \
+    HASH_FIND_PTR(_ze_event_##NAME, &ev, e);                                                       \
+    if (!e) {                                                                                      \
+      e = (struct _ze_event_##NAME##_entry *)calloc(1, sizeof(*e));                                \
+      if (!e)                                                                                      \
+        return;                                                                                    \
+      e->ev = ev;                                                                                  \
+      HASH_ADD_PTR(_ze_event_##NAME, ev, e);                                                       \
+    }                                                                                              \
+    e->v = val;                                                                                    \
+  }                                                                                                \
+  static inline void _event_##NAME##_clear_if(ze_event_handle_t ev, VALTYPE val) {                 \
+    if (!ev)                                                                                       \
+      return;                                                                                      \
+    struct _ze_event_##NAME##_entry *e = NULL;                                                     \
+    HASH_FIND_PTR(_ze_event_##NAME, &ev, e);                                                       \
+    if (e && e->v == val) {                                                                        \
+      HASH_DEL(_ze_event_##NAME, e);                                                               \
+      free(e);                                                                                     \
+    }                                                                                              \
   }
-}
-
-/* event -> last kernel-timestamp result we drained for it. The Append
- * prologue swaps the user's signal event for our inj, so the user's event
- * ends up carrying the QKT/barrier op timing, not the kernel's — a user who
- * calls zeEventQueryKernelTimestamp on their own event would read garbage.
- * At drain we already read the real kernel result from the slab; stash it
- * here keyed by the user's event so the query hook can serve it back.
- * Re-signaling an event just overwrites the entry with the latest result. */
-struct _ze_event_kts_entry {
-  ze_event_handle_t ev; /* key */
-  ze_kernel_timestamp_result_t result;
-  UT_hash_handle hh;
-};
-static struct _ze_event_kts_entry *_ze_event_kts = NULL;
 
-static inline void _event_kts_set(ze_event_handle_t ev, ze_kernel_timestamp_result_t r) {
-  if (!ev)
-    return;
-  struct _ze_event_kts_entry *e = NULL;
-  HASH_FIND_PTR(_ze_event_kts, &ev, e);
-  if (!e) {
-    e = (struct _ze_event_kts_entry *)calloc(1, sizeof(*e));
-    if (!e)
-      return;
-    e->ev = ev;
-    HASH_ADD_PTR(_ze_event_kts, ev, e);
+#define _ZE_EVENT_MAP_DEFINE_NOCLEAR(NAME, VALTYPE)                                                \
+  struct _ze_event_##NAME##_entry {                                                                \
+    ze_event_handle_t ev; /* key */                                                                \
+    VALTYPE v;                                                                                     \
+    UT_hash_handle hh;                                                                             \
+  };                                                                                               \
+  static struct _ze_event_##NAME##_entry *_ze_event_##NAME = NULL;                                 \
+  static inline void _event_##NAME##_set(ze_event_handle_t ev, VALTYPE val) {                      \
+    if (!ev)                                                                                       \
+      return;                                                                                      \
+    struct _ze_event_##NAME##_entry *e = NULL;                                                     \
+    HASH_FIND_PTR(_ze_event_##NAME, &ev, e);                                                       \
+    if (!e) {                                                                                      \
+      e = (struct _ze_event_##NAME##_entry *)calloc(1, sizeof(*e));                                \
+      if (!e)                                                                                      \
+        return;                                                                                    \
+      e->ev = ev;                                                                                  \
+      HASH_ADD_PTR(_ze_event_##NAME, ev, e);                                                       \
+    }                                                                                              \
+    e->v = val;                                                                                    \
+  }                                                                                                \
+  static inline int _event_##NAME##_get(ze_event_handle_t ev, VALTYPE *out) {                      \
+    struct _ze_event_##NAME##_entry *e = NULL;                                                     \
+    HASH_FIND_PTR(_ze_event_##NAME, &ev, e);                                                       \
+    if (!e)                                                                                        \
+      return 0;                                                                                    \
+    *out = e->v;                                                                                   \
+    return 1;                                                                                      \
   }
-  e->result = r;
-}
 
-/* Copy the stashed kernel result for ev into *out; 1 if found, 0 otherwise. */
-static inline int _event_kts_get(ze_event_handle_t ev, ze_kernel_timestamp_result_t *out) {
-  struct _ze_event_kts_entry *e = NULL;
-  HASH_FIND_PTR(_ze_event_kts, &ev, e);
-  if (!e)
-    return 0;
-  *out = e->result;
-  return 1;
-}
+/* event_latest_signaled[ev] -> the most recent slot whose attr==ev. Used to
+ * resolve happens-before edges: when a new Append says "wait on ev", we record
+ * the latest slot for ev as a pred. _set at instantiate; _clear_if at drain
+ * removes the entry only if it still points at the draining slot (a newer
+ * Append may have overwritten it — don't clobber that). */
+_ZE_EVENT_MAP_DEFINE(latest_signaled, struct _ze_slot *)
+
+/* event_kts[ev] -> last kernel-timestamp result we drained for it. The Append
+ * prologue swaps the user's signal event for our inj, so the user's event ends
+ * up carrying the QKT/barrier op timing, not the kernel's — a user who calls
+ * zeEventQueryKernelTimestamp on their own event would read garbage. At drain
+ * we read the real kernel result from the slab and stash it here keyed by the
+ * user's event so the query hook can serve it back; re-signaling overwrites. */
+_ZE_EVENT_MAP_DEFINE_NOCLEAR(kts, ze_kernel_timestamp_result_t)
 
 /* Pop one recycled event wrapper from the per-context freelist; NULL
  * if none cached (caller falls back to creating a fresh L0 event). */
@@ -821,6 +842,46 @@ static void _slot_publish(struct _ze_command_list_obj_data *cl_data,
   _slot_instantiate(cl_data, s);
 }
 
+/* INLINE path: bake the QKT into the user cl body (wait=inj, sig=user_signal).
+ * Fires when Appended for immediate cls and on every Execute for regular cls
+ * (it is now part of the cl body). The QKT signaling user_signal IS the
+ * user_signal chain — no separate barrier needed. */
+static void _append_inline_query(ze_command_list_handle_t command_list, struct _ze_slot *s,
+                                 ze_event_handle_t inj_event, ze_event_handle_t user_signal) {
+  _ZE_MUST(ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(
+      command_list, 1, &inj_event, s->chunk->slab, &s->off, user_signal, 1, &inj_event));
+}
+
+/* Chain the user's signal event off our inj on the user cl: the prologue
+ * swapped user_signal for inj, so without this the user's Sync(user_signal)
+ * would hang forever. No-op (returns 0) when the user passed no signal;
+ * returns 1 when the barrier was appended. Mutex-agnostic — it issues an
+ * L0 Append on the user cl and touches no tracer state, so it is correct
+ * both inside the critical section (shadow path) and outside it (the
+ * failure-path compensation). Aborts on L0 failure (a silent hang is worse). */
+static int _chain_user_signal(ze_command_list_handle_t command_list, ze_event_handle_t inj_event,
+                              ze_event_handle_t user_signal) {
+  if (!user_signal)
+    return 0;
+  _ZE_MUST(ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &inj_event));
+  return 1;
+}
+
+/* Roll back the slot just handed out by _cl_slot_append. We were the last to
+ * touch the tail chunk and hold _ze_state_mutex, so decrementing n_used/n_held
+ * and zeroing the slot is safe; if the chunk was freshly allocated only for
+ * this Append (n_used now 0), free it back so a slot-append failure doesn't
+ * leak a chunk. */
+static void _slot_append_rollback(struct _ze_command_list_obj_data *cl_data, struct _ze_slot *s) {
+  free(s->waits);
+  struct _ze_slab_chunk *c = s->chunk;
+  c->n_used--;
+  c->n_held--;
+  memset(s, 0, sizeof(*s));
+  if (c->n_used == 0)
+    _cl_chunk_free(cl_data, c, /*free_slab=*/1);
+}
+
 /* Append-time hook from profiling_epilogue. The prologue swapped user's
  * hSignalEvent for inj->event; user_signal is the original (possibly NULL),
  * user_waits is the user's wait list, ctx is the cl's context (fetched
@@ -866,13 +927,7 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
     goto fail_locked;
 
   if (inline_path) {
-    /* Bake the QKT into the user cl. wait=inj, sig=user_signal.
-     * Holds for both immediate (fires when Appended) and regular cls
-     * (fires on every Execute — the QKT is now part of the cl body). */
-    ze_event_handle_t wait_ev = inj->event;
-    _ZE_MUST(ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(command_list, 1, &wait_ev,
-                                                                s->chunk->slab, &s->off,
-                                                                user_signal, 1, &wait_ev));
+    _append_inline_query(command_list, s, inj->event, user_signal);
     barrier_chained = 1; /* user_signal chained via the QKT itself */
     _slot_instantiate(cl_data, s);
     pthread_mutex_unlock(&_ze_state_mutex);
@@ -882,11 +937,7 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   /* Shadow path: chain user_signal off inj on the user cl, then place
    * the Query on the shadow cl (immediate cls only — regular cls defer
    * to the Execute epilogue, see _on_execute_one_cl). */
-  if (user_signal) {
-    ze_event_handle_t wait_ev = inj->event;
-    _ZE_MUST(ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &wait_ev));
-    barrier_chained = 1;
-  }
+  barrier_chained = _chain_user_signal(command_list, inj->event, user_signal);
   if (cl_data->is_immediate) {
     ze_device_handle_t dev = NULL;
     _ZE_MUST(ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(command_list, &dev));
@@ -899,34 +950,52 @@ static void _universal_record_append(ze_command_list_handle_t command_list,
   return;
 
 fail_locked:
-  if (s) {
-    /* Roll back the slot we just appended. We were the very last to
-     * touch the tail chunk and we hold _ze_state_mutex, so decrementing
-     * n_used/n_held and clearing the slot is safe. If the chunk
-     * was freshly allocated only for this Append (n_used now 0), free
-     * it back so we don't leak a chunk per slot-append failure. */
-    free(s->waits);
-    struct _ze_slab_chunk *c = s->chunk;
-    c->n_used--;
-    c->n_held--;
-    memset(s, 0, sizeof(*s));
-    if (c->n_used == 0)
-      _cl_chunk_free(cl_data, c, /*free_slab=*/1);
-  }
+  if (s)
+    _slot_append_rollback(cl_data, s);
   if (shadow_done)
     _put_ze_event(shadow_done);
   _put_ze_event(inj);
   pthread_mutex_unlock(&_ze_state_mutex);
-  /* If we never chained user_signal off inj, do it now. The prologue
-   * swapped user's sig for inj->event; without this Append the user's
-   * Sync(user_signal) would hang forever. Aborts on failure — we have
-   * no second-chance recovery and a silent hang is worse than a crash.
-   * Outside the state mutex: barrier on the user's cl is L0-side and
-   * doesn't touch tracer state. */
-  if (user_signal && !barrier_chained) {
-    ze_event_handle_t wait_ev = inj->event;
-    _ZE_MUST(ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &wait_ev));
+  /* Compensate outside the state mutex: if we bailed before chaining
+   * user_signal off inj, do it now or the user's Sync(user_signal) hangs. */
+  if (!barrier_chained)
+    _chain_user_signal(command_list, inj->event, user_signal);
+}
+
+/* Dispose the per-slot resources shared by every teardown path: the inj and
+ * shadow_done events, the waits[] copy, the preds[] array, and the slot's
+ * entry in event_latest_signaled. The event-disposal target differs by caller:
+ *   _ZE_DISPOSE_POOL    -> _put_ze_event (ctx alive: events recycle to the pool)
+ *   _ZE_DISPOSE_WRAPPER -> _put_ze_event_wrapper (ctx dying: only recycle the
+ *                          wrapper struct; the L0 event/pool die with the ctx)
+ * Deliberately does NOT touch chunk accounting (n_held / n_pinned), refs,
+ * owner, or live — those are caller-specific and stay at the call site.
+ * Every field is nulled so the call is idempotent (safe to re-run on a slot
+ * whose preds/latest-signaled were already cleared during drain). */
+enum _ze_slot_dispose_mode { _ZE_DISPOSE_POOL, _ZE_DISPOSE_WRAPPER };
+static void _slot_dispose_resources(struct _ze_slot *s, enum _ze_slot_dispose_mode mode) {
+  if (s->inj) {
+    if (mode == _ZE_DISPOSE_WRAPPER)
+      _put_ze_event_wrapper(s->inj);
+    else
+      _put_ze_event(s->inj);
+    s->inj = NULL;
   }
+  if (s->shadow_done) {
+    if (mode == _ZE_DISPOSE_WRAPPER)
+      _put_ze_event_wrapper(s->shadow_done);
+    else
+      _put_ze_event(s->shadow_done);
+    s->shadow_done = NULL;
+  }
+  free(s->waits);
+  s->waits = NULL;
+  s->n_waits = 0;
+  free(s->preds);
+  s->preds = NULL;
+  s->n_preds = 0;
+  _event_latest_signaled_clear_if(s->attr, s);
+  s->attr = NULL;
 }
 
 /* Reclaim a slot: PUT events back to the per-context pool, free waits,
@@ -949,18 +1018,10 @@ static void _slot_release(struct _ze_slot *s) {
   }
   if (!s->owner || !s->owner->is_immediate)
     return;
-  if (s->inj) {
-    _put_ze_event(s->inj);
-    s->inj = NULL;
-  }
-  if (s->shadow_done) {
-    _put_ze_event(s->shadow_done);
-    s->shadow_done = NULL;
-  }
-  free(s->waits);
-  s->waits = NULL;
-  s->n_waits = 0;
-  s->attr = NULL;
+  /* Reached only from _slot_drain, which already freed s->preds and cleared
+   * event_latest_signaled[s->attr]; the primitive re-running those is a no-op
+   * (free(NULL); _clear_if on a missing/overwritten key does nothing). */
+  _slot_dispose_resources(s, _ZE_DISPOSE_POOL);
 
   struct _ze_slab_chunk *c = s->chunk;
   struct _ze_command_list_obj_data *cl = s->owner;
@@ -1054,6 +1115,14 @@ static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
 
 static void _cl_data_reset(struct _ze_command_list_obj_data *cl_data); /* fwd */
 
+/* 1 if any slot in the cl is still in flight (instantiated, not yet drained). */
+static int _cl_any_live(struct _ze_command_list_obj_data *cl_data) {
+  _ZE_FOREACH_SLOT(cl_data, s)
+    if (s->live)
+      return 1;
+  return 0;
+}
+
 /* Immediate cls only: once every slot in the cl is drained, raw-Reset the
  * user's cl so the L0 driver reclaims its per-QKT storage (it accumulates
  * otherwise on a long-lived reused immediate cl — see bench/mem_persistent_cl),
@@ -1062,28 +1131,12 @@ static void _cl_data_reset(struct _ze_command_list_obj_data *cl_data); /* fwd */
  * Raw *_PTR = untraced; safe only when no slot is still live (no in-flight
  * work). Called at the tail of every sync-drain path that can touch an imm cl. */
 static void _imm_reset_if_drained(struct _ze_command_list_obj_data *cl_data) {
-  if (!cl_data || !cl_data->is_immediate)
+  if (!cl_data || !cl_data->is_immediate || _cl_any_live(cl_data))
     return;
-  struct _ze_slab_chunk *c;
-  DL_FOREACH(cl_data->chunks, c)
-    for (uint32_t i = 0; i < c->n_used; ++i)
-      if (c->slots[i].live)
-        return; /* still in-flight work — unsafe to reset */
   ZE_COMMAND_LIST_RESET_PTR((ze_command_list_handle_t)cl_data->ptr);
   _cl_data_reset(cl_data);
 }
 
-/* Drain a single cl. */
-static void _on_sync_drain_cl(ze_command_list_handle_t command_list) {
-  pthread_mutex_lock(&_ze_state_mutex);
-  struct _ze_command_list_obj_data *cl_data = _cl_find(command_list);
-  if (cl_data) {
-    _cl_drain(cl_data);
-    _imm_reset_if_drained(cl_data);
-  }
-  pthread_mutex_unlock(&_ze_state_mutex);
-}
-
 /* Reclaim one chunk during cl teardown (reset or single-cl destroy, ctx
  * alive). Releases every slot's resources (events to pool, waits, preds,
  * clears latest-signaled), then either frees the chunk or — if any slot is
@@ -1097,22 +1150,7 @@ static void _cl_chunk_reclaim(struct _ze_command_list_obj_data *cl_data,
   uint32_t pinned = 0;
   for (uint32_t i = 0; i < c->n_used; ++i) {
     struct _ze_slot *s = &c->slots[i];
-    if (s->inj) {
-      _put_ze_event(s->inj);
-      s->inj = NULL;
-    }
-    if (s->shadow_done) {
-      _put_ze_event(s->shadow_done);
-      s->shadow_done = NULL;
-    }
-    free(s->waits);
-    s->waits = NULL;
-    s->n_waits = 0;
-    free(s->preds);
-    s->preds = NULL;
-    s->n_preds = 0;
-    _event_latest_signaled_clear_if(s->attr, s);
-    s->attr = NULL;
+    _slot_dispose_resources(s, _ZE_DISPOSE_POOL);
     if (s->refs)
       pinned++;
   }
@@ -1159,16 +1197,8 @@ static void _cl_data_destroy(struct _ze_command_list_obj_data *cl_data, int ctx_
     return;
   }
   DL_FOREACH_SAFE(cl_data->chunks, c, tmp) {
-    for (uint32_t i = 0; i < c->n_used; ++i) {
-      struct _ze_slot *s = &c->slots[i];
-      if (s->inj)
-        _put_ze_event_wrapper(s->inj);
-      if (s->shadow_done)
-        _put_ze_event_wrapper(s->shadow_done);
-      free(s->waits);
-      free(s->preds);
-      _event_latest_signaled_clear_if(s->attr, s);
-    }
+    for (uint32_t i = 0; i < c->n_used; ++i)
+      _slot_dispose_resources(&c->slots[i], _ZE_DISPOSE_WRAPPER);
     _cl_chunk_free(cl_data, c, /*free_slab=*/0);
   }
   free(cl_data);
@@ -1248,57 +1278,49 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
   pthread_mutex_unlock(&_ze_state_mutex);
 }
 
-/* Drain every cl whose in_flight_q matches. */
-static void _on_sync_drain_queue(ze_command_queue_handle_t hQueue) {
-  pthread_mutex_lock(&_ze_state_mutex);
-  struct _ze_command_list_obj_data *cl_data = NULL, *tmp = NULL;
-  HASH_ITER(hh, _ze_cls, cl_data, tmp) {
-    if (cl_data->in_flight_q == hQueue)
-      _cl_drain(cl_data);
-  }
-  pthread_mutex_unlock(&_ze_state_mutex);
-}
-
-/* Drain every cl whose in_flight_fence matches. A fence signals when all
- * cls submitted in its Execute have completed, so waiting on the fence is
- * a valid drain anchor for exactly those cls — the same role hQueue plays
- * for zeCommandQueueSynchronize. */
-static void _on_sync_drain_fence(ze_fence_handle_t hFence) {
-  pthread_mutex_lock(&_ze_state_mutex);
-  struct _ze_command_list_obj_data *cl_data = NULL, *tmp = NULL;
-  HASH_ITER(hh, _ze_cls, cl_data, tmp) {
-    if (cl_data->in_flight_fence == hFence)
-      _cl_drain(cl_data);
-  }
-  pthread_mutex_unlock(&_ze_state_mutex);
-}
-
-/* Drain the slot that most recently signaled `ev` (recursing on preds). */
-static void _on_sync_drain_event(ze_event_handle_t ev) {
+/* The four user sync APIs all reduce to "drain the slots the synced anchor
+ * covers". They differ only in how the anchor selects work:
+ *
+ *   _ZE_SYNC_CL     zeCommandListHostSynchronize  -> the one named cl
+ *   _ZE_SYNC_QUEUE  zeCommandQueueSynchronize     -> every cl with in_flight_q == h
+ *   _ZE_SYNC_FENCE  zeFenceHostSynchronize        -> every cl with in_flight_fence == h
+ *   _ZE_SYNC_EVENT  zeEventHostSynchronize        -> the slot that last signaled h,
+ *                                                    walking its pred edges
+ *
+ * QUEUE/FENCE share one rule: a queue/fence wait completes exactly the cls a
+ * given Execute submitted, identified by the handle stamped on the cl at
+ * Execute. CL/EVENT name their target directly. After draining, a fully-drained
+ * immediate cl is raw-Reset to cap the driver's per-QKT storage leak
+ * (_imm_reset_if_drained); for the cl/queue/fence anchors _cl_drain already
+ * cleared in_flight_*, while the event anchor may leave live siblings, so it
+ * clears in_flight_* only once the cl has no slot left in flight. */
+enum _ze_sync_kind { _ZE_SYNC_CL, _ZE_SYNC_QUEUE, _ZE_SYNC_FENCE, _ZE_SYNC_EVENT };
+static void _on_sync(enum _ze_sync_kind kind, void *h) {
   pthread_mutex_lock(&_ze_state_mutex);
-  struct _ze_slot *s = _event_latest_signaled_get(ev);
-  if (!s || !s->owner) {
-    pthread_mutex_unlock(&_ze_state_mutex);
-    return;
-  }
-  _slot_drain(s);
-  /* The drained slot may have left siblings live; only clear
-   * in_flight_q / in_flight_fence if nothing in this cl remains in flight. */
-  int any_live = 0;
-  struct _ze_slab_chunk *c;
-  DL_FOREACH(s->owner->chunks, c) {
-    for (uint32_t i = 0; i < c->n_used; ++i)
-      if (c->slots[i].live) {
-        any_live = 1;
-        break;
+  if (kind == _ZE_SYNC_EVENT) {
+    struct _ze_slot *s = _event_latest_signaled_get((ze_event_handle_t)h);
+    if (s && s->owner) {
+      _slot_drain(s);
+      if (!_cl_any_live(s->owner)) {
+        s->owner->in_flight_q = NULL;
+        s->owner->in_flight_fence = NULL;
+        _imm_reset_if_drained(s->owner);
       }
-    if (any_live)
-      break;
-  }
-  if (!any_live) {
-    s->owner->in_flight_q = NULL;
-    s->owner->in_flight_fence = NULL;
-    _imm_reset_if_drained(s->owner);
+    }
+  } else if (kind == _ZE_SYNC_CL) {
+    struct _ze_command_list_obj_data *cl_data = _cl_find((ze_command_list_handle_t)h);
+    if (cl_data) {
+      _cl_drain(cl_data);
+      _imm_reset_if_drained(cl_data);
+    }
+  } else { /* _ZE_SYNC_QUEUE / _ZE_SYNC_FENCE: match the stamped in-flight handle */
+    struct _ze_command_list_obj_data *cl_data = NULL, *tmp = NULL;
+    HASH_ITER(hh, _ze_cls, cl_data, tmp) {
+      void *anchor = (kind == _ZE_SYNC_QUEUE) ? (void *)cl_data->in_flight_q
+                                              : (void *)cl_data->in_flight_fence;
+      if (anchor == h)
+        _cl_drain(cl_data);
+    }
   }
   pthread_mutex_unlock(&_ze_state_mutex);
 }
@@ -1392,6 +1414,18 @@ static void _on_execute_command_lists_epilogue(ze_command_queue_handle_t hQueue,
     _on_execute_one_cl(hQueue, hFence, phCommandLists[i]);
 }
 
+/* ========================================================================
+ * Property/info dumping + tracer init
+ *
+ * Separate concern from the slot/drain engine above: read device/driver/
+ * kernel/memory properties and emit the lttng_ust_ze_properties / _build
+ * tracepoints, plus one-time loader/symbol init. Self-contained — the
+ * engine never calls into this section, and the only external callers are
+ * ze_model.rb hooks (_do_state, _dump_memory_info,
+ * _dump_command_list_device_timer, _in_loader_init) and gen_ze.rb
+ * (_init_tracer / _init_tracer_dump).
+ * ======================================================================== */
+
 static pthread_once_t _init = PTHREAD_ONCE_INIT;
 static __thread volatile int _in_init = 0;
 static volatile unsigned int _in_loader_init = 0;
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index c5871202..154004fc 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -208,17 +208,17 @@ def upper_snake_case(str)
 # everything reachable. Each sync API has a different anchor.
 register_epilogue 'zeCommandQueueSynchronize', <<EOF
   if (_do_profile && _retval == ZE_RESULT_SUCCESS)
-    _on_sync_drain_queue(hCommandQueue);
+    _on_sync(_ZE_SYNC_QUEUE, hCommandQueue);
 EOF
 
 register_epilogue 'zeEventHostSynchronize', <<EOF
   if (_do_profile && _retval == ZE_RESULT_SUCCESS && hEvent)
-    _on_sync_drain_event(hEvent);
+    _on_sync(_ZE_SYNC_EVENT, hEvent);
 EOF
 
 register_epilogue 'zeCommandListHostSynchronize', <<EOF
   if (_do_profile && _retval == ZE_RESULT_SUCCESS && hCommandList)
-    _on_sync_drain_cl(hCommandList);
+    _on_sync(_ZE_SYNC_CL, hCommandList);
 EOF
 
 # The Append prologue swaps the user's signal event for our injected event, so
@@ -239,7 +239,7 @@ def upper_snake_case(str)
 # blocking zeFenceHostSynchronize is the safe anchor.
 register_epilogue 'zeFenceHostSynchronize', <<EOF
   if (_do_profile && _retval == ZE_RESULT_SUCCESS && hFence)
-    _on_sync_drain_fence(hFence);
+    _on_sync(_ZE_SYNC_FENCE, hFence);
 EOF
 
 register_prologue 'zeEventPoolCreate', <<EOF

From 7a9665f7741159a2a6b069650c9e084030949fbb Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0001.head.cm.sunspot.alcf.anl.gov>
Date: Fri, 19 Jun 2026 18:07:18 +0000
Subject: [PATCH 53/54] ze: fix event-state stale reads on handle reuse + index
 queue/fence sync
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two correctness/perf fixes plus a refactor in the ZE tracer engine:

- Per-event state (latest signaling slot + stashed kernel timestamp) was
  keyed by the user's event handle address but never evicted on destroy.
  The L0 driver recycles freed event addresses, so a fresh event could be
  served the prior event's stale kernel timestamp (SUCCESS instead of
  NOT_READY) and a wait on the reused address could resolve to a freed slot
  (UAF in the pred walk). Evict on a successful zeEventDestroy.

- Queue/fence sync (_on_sync) scanned every live command list to find the
  ones stamped with the synced queue/fence — O(live cls) per sync (measured
  164x/176x cost growth at 4096 live cls; plain L0 is flat). Add per-queue
  and per-fence in-flight indexes so a sync drains only the matching cls.

- Merge the two event-keyed maps (latest_signaled + kts) into one
  _ze_event_state entry: same key, same lifecycle, one alloc/lookup/eviction,
  and they can no longer desync (the class of bug the first fix addressed).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 backends/ze/tracer_ze_helpers.include.c | 334 ++++++++++++++++--------
 backends/ze/ze_model.rb                 |  10 +
 2 files changed, 239 insertions(+), 105 deletions(-)

diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 555458f2..6969f590 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -17,11 +17,12 @@
  *   - shadow-path slots: re-Append Query on shadow cl
  *     inline-path slots: nothing (Query is baked into cl body)
  *   - instantiate every slot in cl
- *   - cl.in_flight_q = q
+ *   - cl.in_flight_q = q; index cl under q (and its fence) for sync lookup
  *
  * On Sync (the synced anchor tells us what to drain):
  *   - Sync(ev):  drain(event_latest_signaled[ev])
- *   - Sync(q):   drain_cl(cl) for every cl whose in_flight_q == q
+ *   - Sync(q):   drain_cl(cl) for every cl in the q-index bucket for q
+ *                (O(matching cls), not a scan of every live cl)
  *   - Sync(cl):  drain_cl(cl)
  *
  * drain(s):
@@ -255,6 +256,15 @@ struct _ze_command_list_obj_data {
    * cl's lifetime. Load-bearing for _on_destroy_context's sweep: lets it
    * associate cls back to their ctx without an L0 roundtrip per cl. */
   ze_context_handle_t cached_context;
+
+  /* Membership in the per-queue / per-fence in-flight indexes (see
+   * _ze_q_index / _ze_fence_index below). A cl in flight is linked into both
+   * its queue's bucket (q_prev/q_next) and, if Executed with a fence, its
+   * fence's bucket (f_prev/f_next), so a queue/fence sync drains exactly the
+   * matching cls without scanning every live cl. Linked at Execute
+   * (_cl_index_set), unlinked at drain/reset/destroy (_cl_index_clear). */
+  struct _ze_command_list_obj_data *q_prev, *q_next;
+  struct _ze_command_list_obj_data *f_prev, *f_next;
 };
 
 struct _ze_command_list_obj_data *_ze_cls = NULL;
@@ -282,6 +292,73 @@ static struct _ze_command_list_obj_data *_cl_find_and_del(ze_command_list_handle
   return cl;
 }
 
+/* In-flight indexes: queue handle -> the cls currently in flight on that queue,
+ * and fence handle -> the cls in flight under that fence. A queue/fence sync
+ * completes exactly the cls of the matching Execute, so these let _on_sync
+ * drain just those cls instead of scanning every live cl (which is O(live cls)
+ * per sync — see bench/sync_scaling). Buckets are created lazily at Execute and
+ * freed as soon as their last cl is unlinked (drain, reset, or destroy). */
+struct _ze_inflight_bucket {
+  void *key; /* ze_command_queue_handle_t or ze_fence_handle_t */
+  struct _ze_command_list_obj_data *cls; /* DL via q_prev/q_next or f_prev/f_next */
+  UT_hash_handle hh;
+};
+static struct _ze_inflight_bucket *_ze_q_index = NULL;
+static struct _ze_inflight_bucket *_ze_fence_index = NULL;
+
+static void _index_link(struct _ze_inflight_bucket **index, void *key,
+                        struct _ze_command_list_obj_data *cl, int is_fence) {
+  if (!key)
+    return;
+  struct _ze_inflight_bucket *b = NULL;
+  HASH_FIND_PTR(*index, &key, b);
+  if (!b) {
+    b = (struct _ze_inflight_bucket *)calloc(1, sizeof(*b));
+    if (!b)
+      return;
+    b->key = key;
+    HASH_ADD_PTR(*index, key, b);
+  }
+  if (is_fence)
+    DL_APPEND2(b->cls, cl, f_prev, f_next);
+  else
+    DL_APPEND2(b->cls, cl, q_prev, q_next);
+}
+
+static void _index_unlink(struct _ze_inflight_bucket **index, void *key,
+                          struct _ze_command_list_obj_data *cl, int is_fence) {
+  if (!key)
+    return;
+  struct _ze_inflight_bucket *b = NULL;
+  HASH_FIND_PTR(*index, &key, b);
+  if (!b)
+    return;
+  if (is_fence)
+    DL_DELETE2(b->cls, cl, f_prev, f_next);
+  else
+    DL_DELETE2(b->cls, cl, q_prev, q_next);
+  if (!b->cls) {
+    HASH_DEL(*index, b);
+    free(b);
+  }
+}
+
+/* Link cl into the queue (and, if non-NULL, fence) in-flight indexes. Called
+ * once per Execute, after in_flight_q/in_flight_fence are stamped. */
+static void _cl_index_set(struct _ze_command_list_obj_data *cl,
+                          ze_command_queue_handle_t q, ze_fence_handle_t f) {
+  _index_link(&_ze_q_index, q, cl, /*is_fence=*/0);
+  _index_link(&_ze_fence_index, f, cl, /*is_fence=*/1);
+}
+
+/* Remove cl from both in-flight indexes. Uses cl's own in_flight_q/_fence as
+ * the keys, so it MUST run before those are cleared. Idempotent: a cl not in
+ * flight has NULL keys and is a no-op. */
+static void _cl_index_clear(struct _ze_command_list_obj_data *cl) {
+  _index_unlink(&_ze_q_index, cl->in_flight_q, cl, /*is_fence=*/0);
+  _index_unlink(&_ze_fence_index, cl->in_flight_fence, cl, /*is_fence=*/1);
+}
+
 /* Per-device cache of the queue-group flag bitmap. The lookup is
  * read-mostly: scan zeDeviceGetCommandQueueGroupProperties once,
  * remember the per-ordinal flags. flags==NULL means "we already checked
@@ -495,103 +572,112 @@ struct _ze_event_pool_entry {
 
 struct _ze_event_pool_entry *_ze_event_pools = NULL;
 
-/* Two tracer-state maps share the same "ze_event_handle_t key -> value,
- * uthash ptr-keyed, calloc-on-miss" shape. Stamp the boilerplate once per
- * (NAME, VALTYPE) instead of hand-writing each set of accessors.
- *
- * _ZE_EVENT_MAP_DEFINE emits, for a POINTER value type:
- *   static <map global>;
- *   VALTYPE _event_##NAME##_get(ev)          -> value or NULL
- *   void    _event_##NAME##_set(ev, val)     -> no-op on ev==NULL
- *   void    _event_##NAME##_clear_if(ev,val) -> delete iff stored value==val
+/* Per-event tracer state, keyed by the user's event handle. Two facts live
+ * here, both populated around drain and both bound to the event's lifetime, so
+ * they share one uthash entry (one lookup, one alloc, one eviction):
  *
- * _ZE_EVENT_MAP_DEFINE_NOCLEAR emits the struct/global/set plus an
- * out-param get (int _event_##NAME##_get(ev, VALTYPE *out) -> found?),
- * for a struct value where a NULL sentinel and a _clear_if make no sense.
+ *   latest  -> the most recent slot whose attr==ev. Resolves happens-before
+ *              edges: when a new Append waits on ev, that slot becomes a pred.
+ *              Set at instantiate; cleared at drain/dispose only if it still
+ *              points at the draining slot (a newer Append may have overwritten
+ *              it — don't clobber that).
+ *   kts     -> last kernel-timestamp result we drained for ev. The Append
+ *              prologue swaps the user's signal for our inj, so the user's event
+ *              carries QKT/barrier op timing, not the kernel's. At drain we read
+ *              the real kernel result from the slab and stash it here so the
+ *              user's own zeEventQueryKernelTimestamp can be served kernel
+ *              timing; re-signaling overwrites.
  *
- * Macro-stamped (not a void* container) so the value stays inline in the
- * uthash entry — no per-set heap box, and the comparator stays a plain ==. */
-#define _ZE_EVENT_MAP_DEFINE(NAME, VALTYPE)                                                        \
-  struct _ze_event_##NAME##_entry {                                                                \
-    ze_event_handle_t ev; /* key */                                                                \
-    VALTYPE v;                                                                                     \
-    UT_hash_handle hh;                                                                             \
-  };                                                                                               \
-  static struct _ze_event_##NAME##_entry *_ze_event_##NAME = NULL;                                 \
-  static inline VALTYPE _event_##NAME##_get(ze_event_handle_t ev) {                                \
-    struct _ze_event_##NAME##_entry *e = NULL;                                                     \
-    HASH_FIND_PTR(_ze_event_##NAME, &ev, e);                                                       \
-    return e ? e->v : NULL;                                                                        \
-  }                                                                                                \
-  static inline void _event_##NAME##_set(ze_event_handle_t ev, VALTYPE val) {                      \
-    if (!ev)                                                                                       \
-      return;                                                                                      \
-    struct _ze_event_##NAME##_entry *e = NULL;                                                     \
-    HASH_FIND_PTR(_ze_event_##NAME, &ev, e);                                                       \
-    if (!e) {                                                                                      \
-      e = (struct _ze_event_##NAME##_entry *)calloc(1, sizeof(*e));                                \
-      if (!e)                                                                                      \
-        return;                                                                                    \
-      e->ev = ev;                                                                                  \
-      HASH_ADD_PTR(_ze_event_##NAME, ev, e);                                                       \
-    }                                                                                              \
-    e->v = val;                                                                                    \
-  }                                                                                                \
-  static inline void _event_##NAME##_clear_if(ze_event_handle_t ev, VALTYPE val) {                 \
-    if (!ev)                                                                                       \
-      return;                                                                                      \
-    struct _ze_event_##NAME##_entry *e = NULL;                                                     \
-    HASH_FIND_PTR(_ze_event_##NAME, &ev, e);                                                       \
-    if (e && e->v == val) {                                                                        \
-      HASH_DEL(_ze_event_##NAME, e);                                                               \
-      free(e);                                                                                     \
-    }                                                                                              \
+ * The whole entry is evicted by _on_destroy_event so a recycled handle address
+ * (the L0 driver reuses freed event addresses) never serves a dead event's
+ * latest slot (a dangling pred -> UAF) or stale kts. The value stays inline in
+ * the entry — no per-set heap box. */
+struct _ze_event_state_entry {
+  ze_event_handle_t ev; /* key */
+  struct _ze_slot *latest;
+  ze_kernel_timestamp_result_t kts;
+  unsigned char has_kts;
+  UT_hash_handle hh;
+};
+static struct _ze_event_state_entry *_ze_event_state = NULL;
+
+/* Find-or-create the entry for ev. NULL only on ev==NULL or OOM. */
+static struct _ze_event_state_entry *_event_state_get_or_add(ze_event_handle_t ev) {
+  if (!ev)
+    return NULL;
+  struct _ze_event_state_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_state, &ev, e);
+  if (!e) {
+    e = (struct _ze_event_state_entry *)calloc(1, sizeof(*e));
+    if (!e)
+      return NULL;
+    e->ev = ev;
+    HASH_ADD_PTR(_ze_event_state, ev, e);
   }
+  return e;
+}
 
-#define _ZE_EVENT_MAP_DEFINE_NOCLEAR(NAME, VALTYPE)                                                \
-  struct _ze_event_##NAME##_entry {                                                                \
-    ze_event_handle_t ev; /* key */                                                                \
-    VALTYPE v;                                                                                     \
-    UT_hash_handle hh;                                                                             \
-  };                                                                                               \
-  static struct _ze_event_##NAME##_entry *_ze_event_##NAME = NULL;                                 \
-  static inline void _event_##NAME##_set(ze_event_handle_t ev, VALTYPE val) {                      \
-    if (!ev)                                                                                       \
-      return;                                                                                      \
-    struct _ze_event_##NAME##_entry *e = NULL;                                                     \
-    HASH_FIND_PTR(_ze_event_##NAME, &ev, e);                                                       \
-    if (!e) {                                                                                      \
-      e = (struct _ze_event_##NAME##_entry *)calloc(1, sizeof(*e));                                \
-      if (!e)                                                                                      \
-        return;                                                                                    \
-      e->ev = ev;                                                                                  \
-      HASH_ADD_PTR(_ze_event_##NAME, ev, e);                                                       \
-    }                                                                                              \
-    e->v = val;                                                                                    \
-  }                                                                                                \
-  static inline int _event_##NAME##_get(ze_event_handle_t ev, VALTYPE *out) {                      \
-    struct _ze_event_##NAME##_entry *e = NULL;                                                     \
-    HASH_FIND_PTR(_ze_event_##NAME, &ev, e);                                                       \
-    if (!e)                                                                                        \
-      return 0;                                                                                    \
-    *out = e->v;                                                                                   \
-    return 1;                                                                                      \
+/* Drop the entry if it carries nothing worth keeping (no latest slot, no
+ * stashed kts) — keeps the map bounded as facts are cleared. */
+static inline void _event_state_gc(struct _ze_event_state_entry *e) {
+  if (e && !e->latest && !e->has_kts) {
+    HASH_DEL(_ze_event_state, e);
+    free(e);
   }
+}
 
-/* event_latest_signaled[ev] -> the most recent slot whose attr==ev. Used to
- * resolve happens-before edges: when a new Append says "wait on ev", we record
- * the latest slot for ev as a pred. _set at instantiate; _clear_if at drain
- * removes the entry only if it still points at the draining slot (a newer
- * Append may have overwritten it — don't clobber that). */
-_ZE_EVENT_MAP_DEFINE(latest_signaled, struct _ze_slot *)
-
-/* event_kts[ev] -> last kernel-timestamp result we drained for it. The Append
- * prologue swaps the user's signal event for our inj, so the user's event ends
- * up carrying the QKT/barrier op timing, not the kernel's — a user who calls
- * zeEventQueryKernelTimestamp on their own event would read garbage. At drain
- * we read the real kernel result from the slab and stash it here keyed by the
- * user's event so the query hook can serve it back; re-signaling overwrites. */
-_ZE_EVENT_MAP_DEFINE_NOCLEAR(kts, ze_kernel_timestamp_result_t)
+static inline struct _ze_slot *_event_latest_get(ze_event_handle_t ev) {
+  struct _ze_event_state_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_state, &ev, e);
+  return e ? e->latest : NULL;
+}
+
+static inline void _event_latest_set(ze_event_handle_t ev, struct _ze_slot *slot) {
+  struct _ze_event_state_entry *e = _event_state_get_or_add(ev);
+  if (e)
+    e->latest = slot;
+}
+
+/* Clear latest iff it still points at `slot` (a newer Append may own it now). */
+static inline void _event_latest_clear_if(ze_event_handle_t ev, struct _ze_slot *slot) {
+  if (!ev)
+    return;
+  struct _ze_event_state_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_state, &ev, e);
+  if (e && e->latest == slot) {
+    e->latest = NULL;
+    _event_state_gc(e);
+  }
+}
+
+static inline void _event_kts_set(ze_event_handle_t ev, ze_kernel_timestamp_result_t val) {
+  struct _ze_event_state_entry *e = _event_state_get_or_add(ev);
+  if (e) {
+    e->kts = val;
+    e->has_kts = 1;
+  }
+}
+
+static inline int _event_kts_get(ze_event_handle_t ev, ze_kernel_timestamp_result_t *out) {
+  struct _ze_event_state_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_state, &ev, e);
+  if (!e || !e->has_kts)
+    return 0;
+  *out = e->kts;
+  return 1;
+}
+
+/* Evict the whole entry (both facts) — called when the event is destroyed. */
+static inline void _event_state_del(ze_event_handle_t ev) {
+  if (!ev)
+    return;
+  struct _ze_event_state_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_state, &ev, e);
+  if (e) {
+    HASH_DEL(_ze_event_state, e);
+    free(e);
+  }
+}
 
 /* Pop one recycled event wrapper from the per-context freelist; NULL
  * if none cached (caller falls back to creating a fresh L0 event). */
@@ -793,7 +879,7 @@ static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct
   s->preds = (struct _ze_slot **)calloc(cap, sizeof(struct _ze_slot *));
   s->n_preds = 0;
   for (uint32_t i = 0; i < s->n_waits; ++i) {
-    struct _ze_slot *p = _event_latest_signaled_get(s->waits[i]);
+    struct _ze_slot *p = _event_latest_get(s->waits[i]);
     if (p && p->live)
       s->preds[s->n_preds++] = p;
   }
@@ -823,7 +909,7 @@ static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct
   for (uint32_t i = 0; i < s->n_preds; ++i)
     s->preds[i]->refs++;
   if (s->attr)
-    _event_latest_signaled_set(s->attr, s);
+    _event_latest_set(s->attr, s);
 }
 
 /* Publish a fresh slot: shadow path appends a Query on the per-(ctx,device)
@@ -994,7 +1080,7 @@ static void _slot_dispose_resources(struct _ze_slot *s, enum _ze_slot_dispose_mo
   free(s->preds);
   s->preds = NULL;
   s->n_preds = 0;
-  _event_latest_signaled_clear_if(s->attr, s);
+  _event_latest_clear_if(s->attr, s);
   s->attr = NULL;
 }
 
@@ -1081,7 +1167,7 @@ static void _slot_drain(struct _ze_slot *s) {
                     ZE_RESULT_SUCCESS, r.global.kernelStart, r.global.kernelEnd,
                     r.context.kernelStart, r.context.kernelEnd);
   }
-  _event_latest_signaled_clear_if(s->attr, s);
+  _event_latest_clear_if(s->attr, s);
   /* Drop refs on preds; release any that hit 0 and are already drained. */
   for (uint32_t i = 0; i < s->n_preds; ++i) {
     struct _ze_slot *p = s->preds[i];
@@ -1109,6 +1195,7 @@ static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
     if (c->n_held == 0 && c != cl_data->chunks->prev)
       _cl_chunk_free(cl_data, c, /*free_slab=*/1);
   }
+  _cl_index_clear(cl_data);
   cl_data->in_flight_q = NULL;
   cl_data->in_flight_fence = NULL;
 }
@@ -1175,6 +1262,7 @@ static void _cl_data_reset(struct _ze_command_list_obj_data *cl_data) {
   struct _ze_slab_chunk *c, *tmp;
   DL_FOREACH_SAFE(cl_data->chunks, c, tmp)
     _cl_chunk_reclaim(cl_data, c);
+  _cl_index_clear(cl_data);
   cl_data->in_flight_q = NULL;
   cl_data->in_flight_fence = NULL;
 }
@@ -1190,6 +1278,10 @@ static void _cl_data_reset(struct _ze_command_list_obj_data *cl_data) {
  * referenced chunks) just like reset. */
 static void _cl_data_destroy(struct _ze_command_list_obj_data *cl_data, int ctx_dying) {
   struct _ze_slab_chunk *c, *tmp;
+  /* Unlink from the in-flight indexes before the struct is freed, or a later
+   * queue/fence sync would walk a dangling cl. (When ctx_dying the whole index
+   * is torn down separately, but unlinking here is still correct and cheap.) */
+  _cl_index_clear(cl_data);
   if (!ctx_dying) {
     DL_FOREACH_SAFE(cl_data->chunks, c, tmp)
       _cl_chunk_reclaim(cl_data, c);
@@ -1298,10 +1390,11 @@ enum _ze_sync_kind { _ZE_SYNC_CL, _ZE_SYNC_QUEUE, _ZE_SYNC_FENCE, _ZE_SYNC_EVENT
 static void _on_sync(enum _ze_sync_kind kind, void *h) {
   pthread_mutex_lock(&_ze_state_mutex);
   if (kind == _ZE_SYNC_EVENT) {
-    struct _ze_slot *s = _event_latest_signaled_get((ze_event_handle_t)h);
+    struct _ze_slot *s = _event_latest_get((ze_event_handle_t)h);
     if (s && s->owner) {
       _slot_drain(s);
       if (!_cl_any_live(s->owner)) {
+        _cl_index_clear(s->owner);
         s->owner->in_flight_q = NULL;
         s->owner->in_flight_fence = NULL;
         _imm_reset_if_drained(s->owner);
@@ -1313,13 +1406,23 @@ static void _on_sync(enum _ze_sync_kind kind, void *h) {
       _cl_drain(cl_data);
       _imm_reset_if_drained(cl_data);
     }
-  } else { /* _ZE_SYNC_QUEUE / _ZE_SYNC_FENCE: match the stamped in-flight handle */
-    struct _ze_command_list_obj_data *cl_data = NULL, *tmp = NULL;
-    HASH_ITER(hh, _ze_cls, cl_data, tmp) {
-      void *anchor = (kind == _ZE_SYNC_QUEUE) ? (void *)cl_data->in_flight_q
-                                              : (void *)cl_data->in_flight_fence;
-      if (anchor == h)
-        _cl_drain(cl_data);
+  } else { /* _ZE_SYNC_QUEUE / _ZE_SYNC_FENCE: drain just the indexed cls */
+    struct _ze_inflight_bucket *b = NULL;
+    if (kind == _ZE_SYNC_QUEUE)
+      HASH_FIND_PTR(_ze_q_index, &h, b);
+    else
+      HASH_FIND_PTR(_ze_fence_index, &h, b);
+    if (b) {
+      struct _ze_command_list_obj_data *cl_data = NULL, *tmp = NULL;
+      /* SAFE2 because _cl_drain -> _cl_index_clear unlinks cl_data from this
+       * very bucket (and may free the bucket on the last unlink). */
+      if (kind == _ZE_SYNC_QUEUE) {
+        DL_FOREACH_SAFE2(b->cls, cl_data, tmp, q_next)
+          _cl_drain(cl_data);
+      } else {
+        DL_FOREACH_SAFE2(b->cls, cl_data, tmp, f_next)
+          _cl_drain(cl_data);
+      }
     }
   }
   pthread_mutex_unlock(&_ze_state_mutex);
@@ -1340,6 +1443,23 @@ static int _on_query_kernel_timestamp(ze_event_handle_t hEvent,
   return found;
 }
 
+/* zeEventDestroy epilogue (success only). The per-event state entry is keyed by
+ * the event's HANDLE ADDRESS, which the L0 driver recycles: a fresh event
+ * created after this one is destroyed can land on the same address. Without
+ * eviction the new event inherits the dead one's entry —
+ *   .kts:    a never-signaled event's zeEventQueryKernelTimestamp would be
+ *            served the prior event's stale timing;
+ *   .latest: a wait on the reused address would resolve to a freed slot, a
+ *            use-after-free in the pred walk.
+ * Evicting the entry at destroy bounds the map to live events and closes the
+ * recycled-address reads. Gated on a successful destroy by the caller: a failed
+ * destroy leaves the event (and its address) alive, so its data stays. */
+static void _on_destroy_event(ze_event_handle_t hEvent) {
+  pthread_mutex_lock(&_ze_state_mutex);
+  _event_state_del(hEvent);
+  pthread_mutex_unlock(&_ze_state_mutex);
+}
+
 /* Execute-epilogue handler for ONE cl. Runs AFTER L0 Execute returned,
  * with the user cl in flight. Three phases:
  *
@@ -1402,6 +1522,10 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
   }
   cl_data->in_flight_q = hQueue;
   cl_data->in_flight_fence = hFence;
+  /* Index this cl under its queue (and fence) so a later queue/fence sync
+   * drains it without scanning every live cl. The force-sync+drain above
+   * already unlinked any prior in-flight membership, so no double-link. */
+  _cl_index_set(cl_data, hQueue, hFence);
 
   pthread_mutex_unlock(&_ze_state_mutex);
 }
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index 154004fc..7341d253 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -262,6 +262,16 @@ def upper_snake_case(str)
   }
 EOF
 
+# Evict our per-event state once the destroy SUCCEEDS: the driver recycles
+# handle addresses, so a fresh event can reuse this one's. Without eviction the
+# new event inherits the dead one's stashed kernel timing and a dangling
+# latest-signaled slot pointer. Gated on _retval: a failed destroy (e.g. a bad
+# handle) leaves the event alive, so its address is not reused: data must stay.
+register_epilogue 'zeEventDestroy', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && hEvent)
+    _on_destroy_event(hEvent);
+EOF
+
 # Dump memory info if required
 memory_info_dump = lambda { |ptr_name|
   "_dump_memory_info(hCommandList, #{ptr_name})"

From 1cc7086392925683ac57334142607ea8db8ce241 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
 <applenco@sunspot-uan-0001.head.cm.sunspot.alcf.anl.gov>
Date: Fri, 19 Jun 2026 19:47:02 +0000
Subject: [PATCH 54/54] ze: clang-format-18 the tracer helpers; declare ForEach
 macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

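Run clang-format-18 over tracer_ze_helpers.include.c to satisfy CI. Also
add the tracer's loop macros (_ZE_FOREACH_SLOT, DL_FOREACH, DL_FOREACH_SAFE,
DL_FOREACH_SAFE2, HASH_ITER) to .clang-format's ForEachMacros so their
bodies stay indented instead of being flattened as if they weren't loops.
As a side effect, clang-format now formats these macros like control
statements and puts a space before their opening parenthesis
(e.g. `DL_FOREACH_SAFE (...)`).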

Formatting only — no behavior change.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .clang-format                           |  6 +++
 backends/ze/tracer_ze_helpers.include.c | 72 ++++++++++++++-----------
 2 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/.clang-format b/.clang-format
index 99557dc7..cb9c4a12 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,2 +1,8 @@
 BinPackParameters: false
 ColumnLimit: 100
+ForEachMacros:
+  - _ZE_FOREACH_SLOT
+  - DL_FOREACH
+  - DL_FOREACH_SAFE
+  - DL_FOREACH_SAFE2
+  - HASH_ITER
diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 6969f590..10693c1a 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -221,8 +221,8 @@ struct _ze_slab_chunk {
  * slot order within a chunk) — the natural time order. Binds `s` to each
  * `struct _ze_slot *`. Only for read/dispose passes that do NOT free chunks
  * mid-walk; the drain path bumps n_held by hand and uses DL_FOREACH_SAFE. */
-#define _ZE_FOREACH_SLOT(cl_data, s)                                                                \
-  for (struct _ze_slab_chunk *_c = (cl_data)->chunks; _c; _c = _c->next)                            \
+#define _ZE_FOREACH_SLOT(cl_data, s)                                                               \
+  for (struct _ze_slab_chunk *_c = (cl_data)->chunks; _c; _c = _c->next)                           \
     for (struct _ze_slot *s = _c->slots, *_se = _c->slots + _c->n_used; s < _se; ++s)
 
 struct _ze_command_list_obj_data {
@@ -281,9 +281,7 @@ static struct _ze_command_list_obj_data *_cl_find(ze_command_list_handle_t comma
   return cl;
 }
 
-static void _cl_add(struct _ze_command_list_obj_data *cl) {
-  HASH_ADD_PTR(_ze_cls, ptr, cl);
-}
+static void _cl_add(struct _ze_command_list_obj_data *cl) { HASH_ADD_PTR(_ze_cls, ptr, cl); }
 
 static struct _ze_command_list_obj_data *_cl_find_and_del(ze_command_list_handle_t command_list) {
   struct _ze_command_list_obj_data *cl = _cl_find(command_list);
@@ -299,15 +297,17 @@ static struct _ze_command_list_obj_data *_cl_find_and_del(ze_command_list_handle
  * per sync — see bench/sync_scaling). Buckets are created lazily at Execute and
  * freed when they go empty at drain. */
 struct _ze_inflight_bucket {
-  void *key; /* ze_command_queue_handle_t or ze_fence_handle_t */
+  void *key;                             /* ze_command_queue_handle_t or ze_fence_handle_t */
   struct _ze_command_list_obj_data *cls; /* DL via q_prev/q_next or f_prev/f_next */
   UT_hash_handle hh;
 };
 static struct _ze_inflight_bucket *_ze_q_index = NULL;
 static struct _ze_inflight_bucket *_ze_fence_index = NULL;
 
-static void _index_link(struct _ze_inflight_bucket **index, void *key,
-                        struct _ze_command_list_obj_data *cl, int is_fence) {
+static void _index_link(struct _ze_inflight_bucket **index,
+                        void *key,
+                        struct _ze_command_list_obj_data *cl,
+                        int is_fence) {
   if (!key)
     return;
   struct _ze_inflight_bucket *b = NULL;
@@ -325,8 +325,10 @@ static void _index_link(struct _ze_inflight_bucket **index, void *key,
     DL_APPEND2(b->cls, cl, q_prev, q_next);
 }
 
-static void _index_unlink(struct _ze_inflight_bucket **index, void *key,
-                          struct _ze_command_list_obj_data *cl, int is_fence) {
+static void _index_unlink(struct _ze_inflight_bucket **index,
+                          void *key,
+                          struct _ze_command_list_obj_data *cl,
+                          int is_fence) {
   if (!key)
     return;
   struct _ze_inflight_bucket *b = NULL;
@@ -346,7 +348,8 @@ static void _index_unlink(struct _ze_inflight_bucket **index, void *key,
 /* Link cl into the queue (and, if non-NULL, fence) in-flight indexes. Called
  * once per Execute, after in_flight_q/in_flight_fence are stamped. */
 static void _cl_index_set(struct _ze_command_list_obj_data *cl,
-                          ze_command_queue_handle_t q, ze_fence_handle_t f) {
+                          ze_command_queue_handle_t q,
+                          ze_fence_handle_t f) {
   _index_link(&_ze_q_index, q, cl, /*is_fence=*/0);
   _index_link(&_ze_fence_index, f, cl, /*is_fence=*/1);
 }
@@ -440,7 +443,9 @@ static int _ordinal_is_compute(ze_device_handle_t device, uint32_t ordinal) {
     return 0;
   struct _ze_qgroup_cache_entry *e = _qgroup_cache_get(device);
   return e && ordinal < e->n_groups &&
-         (e->flags[ordinal] & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) ? 1 : 0;
+                 (e->flags[ordinal] & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)
+             ? 1
+             : 0;
 }
 
 /* Per-(context, device) tracer-owned immediate OOO compute cl used by
@@ -530,7 +535,9 @@ static void _shadow_append_query(struct _ze_shadow_cl *sh,
 
 static inline void _on_create_command_list(ze_command_list_handle_t command_list,
                                            ze_device_handle_t device,
-                                           uint32_t ordinal, int immediate, int in_order) {
+                                           uint32_t ordinal,
+                                           int immediate,
+                                           int in_order) {
   struct _ze_command_list_obj_data *cl_data =
       (struct _ze_command_list_obj_data *)calloc(1, sizeof(*cl_data));
   if (!cl_data) {
@@ -785,9 +792,8 @@ static struct _ze_event_h *_get_profiling_event(ze_context_handle_t context) {
  * doomed ctx is at best racy). Slot-side cleanup (events, waits, preds)
  * is the caller's responsibility — this helper only owns the chunk
  * envelope and the slab. */
-static void _cl_chunk_free(struct _ze_command_list_obj_data *cl_data,
-                           struct _ze_slab_chunk *c,
-                           int free_slab) {
+static void
+_cl_chunk_free(struct _ze_command_list_obj_data *cl_data, struct _ze_slab_chunk *c, int free_slab) {
   DL_DELETE(cl_data->chunks, c);
   if (free_slab && c->slab)
     ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
@@ -932,8 +938,10 @@ static void _slot_publish(struct _ze_command_list_obj_data *cl_data,
  * Fires when Appended for immediate cls and on every Execute for regular cls
  * (it is now part of the cl body). The QKT signaling user_signal IS the
  * user_signal chain — no separate barrier needed. */
-static void _append_inline_query(ze_command_list_handle_t command_list, struct _ze_slot *s,
-                                 ze_event_handle_t inj_event, ze_event_handle_t user_signal) {
+static void _append_inline_query(ze_command_list_handle_t command_list,
+                                 struct _ze_slot *s,
+                                 ze_event_handle_t inj_event,
+                                 ze_event_handle_t user_signal) {
   _ZE_MUST(ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(
       command_list, 1, &inj_event, s->chunk->slab, &s->off, user_signal, 1, &inj_event));
 }
@@ -945,7 +953,8 @@ static void _append_inline_query(ze_command_list_handle_t command_list, struct _
  * L0 Append on the user cl and touches no tracer state, so it is correct
  * both inside the critical section (shadow path) and outside it (the
  * failure-path compensation). Aborts on L0 failure (a silent hang is worse). */
-static int _chain_user_signal(ze_command_list_handle_t command_list, ze_event_handle_t inj_event,
+static int _chain_user_signal(ze_command_list_handle_t command_list,
+                              ze_event_handle_t inj_event,
                               ze_event_handle_t user_signal) {
   if (!user_signal)
     return 0;
@@ -1185,7 +1194,7 @@ static void _slot_drain(struct _ze_slot *s) {
  * low-to-high — natural time order for emission). */
 static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
   struct _ze_slab_chunk *c, *tmp;
-  DL_FOREACH_SAFE(cl_data->chunks, c, tmp) {
+  DL_FOREACH_SAFE (cl_data->chunks, c, tmp) {
     /* Bump refcount during traversal so the last _slot_drain doesn't
      * free c out from under the inner loop. Drop after, free here. */
     c->n_held++;
@@ -1204,7 +1213,7 @@ static void _cl_data_reset(struct _ze_command_list_obj_data *cl_data); /* fwd */
 
 /* 1 if any slot in the cl is still in flight (instantiated, not yet drained). */
 static int _cl_any_live(struct _ze_command_list_obj_data *cl_data) {
-  _ZE_FOREACH_SLOT(cl_data, s)
+  _ZE_FOREACH_SLOT (cl_data, s)
     if (s->live)
       return 1;
   return 0;
@@ -1232,8 +1241,7 @@ static void _imm_reset_if_drained(struct _ze_command_list_obj_data *cl_data) {
  * struct alive with n_pinned = #referenced slots. The downstream drains that
  * drop those refs free the struct (see _slot_release's detached branch).
  * Without this, freeing the chunk here would dangle the referrers' preds[]. */
-static void _cl_chunk_reclaim(struct _ze_command_list_obj_data *cl_data,
-                              struct _ze_slab_chunk *c) {
+static void _cl_chunk_reclaim(struct _ze_command_list_obj_data *cl_data, struct _ze_slab_chunk *c) {
   uint32_t pinned = 0;
   for (uint32_t i = 0; i < c->n_used; ++i) {
     struct _ze_slot *s = &c->slots[i];
@@ -1260,7 +1268,7 @@ static void _cl_chunk_reclaim(struct _ze_command_list_obj_data *cl_data,
  * empty for reuse. Used by the zeCommandListReset hook. */
 static void _cl_data_reset(struct _ze_command_list_obj_data *cl_data) {
   struct _ze_slab_chunk *c, *tmp;
-  DL_FOREACH_SAFE(cl_data->chunks, c, tmp)
+  DL_FOREACH_SAFE (cl_data->chunks, c, tmp)
     _cl_chunk_reclaim(cl_data, c);
   _cl_index_clear(cl_data);
   cl_data->in_flight_q = NULL;
@@ -1283,12 +1291,12 @@ static void _cl_data_destroy(struct _ze_command_list_obj_data *cl_data, int ctx_
    * is torn down separately, but unlinking here is still correct and cheap.) */
   _cl_index_clear(cl_data);
   if (!ctx_dying) {
-    DL_FOREACH_SAFE(cl_data->chunks, c, tmp)
+    DL_FOREACH_SAFE (cl_data->chunks, c, tmp)
       _cl_chunk_reclaim(cl_data, c);
     free(cl_data);
     return;
   }
-  DL_FOREACH_SAFE(cl_data->chunks, c, tmp) {
+  DL_FOREACH_SAFE (cl_data->chunks, c, tmp) {
     for (uint32_t i = 0; i < c->n_used; ++i)
       _slot_dispose_resources(&c->slots[i], _ZE_DISPOSE_WRAPPER);
     _cl_chunk_free(cl_data, c, /*free_slab=*/0);
@@ -1333,7 +1341,7 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
   /* 1) Drop cls bound to this ctx. */
   pthread_mutex_lock(&_ze_state_mutex);
   struct _ze_command_list_obj_data *cl_data = NULL, *cl_tmp = NULL;
-  HASH_ITER(hh, _ze_cls, cl_data, cl_tmp) {
+  HASH_ITER (hh, _ze_cls, cl_data, cl_tmp) {
     if (cl_data->cached_context != hContext)
       continue;
     HASH_DEL(_ze_cls, cl_data);
@@ -1342,7 +1350,7 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
 
   /* 2) Shadow cls keyed by (ctx, device). */
   struct _ze_shadow_cl *sh = NULL, *sh_tmp = NULL;
-  HASH_ITER(hh, _ze_shadow_cls, sh, sh_tmp) {
+  HASH_ITER (hh, _ze_shadow_cls, sh, sh_tmp) {
     if (sh->key.context != hContext)
       continue;
     HASH_DEL(_ze_shadow_cls, sh);
@@ -1357,7 +1365,7 @@ static void _on_destroy_context(ze_context_handle_t hContext) {
   if (pe) {
     HASH_DEL(_ze_event_pools, pe);
     struct _ze_event_h *w, *w_tmp;
-    DL_FOREACH_SAFE(pe->events, w, w_tmp) {
+    DL_FOREACH_SAFE (pe->events, w, w_tmp) {
       if (w->event)
         ZE_EVENT_DESTROY_PTR(w->event);
       if (w->event_pool)
@@ -1417,10 +1425,10 @@ static void _on_sync(enum _ze_sync_kind kind, void *h) {
       /* SAFE2 because _cl_drain -> _cl_index_clear unlinks cl_data from this
        * very bucket (and may free the bucket on the last unlink). */
       if (kind == _ZE_SYNC_QUEUE) {
-        DL_FOREACH_SAFE2(b->cls, cl_data, tmp, q_next)
+        DL_FOREACH_SAFE2 (b->cls, cl_data, tmp, q_next)
           _cl_drain(cl_data);
       } else {
-        DL_FOREACH_SAFE2(b->cls, cl_data, tmp, f_next)
+        DL_FOREACH_SAFE2 (b->cls, cl_data, tmp, f_next)
           _cl_drain(cl_data);
       }
     }
@@ -1494,7 +1502,7 @@ static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
   struct _ze_shadow_cl *sh = NULL;
   int sh_resolved = 0;
   struct _ze_slab_chunk *c;
-  DL_FOREACH(cl_data->chunks, c) {
+  DL_FOREACH (cl_data->chunks, c) {
     for (uint32_t j = 0; j < c->n_used; ++j) {
       struct _ze_slot *slot = &c->slots[j];
       if (!slot->inj)