diff --git a/.clang-format b/.clang-format
index 99557dc73..cb9c4a123 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,2 +1,8 @@
 BinPackParameters: false
 ColumnLimit: 100
+ForEachMacros:
+  - _ZE_FOREACH_SLOT
+  - DL_FOREACH
+  - DL_FOREACH_SAFE
+  - DL_FOREACH_SAFE2
+  - HASH_ITER
diff --git a/backends/ze/Makefile.am b/backends/ze/Makefile.am
index 2445e1d61..942c0947b 100644
--- a/backends/ze/Makefile.am
+++ b/backends/ze/Makefile.am
@@ -278,9 +278,12 @@ TRACE_COMMON = \
 	tests/interval_profiling_normal.thapi_text_pretty \
 	tests/interval_profiling_multithread.thapi_text_pretty \
 	tests/interval_profiling_API_call.thapi_text_pretty \
-	tests/interval_profiling_fast.thapi_text_pretty \
 	tests/interval_profiling_interleave_process.thapi_text_pretty \
-	tests/interval_profiling_ignore.thapi_text_pretty
+	tests/interval_profiling_ignore.thapi_text_pretty \
+	tests/interval_profiling_shared_event.thapi_text_pretty \
+	tests/interval_profiling_resubmit_event.thapi_text_pretty \
+	tests/interval_profiling_shared_event_resubmit.thapi_text_pretty \
+	tests/interval_profiling_shared_event_xphase.thapi_text_pretty
 
 BTX_ZE_GENERATED_SOURCE_TEST = \
         btx_source_ze_test/metababel/metababel.h \
diff --git a/backends/ze/btx_zeinterval_callbacks.cpp b/backends/ze/btx_zeinterval_callbacks.cpp
index c6355fdb0..2a1980e8f 100644
--- a/backends/ze/btx_zeinterval_callbacks.cpp
+++ b/backends/ze/btx_zeinterval_callbacks.cpp
@@ -559,6 +559,20 @@ static void hSignalEvent_rest_entry_callback(void *btx_handle,
       hCommandList, name, ts, btx_event_t::OTHER, {}};
 }
 
+static void zeCommandListAppendSignalEvent_entry_callback(void *btx_handle,
+                                                          void *usr_data,
+                                                          int64_t ts,
+                                                          const char *hostname,
+                                                          int64_t vpid,
+                                                          uint64_t vtid,
+                                                          ze_command_list_handle_t hCommandList,
+                                                          ze_event_handle_t hEvent) {
+  (void)hEvent; // unused: only the command list, API name and timestamp are recorded
+  auto *data = static_cast<data_t *>(usr_data);
+  data->threadToLastLaunchInfo[{hostname, vpid, vtid}] = { // overwrite this thread's entry
+      hCommandList, "zeCommandListAppendSignalEvent", ts, btx_event_t::SIGNAL, {}};
+}
+
 /*
  *             _                              _                   _
  *     _   _  /   _  ._ _  ._ _   _. ._   _| / \      _       _  |_     _   _    _|_  _
@@ -584,9 +598,11 @@ zeCommandQueueExecuteCommandLists_entry_callback(void *btx_handle,
   const auto commandQueueDesc = data->commandQueueToDesc[{hostname, vpid, hCommandQueue}];
   for (size_t i = 0; i < _phCommandLists_vals_length; i++) {
     for (auto &hEvent : data->commandListToEvents[{hostname, vpid, phCommandLists_vals[i]}]) {
-      auto &h = data->eventToBtxDesct[{hostname, vpid, hEvent}];
-      std::get<ze_command_queue_desc_t>(h) = commandQueueDesc;
-      std::get<int64_t>(h) = ts;
+      auto &ring = data->eventToBtxDesct[{hostname, vpid, hEvent}];
+      for (auto &h : ring.entries) {
+        std::get<ze_command_queue_desc_t>(h) = commandQueueDesc;
+        std::get<int64_t>(h) = ts;
+      }
     }
   }
 }
@@ -825,11 +841,16 @@ static void event_profiling_callback(void *btx_handle,
   }
 
   // If not IMM will be commandQueueDesc overwrited latter
-  data->eventToBtxDesct[{hostname, vpid, hEvent}] = {vtid,         commandQueueDesc,
-                                                     hCommandList, hCommandListIsImmediate,
-                                                     hDevice,      commandName,
-                                                     ts_min,       clockLttngDevice,
-                                                     type,         ptr};
+  // Push onto the per-event ring. If the cursor has advanced (we've
+  // already consumed at least one result for this event), the prior
+  // ring belongs to a finished build phase — clear and start fresh.
+  auto &ring = data->eventToBtxDesct[{hostname, vpid, hEvent}];
+  if (ring.cursor > 0) {
+    ring.entries.clear();
+    ring.cursor = 0;
+  }
+  ring.entries.push_back({vtid, commandQueueDesc, hCommandList, hCommandListIsImmediate, hDevice,
+                          commandName, ts_min, clockLttngDevice, type, ptr});
   // Prepare job for non IMM
   if (!hCommandListIsImmediate)
     data->commandListToEvents[{hostname, vpid, hCommandList}].insert(hEvent);
@@ -880,14 +901,17 @@ static void event_profiling_result_callback(void *btx_handle,
 
   auto *data = static_cast<data_t *>(usr_data);
 
-  // TODO: Should  we always find the eventToBtxDesct?
-  // We didn't find the partial payload, that mean we should ignore it
+  // Ignore results with no ring entry for this event. Otherwise wrap the cursor to 0 when it is
+  // past the end, read that slot, then advance; resubmits thus re-cycle through the same ring.
   const auto it_p = data->eventToBtxDesct.find({hostname, vpid, hEvent});
-  if (it_p == data->eventToBtxDesct.cend())
+  if (it_p == data->eventToBtxDesct.cend() || it_p->second.entries.empty())
     return;
-  // We don't erase, may have one entry for multiple result
+  auto &ring = it_p->second;
+  if (ring.cursor >= ring.entries.size())
+    ring.cursor = 0;
   const auto &[vtid_submission, commandQueueDesc, hCommandList, hCommandListIsImmediate, device,
-               commandName, lltngMin, clockLttngDevice, type, ptr] = it_p->second;
+               commandName, lltngMin, clockLttngDevice, type, ptr] = ring.entries[ring.cursor];
+  ring.cursor++;
   std::string metadata = "";
   {
     std::stringstream ss_metadata;
@@ -901,6 +925,13 @@ static void event_profiling_result_callback(void *btx_handle,
   if (!hCommandListIsImmediate)
     data->commandListToEvents[{hostname, vpid, hCommandList}].erase(hEvent);
 
+  /* AppendSignalEvent is a host-side signal with no GPU work to time.
+   * We pushed a ring entry to keep state consistent (so a future
+   * profiling_results lookup doesn't walk a stale prior entry), but
+   * suppress the device-side tally emission here. */
+  if (type == btx_event_t::SIGNAL)
+    return;
+
   if ((type == btx_event_t::TRAFFIC) && (status == ZE_RESULT_SUCCESS)) {
     auto &[ts, size] = std::get<btx_additional_info_traffic_t>(ptr);
     btx_push_message_lttng_traffic(btx_handle, hostname, vpid, vtid, ts, BACKEND_ZE,
@@ -1400,6 +1431,12 @@ void btx_register_usr_callbacks(void *btx_handle) {
   REGISTER_ASSOCIATED_CALLBACK(eventMemory_without_hSignalEvent_exit);
   REGISTER_ASSOCIATED_CALLBACK(hSignalEvent_rest_entry);
 
+  /* zeCommandListAppendSignalEvent doesn't match the hSignalEvent_* sets
+   * (payload is `hEvent`, not `hSignalEvent`), so it needs its own entry
+   * callback to keep threadToLastLaunchInfo from going stale. */
+  btx_register_callbacks_lttng_ust_ze_zeCommandListAppendSignalEvent_entry(
+      btx_handle, &zeCommandListAppendSignalEvent_entry_callback);
+
   /* Remove Memory */
   REGISTER_ASSOCIATED_CALLBACK(memFree_entry);
   REGISTER_ASSOCIATED_CALLBACK(memFree_exit);
diff --git a/backends/ze/btx_zeinterval_callbacks.hpp b/backends/ze/btx_zeinterval_callbacks.hpp
index 80c2dc119..a6cdb0e0f 100644
--- a/backends/ze/btx_zeinterval_callbacks.hpp
+++ b/backends/ze/btx_zeinterval_callbacks.hpp
@@ -55,7 +55,9 @@ using btx_kernel_group_size_t = std::tuple<uint32_t, uint32_t, uint32_t>;
 using btx_kernel_desct_t =
     std::tuple<std::string /*ze_kernel_desc_t*/, ze_kernel_properties_t, btx_kernel_group_size_t>;
 
-enum class btx_event_t { TRAFFIC, KERNEL, OTHER };
+// SIGNAL = zeCommandListAppendSignalEvent. Ring entry is created so state
+// stays consistent, but filtered out of the device tally (no GPU work).
+enum class btx_event_t { TRAFFIC, KERNEL, SIGNAL, OTHER };
 using btx_additional_info_traffic_t = std::tuple<int64_t /*ts*/, size_t /*size*/>;
 using btx_additional_info_kernel_t = std::string /*metadata*/;
 using btx_additional_info =
@@ -93,7 +95,18 @@ struct data_s {
   std::unordered_map<hp_command_queue_t, ze_command_queue_desc_t> commandQueueToDesc;
 
   std::unordered_map<hpt_t, btx_launch_desc_t> threadToLastLaunchInfo;
-  std::unordered_map<hp_event_t, btx_event_desct_t> eventToBtxDesct;
+
+  /* Per-event metadata ring. An hEvent can be the signal event of N
+   * Appends in one build phase, and the cl can be resubmitted M times,
+   * yielding M*N result events. We store the N Appends as a vector and
+   * advance `cursor` per result, wrapping at the end. A new push that
+   * arrives after the cursor advanced indicates a new build phase —
+   * we clear and start over so the ring tracks only the current phase. */
+  struct event_ring_t {
+    std::vector<btx_event_desct_t> entries; // descriptors pushed by event_profiling, in order
+    size_t cursor = 0;                      // index of the next entry a result consumes
+  };
+  std::unordered_map<hp_event_t, event_ring_t> eventToBtxDesct;
   // Require for non IMM
   std::unordered_map<hp_command_list_t, std::unordered_set<ze_event_handle_t>> commandListToEvents;
 
diff --git a/backends/ze/gen_ze.rb b/backends/ze/gen_ze.rb
index 12dc75dfc..9e662b952 100644
--- a/backends/ze/gen_ze.rb
+++ b/backends/ze/gen_ze.rb
@@ -8,6 +8,7 @@
     #include <dlfcn.h>
     #include <stdio.h>
     #include <stdlib.h>
+    #include <alloca.h>
     #include <unistd.h>
     #include <string.h>
     #include <pthread.h>
diff --git a/backends/ze/tests/interval_profiling_fast.bt_text_pretty b/backends/ze/tests/interval_profiling_resubmit_event.bt_text_pretty
similarity index 57%
rename from backends/ze/tests/interval_profiling_fast.bt_text_pretty
rename to backends/ze/tests/interval_profiling_resubmit_event.bt_text_pretty
index 3403ebcdb..68e12d805 100644
--- a/backends/ze/tests/interval_profiling_fast.bt_text_pretty
+++ b/backends/ze/tests/interval_profiling_resubmit_event.bt_text_pretty
@@ -1,2 +1,3 @@
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
 lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
-lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, err = false }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
diff --git a/backends/ze/tests/interval_profiling_fast.thapi_text_pretty b/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty
similarity index 50%
rename from backends/ze/tests/interval_profiling_fast.thapi_text_pretty
rename to backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty
index fb6f10a79..b4c3ca9b4 100644
--- a/backends/ze/tests/interval_profiling_fast.thapi_text_pretty
+++ b/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty
@@ -1,4 +1,8 @@
+# 1 Append, but the underlying cl is Executed twice in a real run, so
+# 2 results arrive for the same hEvent. Both are attributed to that one
+# Append.
 12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
-12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x1000000000000000 }
-12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x1000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 }
-12:00:00.030000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.100000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 }
+12:00:00.200000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 100, globalEnd: 130, contextStart: 100, contextEnd: 130 }
diff --git a/backends/ze/tests/interval_profiling_shared_event.bt_text_pretty b/backends/ze/tests/interval_profiling_shared_event.bt_text_pretty
new file mode 100644
index 000000000..2205557d8
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_shared_event.bt_text_pretty
@@ -0,0 +1,8 @@
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000200, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000300, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000200, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 40, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000300, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 50, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
diff --git a/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty
new file mode 100644
index 000000000..64199d25b
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty
@@ -0,0 +1,18 @@
+# 4 Appends share one hEvent. Each Append's result is attributed back to
+# its own Append, in submission order.
+12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.100000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.110000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.120000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.200000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.210000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.220000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.300000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.310000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.320000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.400000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 }
+12:00:00.410000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 100, globalEnd: 130, contextStart: 100, contextEnd: 130 }
+12:00:00.420000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 200, globalEnd: 240, contextStart: 200, contextEnd: 240 }
+12:00:00.430000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 300, globalEnd: 350, contextStart: 300, contextEnd: 350 }
diff --git a/backends/ze/tests/interval_profiling_shared_event_resubmit.bt_text_pretty b/backends/ze/tests/interval_profiling_shared_event_resubmit.bt_text_pretty
new file mode 100644
index 000000000..25a5c77b2
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_shared_event_resubmit.bt_text_pretty
@@ -0,0 +1,6 @@
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 40, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 50, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
diff --git a/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty
new file mode 100644
index 000000000..fb64b5d7a
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty
@@ -0,0 +1,13 @@
+# 2 Appends share one hEvent, then the underlying cl is Executed twice,
+# so 4 results arrive. Each submission's pair of results is attributed to
+# the two Appends in submission order.
+12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.100000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.110000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.120000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.200000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 }
+12:00:00.210000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 100, globalEnd: 130, contextStart: 100, contextEnd: 130 }
+12:00:00.300000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 200, globalEnd: 240, contextStart: 200, contextEnd: 240 }
+12:00:00.310000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 300, globalEnd: 350, contextStart: 300, contextEnd: 350 }
diff --git a/backends/ze/tests/interval_profiling_shared_event_xphase.bt_text_pretty b/backends/ze/tests/interval_profiling_shared_event_xphase.bt_text_pretty
new file mode 100644
index 000000000..ebbbc90a5
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_shared_event_xphase.bt_text_pretty
@@ -0,0 +1,10 @@
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 40, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 50, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000400, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000400, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000400, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
+lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000400, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 40, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" }
diff --git a/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty
new file mode 100644
index 000000000..e9d336d89
--- /dev/null
+++ b/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty
@@ -0,0 +1,21 @@
+# Two build phases on the same cl, both reusing the same hEvent.
+# Phase 1: 2 Appends, cl Executed twice -> 4 results.
+# Phase 2: cl is Reset, then 1 Append, cl Executed three times -> 3 results.
+# Results from a phase are attributed only to that phase's Appends; the
+# phase-1 Appends do not bleed into the phase-2 results.
+12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.100000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.110000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.120000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.200000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 }
+12:00:00.210000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 100, globalEnd: 130, contextStart: 100, contextEnd: 130 }
+12:00:00.300000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 200, globalEnd: 240, contextStart: 200, contextEnd: 240 }
+12:00:00.310000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 300, globalEnd: 350, contextStart: 300, contextEnd: 350 }
+12:00:00.400000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 }
+12:00:00.410000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 }
+12:00:00.420000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS }
+12:00:00.500000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 400, globalEnd: 410, contextStart: 400, contextEnd: 410 }
+12:00:00.510000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 500, globalEnd: 520, contextStart: 500, contextEnd: 520 }
+12:00:00.520000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 600, globalEnd: 640, contextStart: 600, contextEnd: 640 }
diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c
index 8cfe31d7a..10693c1ac 100644
--- a/backends/ze/tracer_ze_helpers.include.c
+++ b/backends/ze/tracer_ze_helpers.include.c
@@ -1,30 +1,139 @@
-#ifdef THAPI_DEBUG
-#define TAHPI_LOG stderr
-#define THAPI_DBGLOG(fmt, ...)                                                                     \
-  do {                                                                                             \
-    fprintf(TAHPI_LOG, "THAPI(%s:%d): " fmt "\n", __func__, __LINE__, __VA_ARGS__);                \
-  } while (0)
-#define THAPI_DBGLOG_NO_ARGS(fmt)                                                                  \
+/* Algorithm
+ * =========
+ *
+ * On profiled Append (cl, sig=user_sig, waits=user_waits):
+ *   - allocate inj from per-context pool; swap user_sig -> inj
+ *   - place a Query (see "QKT placement" below)
+ *   - allocate a slot {inj, attr=user_sig, off, waits=copy(user_waits)}
+ *   - immediate cl: instantiate(slot) inline
+ *
+ * instantiate(s):
+ *   - s.preds = [event_latest_signaled[w] for w in s.waits if live]
+ *                + previous live slot in same cl (if cl is in-order)
+ *   - s.live = true; event_latest_signaled[s.attr] = &s
+ *
+ * On Execute(q, cl) prologue:
+ *   - if cl.in_flight_q: Synchronize(in_flight_q); drain_cl(cl)
+ *   - shadow-path slots: re-Append Query on shadow cl
+ *     inline-path slots: nothing (Query is baked into cl body)
+ *   - instantiate every slot in cl
+ *   - cl.in_flight_q = q; index cl under q (and its fence) for sync lookup
+ *
+ * On Sync (the synced anchor tells us what to drain):
+ *   - Sync(ev):  drain(event_latest_signaled[ev])
+ *   - Sync(q):   drain_cl(cl) for every cl in the q-index bucket for q
+ *                (O(matching cls), not a scan of every live cl)
+ *   - Sync(cl):  drain_cl(cl)
+ *
+ * drain(s):
+ *   - for p in s.preds: drain(p)
+ *   - shadow-path: host-sync on shadow_done, reset, decrement live_queries
+ *   - read slab[s.off], emit tracepoint(s.attr or inj)
+ *   - clear event_latest_signaled[s.attr] (if it still points at s)
+ *   - clear s.live and s.preds
+ *   (Build-time fields inj, attr, off, waits stay so the next Execute
+ *    can re-instantiate without re-Appending.)
+ *
+ * Concurrency
+ * ===========
+ *
+ * One global mutex (_ze_state_mutex) covers all tracer state: the cl
+ * registry, every cl's chunks/slots/preds, the event freelist + pool
+ * registry, the latest-signaled map, the shadow cl registry, the
+ * qgroup cache. Append / Execute / Drain / Destroy all take it.
+ *
+ * Per-cl mutexes don't work because drain follows cross-cl pred edges
+ * (event_latest_signaled[ev] can point at a slot in any cl) and
+ * mutates the pred's chunk via _slot_release. Any per-cl scheme has
+ * to acquire multiple cl mutexes with cross-cl ordering rules. One
+ * global mutex sidesteps that entirely.
+ *
+ * Perf: Append on different cls and freelist accesses serialize
+ * through one lock. The L0 calls inside the critical section
+ * (AppendBarrier, AppendQueryKernelTimestamps) just queue work on
+ * the GPU — the GPU executes asynchronously, so the held region is
+ * short. Drain is host-blocking (zeEventHostSynchronize on shadow
+ * fence events) and was effectively serial anyway.
+ *
+ * QKT placement
+ * =============
+ *
+ * AppendQueryKernelTimestamps (the device-side timestamp read) lives
+ * in one of two places, picked at cl create from the queue group's
+ * COMPUTE flag and stored in cl_data->is_compute. Both paths share the
+ * slot/drain/dep-graph machinery; they only differ in where the QKT is
+ * Appended and how the drain knows it has fired.
+ *
+ *   INLINE (user cl is on a COMPUTE queue group):
+ *
+ *     Kernel(sig=inj) ──> QKT(wait=inj, sig=user_signal)   [on user cl]
+ *
+ *     One Append. user_signal IS the QKT-done edge — any user-level
+ *     sync (event/queue/cl) that covers user_signal also covers the
+ *     QKT. No tracer fence event, no host-sync at drain. For regular
+ *     cls the QKT is baked into the cl body once and re-fires on every
+ *     Execute.
+ *
+ *   SHADOW (user cl is copy-only, or queue group unknown):
+ *
+ *                       ┌─> Barrier(wait=inj, sig=user_signal) [on user cl]
+ *     Kernel(sig=inj) ──┤
+ *                       └─> QKT(wait=inj, sig=shadow_done)     [on shadow cl]
+ *
+ *     Two Appends. The shadow cl is a per-(context, device) tracer-owned
+ *     immediate compute cl; QKT goes there because copy queue groups
+ *     reject AppendQueryKernelTimestamps. shadow_done is a tracer-owned
+ *     fence event that drain host-syncs on — required because the
+ *     shadow cl's completion isn't implied by any user-level sync. For
+ *     regular cls the shadow QKT is (re-)Appended in the Execute
+ *     epilogue (the user cl is in flight by then, so Appending the
+ *     Query won't deadlock on a shared engine).
+ */
+
+/* Always-on tracer log. Prefixes THAPI(func:line) so messages are
+ * grep-able across the bench/test harness which often interleaves
+ * tracer and user output. GCC's `, ##__VA_ARGS__` extension swallows
+ * the leading comma when the variadic list is empty. fflush so the
+ * line lands even if we abort() right after. */
+#define _THAPI_LOG(fmt, ...)                                                                       \
   do {                                                                                             \
-    fprintf(TAHPI_LOG, "THAPI(%s:%d): " fmt "\n", __func__, __LINE__);                             \
+    fprintf(stderr, "THAPI(%s:%d): " fmt "\n", __func__, __LINE__, ##__VA_ARGS__);                 \
+    fflush(stderr);                                                                                \
   } while (0)
+
+#ifdef THAPI_DEBUG
+#define THAPI_DBGLOG(fmt, ...) _THAPI_LOG(fmt, ##__VA_ARGS__)
 #else
 #define THAPI_DBGLOG(...)                                                                          \
   do {                                                                                             \
   } while (0)
-#define THAPI_DBGLOG_NO_ARGS(fmt)                                                                  \
+#endif
+
+/* Tracer invariant check: print + abort. Unconditional (not gated on
+ * NDEBUG) — silently dropping the check would let the bug ship bad
+ * data instead of crashing. Use for "this can never happen" preconditions
+ * inside the tracer, not for user-input validation. */
+#define _THAPI_ASSERT(cond, fmt, ...)                                                              \
   do {                                                                                             \
+    if (!(cond)) {                                                                                 \
+      _THAPI_LOG("assertion failed: %s — " fmt, #cond, ##__VA_ARGS__);                             \
+      abort();                                                                                     \
+    }                                                                                              \
   } while (0)
-#endif
 
-#ifdef THAPI_USE_DESTRUCTORS
-#define THAPI_ATTRIBUTE_DESTRUCTOR __attribute__((destructor))
-#else
-#define THAPI_ATTRIBUTE_DESTRUCTOR
-#endif
+/* Wrap a tracer-issued L0 call whose failure means we'd either hang the
+ * user (sync chain Barrier) or produce a non-self-consistent trace
+ * (QKT, event create, ...). Defensive: print + abort so the bug surfaces
+ * under sanitizers/CI rather than ship bad data. NOT for driver query
+ * calls (Get*Handle, GetCommandQueueGroupProperties) — those can fail
+ * transiently during teardown and have graceful fallbacks. */
+#define _ZE_MUST(call)                                                                             \
+  do {                                                                                             \
+    ze_result_t _r = (call);                                                                       \
+    _THAPI_ASSERT(_r == ZE_RESULT_SUCCESS, "%s = 0x%x", #call, _r);                                \
+  } while (0)
 
 static int _do_profile = 0;
-static int _do_cleanup = 0;
 static int _do_chained_structs = 0;
 static int _do_paranoid_drift = 0;
 static int _do_paranoid_memory_location = 0;
@@ -43,110 +152,425 @@ struct ze_closure {
 
 struct ze_closure *ze_closures = NULL;
 
-typedef enum _ze_command_list_flag { _ZE_EXECUTED = ZE_BIT(0) } _ze_command_list_flag_t;
-typedef _ze_command_list_flag_t _ze_command_list_flags_t;
-
 struct _ze_event_h;
+struct _ze_slot;
+struct _ze_slab_chunk;
+
+/* Dependency-tracking slot: one per profiled Append. Slots carry the
+ * happens-before edges the user established (via cl in-order semantics
+ * and via phWaitEvents). At sync time we walk these edges from the
+ * synced anchor and drain everything reachable. Drain is pop semantics:
+ * after emit, the slot is dropped from the cl's list. */
+struct _ze_slot {
+  struct _ze_command_list_obj_data *owner; /* cl_data this slot lives in */
+  struct _ze_slab_chunk *chunk; /* chunk this slot lives in (==> .slab to read at drain) */
+  /* Shadow path only: shadow cl the Query was Appended to. Inline-path
+   * slots leave this NULL — their Query lives in the user cl body and
+   * the dep-graph walk that triggers drain already implies it has run. */
+  struct _ze_shadow_cl *sh;
+  struct _ze_event_h *inj; /* tracer-owned event the Query waits on */
+  /* Shadow path only: tracer-owned fence event the Query signals; drain
+   * host-syncs on it. Inline-path slots leave this NULL. */
+  struct _ze_event_h *shadow_done;
+  ze_event_handle_t attr; /* user's original signal event (NULL => inj->event) */
+  size_t off;             /* byte offset within chunk->slab */
+  /* User wait events copied at Append time (stable across rebuilds);
+   * preds[] is computed at instantiate from waits[] by looking up
+   * event_latest_signaled[w] for each w. */
+  ze_event_handle_t *waits;
+  uint32_t n_waits;
+  struct _ze_slot **preds; /* points at slots whose drain must come first (may be in another cl) */
+  uint32_t n_preds;
+  unsigned char live; /* in-flight (instantiated, not drained) */
+  /* Incoming pred edges: count of downstream slots whose preds[] points
+   * here AND that have not yet been drained. Incremented at downstream
+   * _slot_instantiate, decremented at downstream _slot_drain. Slot is
+   * reclaimable iff live==0 AND refs==0. */
+  uint32_t refs;
+};
+
+#define _ZE_SLAB_CHUNK_SLOTS 64
+
+/* Slot + slab storage in fixed-size chunks; cl_data->chunks is a utlist
+ * DL of these. Imm cls allocate new chunks as needed (no cap); regular
+ * cls stop at one chunk — the inj events (and on the inline path, the
+ * QKT itself) are baked into the closed cl body, so adding a chunk
+ * after Close would create slots the body doesn't address.
+ *
+ * Within a chunk, slots[i].off is i * sizeof(timestamp) into slab. The
+ * chunk frees itself when n_held drops to 0 AND it is not the tail
+ * (new Appends still want to land on the tail). */
+struct _ze_slab_chunk {
+  void *slab;                   /* _ZE_SLAB_CHUNK_SLOTS * sizeof(ze_kernel_timestamp_result_t) */
+  ze_context_handle_t slab_ctx; /* context the slab was allocated against (zeMemFree target) */
+  uint32_t n_used;              /* slots ever assigned in this chunk (monotonic until chunk free) */
+  uint32_t n_held;              /* unreleased slots (n_used minus _slot_release calls) */
+  /* Nonzero only on a DETACHED chunk: one whose owning cl was torn down
+   * (reset/destroy) while >=1 slot was still referenced as a pred by a live
+   * slot in ANOTHER cl. The chunk is removed from cl_data->chunks, its slots'
+   * resources are already released and owner==NULL — only the struct survives
+   * so the referrers' preds[] pointers stay valid. n_pinned counts those
+   * surviving referenced slots; the downstream drain that drops the last ref
+   * frees the struct. 0 for normal attached chunks. */
+  uint32_t n_pinned;
+  struct _ze_slab_chunk *next, *prev;
+  struct _ze_slot slots[_ZE_SLAB_CHUNK_SLOTS];
+};
+
+/* Iterate every used slot in a cl, oldest-to-newest (chunk DL order, then
+ * slot order within a chunk) — the natural time order. Binds `s` to each
+ * `struct _ze_slot *`. Only for read/dispose passes that do NOT free chunks
+ * mid-walk; the drain path bumps n_held by hand and uses DL_FOREACH_SAFE. */
+#define _ZE_FOREACH_SLOT(cl_data, s)                                                               \
+  for (struct _ze_slab_chunk *_c = (cl_data)->chunks; _c; _c = _c->next)                           \
+    for (struct _ze_slot *s = _c->slots, *_se = _c->slots + _c->n_used; s < _se; ++s)
 
 struct _ze_command_list_obj_data {
-  void *ptr; /* the ze_command_list_handle_t this entry tracks */
+  void *ptr;
   UT_hash_handle hh;
-  _ze_command_list_flags_t flags;
-  struct _ze_event_h *events;
+
+  struct _ze_slab_chunk *chunks; /* utlist DL_ head; tail = chunks->prev; ->next ends in NULL */
+
+  /* in_flight_q is the queue this cl was last Executed on AND not yet
+   * drained. NULL means "not in flight" — safe to Execute without a
+   * force-sync. Set on Execute, cleared on drain.
+   *
+   * Held only for regular cls; immediate cls never Execute. */
+  ze_command_queue_handle_t in_flight_q;
+  /* The fence (if any) passed to that same Execute. NULL when the user
+   * Executed without a fence. Lets a fence-only sync find which cls to
+   * drain — the fence signals when all cls in its Execute complete, so
+   * zeFenceHostSynchronize(f) drains every cl whose in_flight_fence == f.
+   * Set on Execute alongside in_flight_q, cleared together on drain. */
+  ze_fence_handle_t in_flight_fence;
+  unsigned char is_immediate;
+  unsigned char is_in_order;
+  /* 1 if this cl's queue group exposes COMPUTE — its body can host
+   * AppendQueryKernelTimestamps directly, so we skip the per-(ctx,device)
+   * shadow cl and bake QKT into the user cl itself. See the placement
+   * diagram at the top of this file. 0 for copy-only cls and for any cl
+   * whose group flags we couldn't determine. Set at create; immutable. */
+  unsigned char is_compute;
+
+  /* Cached on first use: context handle for this cl. Immutable for the
+   * cl's lifetime. Load-bearing for _on_destroy_context's sweep: lets it
+   * associate cls back to their ctx without an L0 roundtrip per cl. */
+  ze_context_handle_t cached_context;
+
+  /* Membership in the per-queue / per-fence in-flight indexes (see
+   * _ze_q_index / _ze_fence_index below). A cl in flight is linked into both
+   * its queue's bucket (q_prev/q_next) and, if Executed with a fence, its
+   * fence's bucket (f_prev/f_next), so a queue/fence sync drains exactly the
+   * matching cls without scanning every live cl. Linked at Execute (_cl_index_set),
+   * unlinked at drain (_cl_index_clear). */
+  struct _ze_command_list_obj_data *q_prev, *q_next;
+  struct _ze_command_list_obj_data *f_prev, *f_next;
 };
 
 struct _ze_command_list_obj_data *_ze_cls = NULL;
-pthread_mutex_t _ze_cls_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-#define FIND_ZE_CL(key, val)                                                                       \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_cls_mutex);                                                            \
-    HASH_FIND_PTR(_ze_cls, key, val);                                                              \
-    pthread_mutex_unlock(&_ze_cls_mutex);                                                          \
-  } while (0)
+/* The single mutex covering all tracer state — see the "Concurrency"
+ * section in the file header for rationale. Every static helper in this
+ * file that touches tracer state assumes the caller holds it. */
+pthread_mutex_t _ze_state_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-#define ADD_ZE_CL(val)                                                                             \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_cls_mutex);                                                            \
-    HASH_ADD_PTR(_ze_cls, ptr, val);                                                               \
-    pthread_mutex_unlock(&_ze_cls_mutex);                                                          \
-  } while (0)
+/* Pure HASH wrappers. */
+static struct _ze_command_list_obj_data *_cl_find(ze_command_list_handle_t command_list) {
+  struct _ze_command_list_obj_data *cl = NULL;
+  HASH_FIND_PTR(_ze_cls, &command_list, cl);
+  return cl;
+}
 
-#define FIND_AND_DEL_ZE_CL(key, val)                                                               \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_cls_mutex);                                                            \
-    HASH_FIND_PTR(_ze_cls, key, val);                                                              \
-    if (val) {                                                                                     \
-      HASH_DEL(_ze_cls, val);                                                                      \
-    }                                                                                              \
-    pthread_mutex_unlock(&_ze_cls_mutex);                                                          \
-  } while (0)
+static void _cl_add(struct _ze_command_list_obj_data *cl) { HASH_ADD_PTR(_ze_cls, ptr, cl); }
 
-static inline void _on_create_command_list(ze_command_list_handle_t command_list, int immediate) {
-  struct _ze_command_list_obj_data *cl_data = NULL;
+static struct _ze_command_list_obj_data *_cl_find_and_del(ze_command_list_handle_t command_list) {
+  struct _ze_command_list_obj_data *cl = _cl_find(command_list);
+  if (cl)
+    HASH_DEL(_ze_cls, cl);
+  return cl;
+}
 
-  FIND_ZE_CL(&command_list, cl_data);
-  if (cl_data) {
-    THAPI_DBGLOG("Command list already registered: %p", command_list);
+/* In-flight indexes: queue handle -> the cls currently in flight on that queue,
+ * and fence handle -> the cls in flight under that fence. A queue/fence sync
+ * completes exactly the cls of the matching Execute, so these let _on_sync
+ * drain just those cls instead of scanning every live cl (which is O(live cls)
+ * per sync — see bench/sync_scaling). Buckets are created lazily at Execute and
+ * freed when they go empty at drain. */
+struct _ze_inflight_bucket {
+  void *key;                             /* ze_command_queue_handle_t or ze_fence_handle_t */
+  struct _ze_command_list_obj_data *cls; /* DL via q_prev/q_next or f_prev/f_next */
+  UT_hash_handle hh;
+};
+static struct _ze_inflight_bucket *_ze_q_index = NULL;
+static struct _ze_inflight_bucket *_ze_fence_index = NULL;
+
+static void _index_link(struct _ze_inflight_bucket **index,
+                        void *key,
+                        struct _ze_command_list_obj_data *cl,
+                        int is_fence) {
+  if (!key)
     return;
+  struct _ze_inflight_bucket *b = NULL;
+  HASH_FIND_PTR(*index, &key, b);
+  if (!b) {
+    b = (struct _ze_inflight_bucket *)calloc(1, sizeof(*b));
+    if (!b)
+      return;
+    b->key = key;
+    HASH_ADD_PTR(*index, key, b);
   }
+  if (is_fence)
+    DL_APPEND2(b->cls, cl, f_prev, f_next);
+  else
+    DL_APPEND2(b->cls, cl, q_prev, q_next);
+}
 
-  cl_data = (struct _ze_command_list_obj_data *)calloc(1, sizeof(*cl_data));
-  if (!cl_data) {
-    THAPI_DBGLOG_NO_ARGS("Failed to allocate memory");
+static void _index_unlink(struct _ze_inflight_bucket **index,
+                          void *key,
+                          struct _ze_command_list_obj_data *cl,
+                          int is_fence) {
+  if (!key)
+    return;
+  struct _ze_inflight_bucket *b = NULL;
+  HASH_FIND_PTR(*index, &key, b);
+  if (!b)
     return;
+  if (is_fence)
+    DL_DELETE2(b->cls, cl, f_prev, f_next);
+  else
+    DL_DELETE2(b->cls, cl, q_prev, q_next);
+  if (!b->cls) {
+    HASH_DEL(*index, b);
+    free(b);
   }
+}
 
-  cl_data->ptr = (void *)command_list;
-  /* Immediate cls have no Execute step; their appends run on the device the
-   * moment they're submitted. Treat them as already-executed so drainers
-   * (Reset/Destroy hooks) query their events via _ZE_EXECUTED uniformly. */
-  if (immediate)
-    cl_data->flags = _ZE_EXECUTED;
+/* Link cl into the queue (and, if non-NULL, fence) in-flight indexes. Called
+ * once per Execute, after in_flight_q/in_flight_fence are stamped. */
+static void _cl_index_set(struct _ze_command_list_obj_data *cl,
+                          ze_command_queue_handle_t q,
+                          ze_fence_handle_t f) {
+  _index_link(&_ze_q_index, q, cl, /*is_fence=*/0);
+  _index_link(&_ze_fence_index, f, cl, /*is_fence=*/1);
+}
+
+/* Remove cl from both in-flight indexes. Uses cl's own in_flight_q/_fence as
+ * the keys, so it MUST run before those are cleared. Idempotent: a cl not in
+ * flight has NULL keys and is a no-op. */
+static void _cl_index_clear(struct _ze_command_list_obj_data *cl) {
+  _index_unlink(&_ze_q_index, cl->in_flight_q, cl, /*is_fence=*/0);
+  _index_unlink(&_ze_fence_index, cl->in_flight_fence, cl, /*is_fence=*/1);
+}
+
+/* Per-device cache of the queue-group flag bitmap. The lookup is
+ * read-mostly: scan zeDeviceGetCommandQueueGroupProperties once,
+ * remember the per-ordinal flags. Only successful, non-empty queries are
+ * cached; a failed or empty query is retried on the next call. Used by two readers:
+ *   _get_compute_ordinal(dev)        -> first COMPUTE ord, or -1
+ *   _ordinal_is_compute(dev, ord)    -> 1 if ord is COMPUTE on dev */
+struct _ze_qgroup_cache_entry {
+  ze_device_handle_t device;
+  ze_command_queue_group_property_flags_t *flags; /* owned; n_groups entries */
+  uint32_t n_groups;
+  UT_hash_handle hh;
+};
+static struct _ze_qgroup_cache_entry *_ze_qgroup_cache = NULL;
+
+/* Populate (or return cached) flag bitmap for device. The cache lives
+ * for process lifetime. First-touch L0 queries happen under the state
+ * mutex; cost is bounded since lookups are once per device. */
+static struct _ze_qgroup_cache_entry *_qgroup_cache_get(ze_device_handle_t device) {
+  struct _ze_qgroup_cache_entry *e = NULL;
+  HASH_FIND_PTR(_ze_qgroup_cache, &device, e);
+  if (e)
+    return e;
+
+  uint32_t n_groups = 0;
+  if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, NULL) !=
+          ZE_RESULT_SUCCESS ||
+      n_groups == 0)
+    return NULL;
+  ze_command_queue_group_properties_t *groups =
+      (ze_command_queue_group_properties_t *)calloc(n_groups, sizeof(*groups));
+  if (!groups)
+    return NULL;
+  for (uint32_t i = 0; i < n_groups; ++i)
+    groups[i].stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES;
+  if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, groups) !=
+      ZE_RESULT_SUCCESS) {
+    free(groups);
+    return NULL;
+  }
+  ze_command_queue_group_property_flags_t *flags =
+      (ze_command_queue_group_property_flags_t *)calloc(n_groups, sizeof(*flags));
+  if (!flags) {
+    free(groups);
+    return NULL;
+  }
+  for (uint32_t i = 0; i < n_groups; ++i)
+    flags[i] = groups[i].flags;
+  free(groups);
 
-  ADD_ZE_CL(cl_data);
+  e = (struct _ze_qgroup_cache_entry *)calloc(1, sizeof(*e));
+  if (!e) {
+    free(flags);
+    return NULL;
+  }
+  e->device = device;
+  e->flags = flags;
+  e->n_groups = n_groups;
+  HASH_ADD_PTR(_ze_qgroup_cache, device, e);
+  return e;
 }
 
-typedef enum _ze_event_flag { _ZE_IMMEDIATE_CMD = ZE_BIT(0) } _ze_event_flag_t;
-typedef _ze_event_flag_t _ze_event_flags_t;
+/* Returns the first COMPUTE queue group ordinal for device, or (uint32_t)-1
+ * if the device exposes no compute group (fatal — caller should bail). */
+static uint32_t _get_compute_ordinal(ze_device_handle_t device) {
+  struct _ze_qgroup_cache_entry *e = _qgroup_cache_get(device);
+  if (!e)
+    return (uint32_t)-1;
+  for (uint32_t i = 0; i < e->n_groups; ++i)
+    if (e->flags[i] & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)
+      return i;
+  return (uint32_t)-1;
+}
 
+/* 1 iff `ordinal` on `device` is a COMPUTE queue group. Returns 0 on any
+ * uncertainty (unknown device, OOB ordinal, driver error) — callers
+ * should treat the cl as non-compute and use the shadow-cl QKT path. */
+static int _ordinal_is_compute(ze_device_handle_t device, uint32_t ordinal) {
+  if (!device)
+    return 0;
+  struct _ze_qgroup_cache_entry *e = _qgroup_cache_get(device);
+  return e && ordinal < e->n_groups &&
+                 (e->flags[ordinal] & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)
+             ? 1
+             : 0;
+}
+
+/* Per-(context, device) tracer-owned immediate OOO compute cl used by
+ * the SHADOW path to host AppendQueryKernelTimestamps. Copy queue
+ * groups reject QKT, so the shadow cl exists to give those user cls
+ * somewhere compute-capable to put their Query. Compute user cls take
+ * the INLINE path and never touch a shadow cl — see the QKT placement
+ * diagram at the top of this file. */
+struct _ze_shadow_key {
+  ze_context_handle_t context;
+  ze_device_handle_t device;
+};
+struct _ze_shadow_cl {
+  struct _ze_shadow_key key;
+  ze_command_list_handle_t cl;
+  uint32_t live_queries; /* QKTs appended but not yet host-synced */
+  UT_hash_handle hh;
+};
+static struct _ze_shadow_cl *_ze_shadow_cls = NULL;
+
+/* Returns the shadow cl for (context, device), creating it lazily on
+ * first use (first-touch L0 zeCommandListCreateImmediate runs under
+ * the state mutex; cost bounded). Returns NULL if the device has no
+ * compute group (fatal: log to stderr) or if creation fails. */
+static struct _ze_shadow_cl *_get_shadow_cl(ze_context_handle_t context,
+                                            ze_device_handle_t device) {
+  struct _ze_shadow_key key = {context, device};
+  struct _ze_shadow_cl *sh = NULL;
+  HASH_FIND(hh, _ze_shadow_cls, &key, sizeof(key), sh);
+  if (sh)
+    return sh;
+
+  uint32_t ord = _get_compute_ordinal(device);
+  if (ord == (uint32_t)-1) {
+    fprintf(stderr,
+            "THAPI: device %p has no COMPUTE queue group; "
+            "cannot create shadow cl. Profiling disabled for "
+            "command lists on this device.\n",
+            (void *)device);
+    return NULL;
+  }
+  /* ASYNCHRONOUS mode is critical: with SYNCHRONOUS (the DEFAULT),
+   * each AppendQueryKernelTimestamps on this immediate cl blocks until
+   * the Query completes — which it can't, because Query is waiting on
+   * inj, and inj is signaled by the user cl's kernel that hasn't been
+   * submitted yet (we're called from the user's Execute prologue).
+   * Deadlock. ASYNCHRONOUS lets the Append return immediately and the
+   * Query run device-side at its own pace. */
+  ze_command_queue_desc_t qd = {
+      ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, NULL, ord, 0, 0, ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+      ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+  ze_command_list_handle_t new_cl = NULL;
+  if (ZE_COMMAND_LIST_CREATE_IMMEDIATE_PTR(context, device, &qd, &new_cl) != ZE_RESULT_SUCCESS ||
+      !new_cl) {
+    fprintf(stderr,
+            "THAPI: failed to create shadow cl for "
+            "context=%p device=%p\n",
+            (void *)context, (void *)device);
+    return NULL;
+  }
+  sh = (struct _ze_shadow_cl *)calloc(1, sizeof(*sh));
+  if (!sh) {
+    ZE_COMMAND_LIST_DESTROY_PTR(new_cl);
+    return NULL;
+  }
+  sh->key = key;
+  sh->cl = new_cl;
+  HASH_ADD(hh, _ze_shadow_cls, key, sizeof(sh->key), sh);
+  return sh;
+}
+
+/* Append AppendQueryKernelTimestamps on the shadow cl: wait on inj,
+ * signal shadow_done, write timestamps into slab[*off]. The state
+ * mutex also serializes the not-thread-safe-per-cl-handle L0 Append
+ * on the shared shadow cl. Aborts on L0 failure (defensive — a missing
+ * Query would silently drop this kernel's timing). */
+static void _shadow_append_query(struct _ze_shadow_cl *sh,
+                                 ze_event_handle_t inj_event,
+                                 void *slab,
+                                 size_t *off,
+                                 ze_event_handle_t shadow_done_event) {
+  sh->live_queries++;
+  _ZE_MUST(ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(sh->cl, 1, &inj_event, slab, off,
+                                                              /*hSignalEvent=*/shadow_done_event,
+                                                              /*numWaitEvents=*/1, &inj_event));
+}
+
+static inline void _on_create_command_list(ze_command_list_handle_t command_list,
+                                           ze_device_handle_t device,
+                                           uint32_t ordinal,
+                                           int immediate,
+                                           int in_order) {
+  struct _ze_command_list_obj_data *cl_data =
+      (struct _ze_command_list_obj_data *)calloc(1, sizeof(*cl_data));
+  if (!cl_data) {
+    THAPI_DBGLOG("Failed to allocate memory");
+    return;
+  }
+  cl_data->ptr = (void *)command_list;
+  cl_data->is_immediate = immediate ? 1 : 0;
+  cl_data->is_in_order = in_order ? 1 : 0;
+
+  pthread_mutex_lock(&_ze_state_mutex);
+  /* _ordinal_is_compute touches the qgroup cache (state-mutex-protected). */
+  cl_data->is_compute = _ordinal_is_compute(device, ordinal) ? 1 : 0;
+  if (_cl_find(command_list)) {
+    pthread_mutex_unlock(&_ze_state_mutex);
+    THAPI_DBGLOG("Command list already registered: %p", command_list);
+    free(cl_data);
+    return;
+  }
+  _cl_add(cl_data);
+  pthread_mutex_unlock(&_ze_state_mutex);
+}
+
+/* Wrapper around an injected event we own. Lives either in the per-context
+ * free pool (between uses) or anchored to a slot in one of cl_data->chunks (in flight). */
 struct _ze_event_h {
   ze_event_handle_t event;
-  UT_hash_handle hh;
   ze_event_pool_handle_t event_pool;
   ze_context_handle_t context;
-  _ze_event_flags_t flags;
-  /* to remember events in command lists */
+  /* doubly-linked list pointers used by the per-context free pool */
   struct _ze_event_h *next, *prev;
 };
 
-static struct _ze_event_h *_ze_events = NULL;
-static pthread_mutex_t _ze_events_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-#define FIND_ZE_EVENT(key, val)                                                                    \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_events_mutex);                                                         \
-    HASH_FIND_PTR(_ze_events, key, val);                                                           \
-    pthread_mutex_unlock(&_ze_events_mutex);                                                       \
-  } while (0)
-
-#define ADD_ZE_EVENT(val)                                                                          \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_events_mutex);                                                         \
-    HASH_ADD_PTR(_ze_events, event, val);                                                          \
-    pthread_mutex_unlock(&_ze_events_mutex);                                                       \
-  } while (0)
-
-#define FIND_AND_DEL_ZE_EVENT(key, val)                                                            \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_events_mutex);                                                         \
-    HASH_FIND_PTR(_ze_events, key, val);                                                           \
-    if (val) {                                                                                     \
-      HASH_DEL(_ze_events, val);                                                                   \
-    }                                                                                              \
-    pthread_mutex_unlock(&_ze_events_mutex);                                                       \
-  } while (0)
-
 struct _ze_event_pool_entry {
   ze_context_handle_t context;
   UT_hash_handle hh;
@@ -154,166 +578,196 @@ struct _ze_event_pool_entry {
 };
 
 struct _ze_event_pool_entry *_ze_event_pools = NULL;
-static pthread_mutex_t _ze_event_pools_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-#define GET_ZE_EVENT(key, val)                                                                     \
-  do {                                                                                             \
-    struct _ze_event_pool_entry *pool = NULL;                                                      \
-    pthread_mutex_lock(&_ze_event_pools_mutex);                                                    \
-    HASH_FIND_PTR(_ze_event_pools, key, pool);                                                     \
-    if (pool && pool->events) {                                                                    \
-      val = pool->events;                                                                          \
-      DL_DELETE(pool->events, val);                                                                \
-    } else                                                                                         \
-      val = NULL;                                                                                  \
-    pthread_mutex_unlock(&_ze_event_pools_mutex);                                                  \
-  } while (0)
+/* Per-event tracer state, keyed by the user's event handle. Two facts live
+ * here, both populated around drain and both bound to the event's lifetime, so
+ * they share one uthash entry (one lookup, one alloc, one eviction):
+ *
+ *   latest  -> the most recent slot whose attr==ev. Resolves happens-before
+ *              edges: when a new Append waits on ev, that slot becomes a pred.
+ *              Set at instantiate; cleared at drain/dispose only if it still
+ *              points at the draining slot (a newer Append may have overwritten
+ *              it — don't clobber that).
+ *   kts     -> last kernel-timestamp result we drained for ev. The Append
+ *              prologue swaps the user's signal for our inj, so the user's event
+ *              carries QKT/barrier op timing, not the kernel's. At drain we read
+ *              the real kernel result from the slab and stash it here so the
+ *              user's own zeEventQueryKernelTimestamp can be served kernel
+ *              timing; re-signaling overwrites.
+ *
+ * The whole entry is evicted by _on_destroy_event so a recycled handle address
+ * (the L0 driver reuses freed event addresses) never serves a dead event's
+ * latest slot (a dangling pred -> UAF) or stale kts. The value stays inline in
+ * the entry — no per-set heap box. */
+struct _ze_event_state_entry {
+  ze_event_handle_t ev; /* key */
+  struct _ze_slot *latest;
+  ze_kernel_timestamp_result_t kts;
+  unsigned char has_kts;
+  UT_hash_handle hh;
+};
+static struct _ze_event_state_entry *_ze_event_state = NULL;
 
-#define PUT_ZE_EVENT(val)                                                                          \
-  do {                                                                                             \
-    struct _ze_event_pool_entry *pool = NULL;                                                      \
-    pthread_mutex_lock(&_ze_event_pools_mutex);                                                    \
-    HASH_FIND_PTR(_ze_event_pools, &(val->context), pool);                                         \
-    if (!pool) {                                                                                   \
-      pool = (struct _ze_event_pool_entry *)calloc(1, sizeof(struct _ze_event_pool_entry));        \
-      if (!pool) {                                                                                 \
-        THAPI_DBGLOG_NO_ARGS("Failed to allocate memory");                                         \
-        pthread_mutex_unlock(&_ze_event_pools_mutex);                                              \
-        if (val->event_pool) {                                                                     \
-          if (val->event)                                                                          \
-            ZE_EVENT_DESTROY_PTR(val->event);                                                      \
-          ZE_EVENT_POOL_DESTROY_PTR(val->event_pool);                                              \
-        }                                                                                          \
-        free(val);                                                                                 \
-        break;                                                                                     \
-      }                                                                                            \
-      pool->context = val->context;                                                                \
-      HASH_ADD_PTR(_ze_event_pools, context, pool);                                                \
-    }                                                                                              \
-    val->flags = 0;                                                                                \
-    ZE_EVENT_HOST_RESET_PTR(val->event);                                                           \
-    DL_PREPEND(pool->events, val);                                                                 \
-    pthread_mutex_unlock(&_ze_event_pools_mutex);                                                  \
-  } while (0)
+/* Find-or-create the entry for ev. NULL only on ev==NULL or OOM. */
+static struct _ze_event_state_entry *_event_state_get_or_add(ze_event_handle_t ev) {
+  if (!ev)
+    return NULL;
+  struct _ze_event_state_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_state, &ev, e);
+  if (!e) {
+    e = (struct _ze_event_state_entry *)calloc(1, sizeof(*e));
+    if (!e)
+      return NULL;
+    e->ev = ev;
+    HASH_ADD_PTR(_ze_event_state, ev, e);
+  }
+  return e;
+}
 
-struct _ze_event_h *_ze_event_wrappers = NULL;
-static pthread_mutex_t _ze_event_wrappers_mutex = PTHREAD_MUTEX_INITIALIZER;
+/* Drop the entry if it carries nothing worth keeping (no latest slot, no
+ * stashed kts) — keeps the map bounded as facts are cleared. */
+static inline void _event_state_gc(struct _ze_event_state_entry *e) {
+  if (e && !e->latest && !e->has_kts) {
+    HASH_DEL(_ze_event_state, e);
+    free(e);
+  }
+}
 
-#define GET_ZE_EVENT_WRAPPER(val)                                                                  \
-  do {                                                                                             \
-    pthread_mutex_lock(&_ze_event_wrappers_mutex);                                                 \
-    if (_ze_event_wrappers) {                                                                      \
-      val = _ze_event_wrappers;                                                                    \
-      DL_DELETE(_ze_event_wrappers, val);                                                          \
-    } else {                                                                                       \
-      val = calloc(1, sizeof(struct _ze_event_h));                                                 \
-    }                                                                                              \
-    pthread_mutex_unlock(&_ze_event_wrappers_mutex);                                               \
-  } while (0)
+static inline struct _ze_slot *_event_latest_get(ze_event_handle_t ev) {
+  struct _ze_event_state_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_state, &ev, e);
+  return e ? e->latest : NULL;
+}
 
-#define PUT_ZE_EVENT_WRAPPER(val)                                                                  \
-  do {                                                                                             \
-    memset(val, 0, sizeof(struct _ze_event_h));                                                    \
-    pthread_mutex_lock(&_ze_event_wrappers_mutex);                                                 \
-    DL_PREPEND(_ze_event_wrappers, val);                                                           \
-    pthread_mutex_unlock(&_ze_event_wrappers_mutex);                                               \
-  } while (0)
+static inline void _event_latest_set(ze_event_handle_t ev, struct _ze_slot *slot) {
+  struct _ze_event_state_entry *e = _event_state_get_or_add(ev);
+  if (e)
+    e->latest = slot;
+}
 
-/* Snapshot context + immediate-flag from cmdlist into the event wrapper.
- * The immediate flag is read at register time (not at _on_reset_event
- * time) because by reset time the cmdlist may already be destroyed and
- * zeCommandListIsImmediate would dereference a freed handle. */
-static inline void _tag_event_from_cl(struct _ze_event_h *_ze_event,
-                                      ze_command_list_handle_t command_list) {
-  ze_context_handle_t context = NULL;
-  ze_result_t res = ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &context);
-  if (res == ZE_RESULT_SUCCESS && context)
-    _ze_event->context = context;
-  else
-    THAPI_DBGLOG("zeCommandListGetContextHandle failed with %d for command list: %p", res,
-                 command_list);
-
-  ze_bool_t is_immediate = 0;
-  if (ZE_COMMAND_LIST_IS_IMMEDIATE_PTR(command_list, &is_immediate) == ZE_RESULT_SUCCESS &&
-      is_immediate)
-    _ze_event->flags |= _ZE_IMMEDIATE_CMD;
-}
-
-/* Append an event wrapper we own to its cmdlist's events list, under the
- * cl-hash lock (the FIND_AND_DEL/ADD pattern guards cl_data against a
- * concurrent free in _on_destroy_command_list). */
-static inline void _attach_event_to_cl(struct _ze_event_h *_ze_event,
-                                       ze_command_list_handle_t command_list) {
-  struct _ze_command_list_obj_data *cl_data = NULL;
-  FIND_AND_DEL_ZE_CL(&command_list, cl_data);
-  if (!cl_data) {
-    THAPI_DBGLOG("Could not get command list associated to event: %p", _ze_event->event);
+/* Clear latest iff it still points at `slot` (a newer Append may own it now). */
+static inline void _event_latest_clear_if(ze_event_handle_t ev, struct _ze_slot *slot) {
+  if (!ev)
     return;
+  struct _ze_event_state_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_state, &ev, e);
+  if (e && e->latest == slot) {
+    e->latest = NULL;
+    _event_state_gc(e);
   }
-  DL_APPEND(cl_data->events, _ze_event);
-  ADD_ZE_CL(cl_data);
 }
 
-/* Register an injected (tracer-owned) event. Caller has already populated
- * _ze_event->event and _ze_event->event_pool via _get_profiling_event. */
-static inline void _register_our_event(struct _ze_event_h *_ze_event,
-                                       ze_command_list_handle_t command_list) {
-  _tag_event_from_cl(_ze_event, command_list);
-  _attach_event_to_cl(_ze_event, command_list);
-  ADD_ZE_EVENT(_ze_event);
+static inline void _event_kts_set(ze_event_handle_t ev, ze_kernel_timestamp_result_t val) {
+  struct _ze_event_state_entry *e = _event_state_get_or_add(ev);
+  if (e) {
+    e->kts = val;
+    e->has_kts = 1;
+  }
 }
 
-/* Register a user event (we don't own its lifetime). Look up or create the
- * wrapper; users are responsible for reset/destroy, so we don't attach it
- * to the cl's events list. */
-static inline void _register_user_event(ze_event_handle_t event,
-                                        ze_command_list_handle_t command_list) {
-  struct _ze_event_h *_ze_event = NULL;
-  FIND_ZE_EVENT(&event, _ze_event);
-  if (_ze_event)
-    return; /* already tracked, nothing more to do */
+static inline int _event_kts_get(ze_event_handle_t ev, ze_kernel_timestamp_result_t *out) {
+  struct _ze_event_state_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_state, &ev, e);
+  if (!e || !e->has_kts)
+    return 0;
+  *out = e->kts;
+  return 1;
+}
 
-  GET_ZE_EVENT_WRAPPER(_ze_event);
-  if (!_ze_event) {
-    THAPI_DBGLOG("Could not get event wrapper for: %p", event);
+/* Evict the whole entry (both facts) — called when the event is destroyed. */
+static inline void _event_state_del(ze_event_handle_t ev) {
+  if (!ev)
     return;
+  struct _ze_event_state_entry *e = NULL;
+  HASH_FIND_PTR(_ze_event_state, &ev, e);
+  if (e) {
+    HASH_DEL(_ze_event_state, e);
+    free(e);
   }
-  /* GET_ZE_EVENT_WRAPPER returns a fully-zeroed wrapper (calloc on first use,
-   * memset by PUT_ZE_EVENT_WRAPPER on recycle), so event_pool and flags are
-   * already 0 — only set the fields we actually want non-zero. */
-  _ze_event->event = event;
-
-  _tag_event_from_cl(_ze_event, command_list);
-  ADD_ZE_EVENT(_ze_event);
 }
 
-static struct _ze_event_h *_get_profiling_event(ze_command_list_handle_t command_list) {
-  struct _ze_event_h *e_w;
-
-  ze_context_handle_t context = NULL;
-  ze_result_t res = ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &context);
-  if (res != ZE_RESULT_SUCCESS || !context) {
-    THAPI_DBGLOG("zeCommandListGetContextHandle failed with %d, for command list: %p", res,
-                 command_list);
+/* Pop one recycled event wrapper from the per-context freelist; NULL
+ * if none cached (caller falls back to creating a fresh L0 event). */
+static struct _ze_event_h *_get_ze_event(ze_context_handle_t context) {
+  struct _ze_event_pool_entry *pool = NULL;
+  HASH_FIND_PTR(_ze_event_pools, &context, pool);
+  if (!pool || !pool->events)
     return NULL;
+  struct _ze_event_h *e = pool->events;
+  DL_DELETE(pool->events, e);
+  return e;
+}
+
+/* Return an event wrapper to its per-context freelist. On total failure
+ * (no bucket can be allocated), destroy the backing L0 objects and free
+ * the wrapper — we'd rather leak nothing than poison the freelist. */
+static void _put_ze_event(struct _ze_event_h *val) {
+  _ZE_MUST(ZE_EVENT_HOST_RESET_PTR(val->event));
+  struct _ze_event_pool_entry *pool = NULL;
+  HASH_FIND_PTR(_ze_event_pools, &val->context, pool);
+  if (!pool) {
+    pool = (struct _ze_event_pool_entry *)calloc(1, sizeof(*pool));
+    if (!pool) {
+      THAPI_DBGLOG("Failed to allocate memory");
+      if (val->event_pool) {
+        if (val->event)
+          ZE_EVENT_DESTROY_PTR(val->event);
+        ZE_EVENT_POOL_DESTROY_PTR(val->event_pool);
+      }
+      free(val);
+      return;
+    }
+    pool->context = val->context;
+    HASH_ADD_PTR(_ze_event_pools, context, pool);
   }
-  GET_ZE_EVENT(&context, e_w);
+  DL_PREPEND(pool->events, val);
+}
+
+struct _ze_event_h *_ze_event_wrappers = NULL;
+
+/* Get a zeroed event wrapper struct: pop from the global recycle list if
+ * any, else calloc a fresh one. The wrapper is context-agnostic — only
+ * the backing L0 event + pool inside it bind to a specific ctx. */
+static struct _ze_event_h *_get_ze_event_wrapper(void) {
+  struct _ze_event_h *e = _ze_event_wrappers;
+  if (e)
+    DL_DELETE(_ze_event_wrappers, e);
+  else
+    e = (struct _ze_event_h *)calloc(1, sizeof(*e));
+  return e;
+}
+
+/* Return a wrapper struct to the recycle list. Used in two situations:
+ *   1) wrapper construction failed, no L0 objects ever attached;
+ *   2) the wrapper's context is being destroyed — caller has already
+ *      arranged for the L0 event/pool inside to be released (or left
+ *      them to die with the context).
+ * We zero before publishing so a future _get_ze_event_wrapper returns
+ * something equivalent to a fresh calloc. */
+static void _put_ze_event_wrapper(struct _ze_event_h *val) {
+  memset(val, 0, sizeof(*val));
+  DL_PREPEND(_ze_event_wrappers, val);
+}
+
+/* Caller-supplied ctx avoids a redundant zeCommandListGetContextHandle
+ * (the prologue already fetched it). L0 event/pool create runs under
+ * the state mutex; cold path, bounded cost. */
+static struct _ze_event_h *_get_profiling_event(ze_context_handle_t context) {
+  struct _ze_event_h *e_w = _get_ze_event(context);
   if (e_w)
     return e_w;
-
-  GET_ZE_EVENT_WRAPPER(e_w);
+  e_w = _get_ze_event_wrapper();
   if (!e_w) {
-    THAPI_DBGLOG("Could not create a new event wrapper for command list: %p", command_list);
+    THAPI_DBGLOG("Could not create a new event wrapper for context: %p", context);
     return NULL;
   }
 
   ze_event_pool_desc_t desc = {
       ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, NULL,
       ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP | ZE_EVENT_POOL_FLAG_HOST_VISIBLE, 1};
-  res = ZE_EVENT_POOL_CREATE_PTR(context, &desc, 0, NULL, &e_w->event_pool);
+  ze_result_t res = ZE_EVENT_POOL_CREATE_PTR(context, &desc, 0, NULL, &e_w->event_pool);
   if (res != ZE_RESULT_SUCCESS) {
-    THAPI_DBGLOG("zeEventPoolCreate failed with %d, for command list: %p, context: %p", res,
-                 command_list, context);
+    THAPI_DBGLOG("zeEventPoolCreate failed with %d, for context: %p", res, context);
     goto cleanup_wrapper;
   }
   ze_event_desc_t e_desc = {ZE_STRUCTURE_TYPE_EVENT_DESC, NULL, 0, ZE_EVENT_SCOPE_FLAG_HOST,
@@ -328,188 +782,782 @@ static struct _ze_event_h *_get_profiling_event(ze_command_list_handle_t command
 cleanup_ep:
   ZE_EVENT_POOL_DESTROY_PTR(e_w->event_pool);
 cleanup_wrapper:
-  PUT_ZE_EVENT_WRAPPER(e_w);
+  _put_ze_event_wrapper(e_w);
   return NULL;
 }
 
-static void _profile_event_results(ze_event_handle_t event) {
-  ze_kernel_timestamp_result_t res = {0};
-  ze_result_t status;
-  ze_result_t timestamp_status;
+/* Unlink chunk c from cl_data->chunks and free its slab + struct.
+ * `free_slab` controls whether to issue zeMemFree on the slab — false when
+ * the chunk's context is being destroyed (driver reclaims; zeMemFree on a
+ * doomed ctx is at best racy). Slot-side cleanup (events, waits, preds)
+ * is the caller's responsibility — this helper only owns the chunk
+ * envelope and the slab. */
+static void
+_cl_chunk_free(struct _ze_command_list_obj_data *cl_data, struct _ze_slab_chunk *c, int free_slab) {
+  DL_DELETE(cl_data->chunks, c);
+  if (free_slab && c->slab)
+    ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
+  free(c);
+}
 
-  if (tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results)) {
-    status = ZE_EVENT_QUERY_STATUS_PTR(event);
-    timestamp_status = ZE_EVENT_QUERY_KERNEL_TIMESTAMP_PTR(event, &res);
-    do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, event, status, timestamp_status,
-                  res.global.kernelStart, res.global.kernelEnd, res.context.kernelStart,
-                  res.context.kernelEnd);
+/* Allocate a new chunk and append it to cl_data->chunks. */
+static struct _ze_slab_chunk *_cl_chunk_alloc(struct _ze_command_list_obj_data *cl_data,
+                                              ze_context_handle_t ctx) {
+  struct _ze_slab_chunk *c = (struct _ze_slab_chunk *)calloc(1, sizeof(*c));
+  if (!c)
+    return NULL;
+  size_t bytes = (size_t)_ZE_SLAB_CHUNK_SLOTS * sizeof(ze_kernel_timestamp_result_t);
+  ze_host_mem_alloc_desc_t hd = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, NULL, 0};
+  if (ZE_MEM_ALLOC_HOST_PTR(ctx, &hd, bytes, sizeof(uint64_t), &c->slab) != ZE_RESULT_SUCCESS ||
+      !c->slab) {
+    free(c);
+    return NULL;
   }
+  memset(c->slab, 0, bytes);
+  c->slab_ctx = ctx;
+  DL_APPEND(cl_data->chunks, c);
+  return c;
 }
 
-static inline void _on_destroy_event(ze_event_handle_t event) {
-  struct _ze_event_h *ze_event = NULL;
+/* Allocate one new slot at the tail of cl_data->chunks. Grows by one
+ * chunk for imm cls; regular cls stay at one chunk and return NULL when
+ * full (their inj events are baked into the closed cl body, so storage
+ * must keep addressing them via the same (slab, off) pair). */
+static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_data,
+                                        ze_context_handle_t ctx,
+                                        struct _ze_event_h *inj,
+                                        struct _ze_event_h *shadow_done,
+                                        ze_event_handle_t attr,
+                                        ze_event_handle_t *waits,
+                                        uint32_t n_waits) {
+  struct _ze_slab_chunk *tail = cl_data->chunks ? cl_data->chunks->prev : NULL;
+  if (!tail || tail->n_used >= _ZE_SLAB_CHUNK_SLOTS) {
+    if (tail && !cl_data->is_immediate) {
+      /* Regular cl is capped at one chunk (inj events are baked into the
+       * closed cl body, so storage can't move). Past the cap we drop the
+       * Append's profiling silently — warn once so the data loss is at
+       * least visible. Called under _ze_state_mutex, so the guard is safe. */
+      static int warned = 0;
+      if (!warned) {
+        warned = 1;
+        _THAPI_LOG("warning: regular command list %p exceeded %d profiled "
+                   "Appends in one build; further Appends will not be timed",
+                   (void *)cl_data->ptr, _ZE_SLAB_CHUNK_SLOTS);
+      }
+      return NULL;
+    }
+    tail = _cl_chunk_alloc(cl_data, ctx);
+    if (!tail)
+      return NULL;
+  }
+  uint32_t idx = tail->n_used;
+  struct _ze_slot *s = &tail->slots[idx];
+  /* Chunk memory is calloc'd, so all other slot fields are already zero. */
+  s->owner = cl_data;
+  s->chunk = tail;
+  s->inj = inj;
+  s->shadow_done = shadow_done;
+  s->attr = attr;
+  s->off = (size_t)idx * sizeof(ze_kernel_timestamp_result_t);
+  if (n_waits) {
+    s->waits = (ze_event_handle_t *)malloc(n_waits * sizeof(ze_event_handle_t));
+    if (s->waits) {
+      memcpy(s->waits, waits, n_waits * sizeof(ze_event_handle_t));
+      s->n_waits = n_waits;
+    }
+  }
+  tail->n_used++;
+  tail->n_held++;
+  return s;
+}
 
-  FIND_AND_DEL_ZE_EVENT(&event, ze_event);
-  if (!ze_event) {
-    return;
+/* Compute s->preds from the `latest` slot of each event in s->waits, plus the
+ * previous live slot on this cl if the cl is in-order; mark s live and publish
+ * it as `latest` for s->attr. On preds[] OOM, s is published with no edges. */
+static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct _ze_slot *s) {
+  /* Slot must be inert: live=0, preds NULL. Re-instantiating a live slot
+   * would overwrite preds[] (leaking the prior pred refs) and let the
+   * in-order pred walk pick up later-appended live siblings as predecessors,
+   * forming cycles that infinite-loop _slot_drain. */
+  _THAPI_ASSERT(!s->live, "slot %p already live (double _slot_instantiate)", (void *)s);
+  s->live = 1;
+  uint32_t cap = s->n_waits + 1; /* +1 for in-order prev */
+  s->preds = (struct _ze_slot **)calloc(cap, sizeof(struct _ze_slot *)); /* NULL on OOM */
+  s->n_preds = 0;
+  for (uint32_t i = 0; i < s->n_waits; ++i) {
+    struct _ze_slot *p = _event_latest_get(s->waits[i]);
+    if (s->preds && p && p->live) /* no preds[] (OOM): record no edge rather than crash */
+      s->preds[s->n_preds++] = p;
+  }
+  if (cl_data->is_in_order) {
+    /* Walk chunks newest-to-oldest, slots high-to-low, stop at the first
+     * live slot strictly before s. Chunks are appended in time order
+     * (DL_APPEND) and slots within a chunk in time order, so reverse-walk
+     * yields reverse time order. Skip s itself: it was marked live above,
+     * so without the != s guard the walk could pick s as its own pred. */
+    struct _ze_slab_chunk *c;
+    struct _ze_slot *prev = NULL;
+    for (c = cl_data->chunks ? cl_data->chunks->prev : NULL; c && !prev;
+         c = (c == cl_data->chunks) ? NULL : c->prev) {
+      for (int32_t i = (int32_t)c->n_used - 1; i >= 0; --i) {
+        if (&c->slots[i] == s)
+          continue;
+        if (c->slots[i].live) {
+          prev = &c->slots[i];
+          break;
+        }
+      }
+    }
+    if (s->preds && prev)
+      s->preds[s->n_preds++] = prev;
   }
+  /* Each new pred edge holds a ref on its target. */
+  for (uint32_t i = 0; i < s->n_preds; ++i)
+    s->preds[i]->refs++;
+  if (s->attr)
+    _event_latest_set(s->attr, s);
+}
 
-  _profile_event_results(event);
-  PUT_ZE_EVENT_WRAPPER(ze_event);
+/* Publish a fresh slot: shadow path appends a Query on the per-(ctx,device)
+ * shadow cl; inline path is a no-op here (its QKT is baked into the user cl
+ * body at Append). Then instantiate in the dep graph. `s->shadow_done` is
+ * the single source of truth for "shadow vs inline" — no is_compute branch
+ * at the call site. */
+static void _slot_publish(struct _ze_command_list_obj_data *cl_data,
+                          struct _ze_slot *s,
+                          struct _ze_shadow_cl *sh) {
+  if (s->shadow_done) {
+    _THAPI_ASSERT(sh, "shadow-path slot needs a shadow cl");
+    _shadow_append_query(sh, s->inj->event, s->chunk->slab, &s->off, s->shadow_done->event);
+    s->sh = sh;
+  }
+  _slot_instantiate(cl_data, s);
 }
 
-/* Caller already holds the wrapper (e.g. iterating cl_data->events) and
- * has removed it from any per-cl list. Drops it from the global events
- * hash, optionally emits its timestamp tracepoint, and recycles. */
-static inline void _unregister_ze_event(struct _ze_event_h *ze_event, int get_results) {
-  struct _ze_event_h *evicted = NULL;
-  FIND_AND_DEL_ZE_EVENT(&ze_event->event, evicted);
-  /* evicted should be == ze_event; if not, our hash bookkeeping is corrupt. */
+/* INLINE path: bake the QKT into the user cl body (wait=inj, sig=user_signal).
+ * Fires when Appended for immediate cls and on every Execute for regular cls
+ * (it is now part of the cl body). The QKT signaling user_signal IS the
+ * user_signal chain — no separate barrier needed. */
+static void _append_inline_query(ze_command_list_handle_t command_list,
+                                 struct _ze_slot *s,
+                                 ze_event_handle_t inj_event,
+                                 ze_event_handle_t user_signal) {
+  _ZE_MUST(ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(
+      command_list, 1, &inj_event, s->chunk->slab, &s->off, user_signal, 1, &inj_event));
+}
 
-  if (get_results)
-    _profile_event_results(ze_event->event);
-  if (ze_event->event_pool)
-    PUT_ZE_EVENT(ze_event);
-  else
-    PUT_ZE_EVENT_WRAPPER(ze_event);
+/* Chain the user's signal event off our inj on the user cl: the prologue
+ * swapped user_signal for inj, so without this the user's Sync(user_signal)
+ * would hang forever. No-op (returns 0) when the user passed no signal;
+ * returns 1 when the barrier was appended. Mutex-agnostic — it issues an
+ * L0 Append on the user cl and touches no tracer state, so it is correct
+ * both inside the critical section (shadow path) and outside it (the
+ * failure-path compensation). Aborts on L0 failure (a silent hang is worse). */
+static int _chain_user_signal(ze_command_list_handle_t command_list,
+                              ze_event_handle_t inj_event,
+                              ze_event_handle_t user_signal) {
+  if (!user_signal)
+    return 0;
+  _ZE_MUST(ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &inj_event));
+  return 1;
 }
 
-static inline void _on_reset_event(ze_event_handle_t event) {
-  struct _ze_event_h *ze_event = NULL;
+/* Roll back the slot just handed out by _cl_slot_append. We were the last to
+ * touch the tail chunk and hold _ze_state_mutex, so decrementing n_used/n_held
+ * and zeroing the slot is safe; if the chunk was freshly allocated only for
+ * this Append (n_used now 0), free it back so a slot-append failure doesn't
+ * leak a chunk. */
+static void _slot_append_rollback(struct _ze_command_list_obj_data *cl_data, struct _ze_slot *s) {
+  free(s->waits);
+  struct _ze_slab_chunk *c = s->chunk;
+  c->n_used--;
+  c->n_held--;
+  memset(s, 0, sizeof(*s));
+  if (c->n_used == 0)
+    _cl_chunk_free(cl_data, c, /*free_slab=*/1);
+}
 
-  FIND_AND_DEL_ZE_EVENT(&event, ze_event);
-  if (!ze_event) {
-    THAPI_DBGLOG("Could not find event: %p", event);
+/* Append-time hook from profiling_epilogue. The prologue swapped user's
+ * hSignalEvent for inj->event; user_signal is the original (possibly NULL),
+ * user_waits is the user's wait list, ctx is the cl's context (fetched
+ * once in the prologue, threaded in). Forks on cl_data->is_compute to
+ * pick the QKT placement — see "QKT placement" in the file header. */
+static void _universal_record_append(ze_command_list_handle_t command_list,
+                                     ze_context_handle_t ctx,
+                                     struct _ze_event_h *inj,
+                                     ze_event_handle_t user_signal,
+                                     ze_event_handle_t *user_waits,
+                                     uint32_t user_n_waits) {
+  if (!inj || !ctx)
     return;
+  struct _ze_event_h *shadow_done = NULL;
+  struct _ze_slot *s = NULL;
+  int barrier_chained = 0;
+
+  inj->context = ctx;
+
+  pthread_mutex_lock(&_ze_state_mutex);
+  struct _ze_command_list_obj_data *cl_data = _cl_find(command_list);
+  if (!cl_data)
+    goto fail_locked;
+  int inline_path = cl_data->is_compute;
+
+  /* Shadow path needs a fence event (Query lives on the shadow cl;
+   * drain host-syncs on it). Inline path uses user_signal as the fence
+   * via the dep graph, no extra event needed. */
+  if (!inline_path) {
+    shadow_done = _get_profiling_event(ctx);
+    if (!shadow_done)
+      goto fail_locked;
+    shadow_done->context = ctx;
   }
 
-  _profile_event_results(event);
+  /* Publish the cl->ctx mapping. _on_execute_one_cl reads it directly
+   * (no fallback fetch) when resolving the shadow cl, and
+   * _on_destroy_context's per-cl sweep matches against it. */
+  cl_data->cached_context = ctx;
 
-  if (!(ze_event->flags & _ZE_IMMEDIATE_CMD))
-    ADD_ZE_EVENT(ze_event);
-  else
-    PUT_ZE_EVENT_WRAPPER(ze_event);
+  s = _cl_slot_append(cl_data, ctx, inj, shadow_done, user_signal, user_waits, user_n_waits);
+  if (!s)
+    goto fail_locked;
+
+  if (inline_path) {
+    _append_inline_query(command_list, s, inj->event, user_signal);
+    barrier_chained = 1; /* user_signal chained via the QKT itself */
+    _slot_instantiate(cl_data, s);
+    pthread_mutex_unlock(&_ze_state_mutex);
+    return;
+  }
+
+  /* Shadow path: chain user_signal off inj on the user cl, then place
+   * the Query on the shadow cl (immediate cls only — regular cls defer
+   * to the Execute epilogue, see _on_execute_one_cl). */
+  barrier_chained = _chain_user_signal(command_list, inj->event, user_signal);
+  if (cl_data->is_immediate) {
+    ze_device_handle_t dev = NULL;
+    _ZE_MUST(ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(command_list, &dev));
+    struct _ze_shadow_cl *sh = _get_shadow_cl(ctx, dev);
+    if (!sh)
+      goto fail_locked;
+    _slot_publish(cl_data, s, sh);
+  }
+  pthread_mutex_unlock(&_ze_state_mutex);
+  return;
+
+fail_locked:
+  if (s)
+    _slot_append_rollback(cl_data, s);
+  if (shadow_done)
+    _put_ze_event(shadow_done);
+  /* Chain user_signal off inj (else Sync(user_signal) hangs) BEFORE recycling
+   * inj: _put_ze_event host-resets the event and may free the wrapper. */
+  if (!barrier_chained)
+    _chain_user_signal(command_list, inj->event, user_signal);
+  _put_ze_event(inj);
+  pthread_mutex_unlock(&_ze_state_mutex);
 }
 
-static inline void _dump_and_reset_our_event(ze_event_handle_t event) {
-  struct _ze_event_h *ze_event = NULL;
+/* Dispose the per-slot resources shared by every teardown path: the inj and
+ * shadow_done events, the waits[] copy, the preds[] array, and the `latest`
+ * pointer for s->attr (only if it still names s). Disposal target by mode:
+ *   _ZE_DISPOSE_POOL    -> _put_ze_event (ctx alive: events recycle to the pool)
+ *   _ZE_DISPOSE_WRAPPER -> _put_ze_event_wrapper (ctx dying: only recycle the
+ *                          wrapper struct; the L0 event/pool die with the ctx)
+ * Deliberately does NOT touch chunk accounting (n_held / n_pinned), refs,
+ * owner, or live — those are caller-specific and stay at the call site.
+ * Every field is nulled so the call is idempotent (safe to re-run on a slot
+ * whose preds/latest pointer were already cleared during drain). */
+enum _ze_slot_dispose_mode { _ZE_DISPOSE_POOL, _ZE_DISPOSE_WRAPPER };
+static void _slot_dispose_resources(struct _ze_slot *s, enum _ze_slot_dispose_mode mode) {
+  if (s->inj) {
+    if (mode == _ZE_DISPOSE_WRAPPER)
+      _put_ze_event_wrapper(s->inj);
+    else
+      _put_ze_event(s->inj);
+    s->inj = NULL;
+  }
+  if (s->shadow_done) {
+    if (mode == _ZE_DISPOSE_WRAPPER)
+      _put_ze_event_wrapper(s->shadow_done);
+    else
+      _put_ze_event(s->shadow_done);
+    s->shadow_done = NULL;
+  }
+  free(s->waits);
+  s->waits = NULL;
+  s->n_waits = 0;
+  free(s->preds);
+  s->preds = NULL;
+  s->n_preds = 0;
+  _event_latest_clear_if(s->attr, s); /* leaves latest alone if a newer slot owns it */
+  s->attr = NULL;
+}
 
-  FIND_AND_DEL_ZE_EVENT(&event, ze_event);
-  if (!ze_event) {
-    THAPI_DBGLOG("Could not find event: %p", event);
+/* Reclaim a slot (no-op for NULL): PUT events back to the per-context pool,
+ * free waits, decrement chunk n_held; if the chunk hits 0 AND isn't the
+ * active tail, unlink and free it. Regular cls are skipped (their inj is
+ * baked into the cl body — reclaim happens at cl reset/destroy instead). */
+static void _slot_release(struct _ze_slot *s) {
+  if (!s)
+    return;
+  /* Detached slot: its owning cl was torn down (reset/destroy) while this
+   * slot was still a pred of a live slot elsewhere. Its resources were freed
+   * at reclaim and owner was nulled; the chunk struct was kept alive only to
+   * keep this slot's refs addressable. We are the downstream drain dropping
+   * the last ref — drop the chunk's pin and free the bare struct at zero. */
+  if (!s->owner && s->chunk && s->chunk->n_pinned) {
+    struct _ze_slab_chunk *c = s->chunk;
+    if (--c->n_pinned == 0)
+      free(c);
     return;
   }
+  if (!s->owner || !s->owner->is_immediate)
+    return;
+  /* Reached only from _slot_drain, which already freed s->preds and cleared
+   * event_latest_signaled[s->attr]; the primitive re-running those is a no-op
+   * (free(NULL); _clear_if on a missing/overwritten key does nothing). */
+  _slot_dispose_resources(s, _ZE_DISPOSE_POOL);
+
+  struct _ze_slab_chunk *c = s->chunk;
+  struct _ze_command_list_obj_data *cl = s->owner;
+  if (!c)
+    return;
+  c->n_held--; /* this slot no longer holds the chunk */
+  if (c->n_held == 0 && c != cl->chunks->prev) /* chunks->prev: the tail chunk */
+    _cl_chunk_free(cl, c, /*free_slab=*/1);
+}
 
-  _profile_event_results(event);
-  ZE_EVENT_HOST_RESET_PTR(event);
-  ADD_ZE_EVENT(ze_event);
+/* Drain one slot. Recurses on its preds, emits the slot's tracepoint,
+ * drops one ref on each pred (releasing fully-drained-and-unreferenced
+ * preds), then releases s if its own refs hit 0. No-op for NULL or an
+ * already-drained (live=0) slot. Slab read uses s->chunk->slab — preds
+ * may live in another cl, so we can't use the caller's slab. NOTE: s may
+ * be released (and its chunk freed) on return; don't touch s afterwards.
+ * No cycle guard: preds come from in-order prev (strictly earlier slot
+ * in the same cl, DAG) and from event_latest_signaled[wait_event] (a
+ * slot published BEFORE us). A cycle would need user-declared mutual
+ * waits, which L0 itself deadlocks on. */
+static void _slot_drain(struct _ze_slot *s) {
+  if (!s || !s->live)
+    return;
+  for (uint32_t i = 0; i < s->n_preds; ++i)
+    _slot_drain(s->preds[i]);
+  s->live = 0;
+  /* Shadow-path only: block until the Query has fired, then reset
+   * shadow_done so the next Execute round starts with a clean event.
+   * The user's own sync doesn't cover the Query because it runs on the
+   * shadow cl. Inline-path slots have shadow_done==NULL — their QKT
+   * lives in the user cl body and the dep-graph walk that brought us
+   * here already implies it has run. */
+  if (s->shadow_done && s->shadow_done->event) {
+    _ZE_MUST(ZE_EVENT_HOST_SYNCHRONIZE_PTR(s->shadow_done->event, UINT64_MAX));
+    _ZE_MUST(ZE_EVENT_HOST_RESET_PTR(s->shadow_done->event));
+    /* QKT completed device-side. Drop the live ref; if nothing else on
+     * this shadow cl is in flight, Reset it: the L0 driver leaks ~10 KB
+     * per AppendQueryKernelTimestamps and only reclaims at Reset/Destroy. */
+    if (s->sh) {
+      s->sh->live_queries--;
+      if (s->sh->live_queries == 0)
+        _ZE_MUST(ZE_COMMAND_LIST_RESET_PTR(s->sh->cl));
+    }
+  }
+  ze_event_handle_t attr = s->attr ? s->attr : (s->inj ? s->inj->event : NULL);
+  if (s->chunk && s->chunk->slab && attr) {
+    ze_kernel_timestamp_result_t r =
+        *(ze_kernel_timestamp_result_t *)((char *)s->chunk->slab + s->off);
+    /* Stash the kernel result under the user's own event so the user's
+     * zeEventQueryKernelTimestamp returns kernel timing, not the QKT/barrier
+     * op timing their event actually carries (we swapped it for inj). Only
+     * when the user supplied an event (s->attr); inj is ours, not queryable. */
+    if (s->attr)
+      _event_kts_set(s->attr, r);
+    if (tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results))
+      do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, attr, ZE_RESULT_SUCCESS,
+                    ZE_RESULT_SUCCESS, r.global.kernelStart, r.global.kernelEnd,
+                    r.context.kernelStart, r.context.kernelEnd);
+  }
+  _event_latest_clear_if(s->attr, s);
+  /* Drop refs on preds; release any that hit 0 and are already drained. */
+  for (uint32_t i = 0; i < s->n_preds; ++i) {
+    struct _ze_slot *p = s->preds[i];
+    if (--p->refs == 0 && !p->live)
+      _slot_release(p);
+  }
+  free(s->preds);
+  s->preds = NULL;
+  s->n_preds = 0;
+  if (s->refs == 0) /* nobody depends on s any more: reclaim it now */
+    _slot_release(s);
 }
 
-/* Tear down a wrapper: optionally emit its timestamp tracepoint, then
- * destroy the injected event+pool if we own them, then recycle the
- * wrapper. Caller must have already removed it from any list/hash that
- * references it. */
-static inline void _dispose_event_wrapper(struct _ze_event_h *ze_event, int do_dump) {
-  if (do_dump && ze_event->event)
-    _profile_event_results(ze_event->event);
-  if (ze_event->event_pool) {
-    if (ze_event->event)
-      ZE_EVENT_DESTROY_PTR(ze_event->event);
-    ZE_EVENT_POOL_DESTROY_PTR(ze_event->event_pool);
+/* Drain every live slot in a cl (walk chunks oldest-to-newest, slots
+ * low-to-high — natural time order for emission), then clear in_flight_*. */
+static void _cl_drain(struct _ze_command_list_obj_data *cl_data) {
+  struct _ze_slab_chunk *c, *tmp;
+  DL_FOREACH_SAFE (cl_data->chunks, c, tmp) {
+    /* Bump refcount during traversal so the last _slot_drain doesn't
+     * free c out from under the inner loop. Drop after, free here. */
+    c->n_held++;
+    for (uint32_t i = 0; i < c->n_used; ++i)
+      _slot_drain(&c->slots[i]);
+    c->n_held--;
+    if (c->n_held == 0 && c != cl_data->chunks->prev)
+      _cl_chunk_free(cl_data, c, /*free_slab=*/1);
   }
-  PUT_ZE_EVENT_WRAPPER(ze_event);
+  _cl_index_clear(cl_data);
+  cl_data->in_flight_q = NULL;
+  cl_data->in_flight_fence = NULL;
+}
+
+static void _cl_data_reset(struct _ze_command_list_obj_data *cl_data); /* fwd */
+
+/* 1 if any slot in the cl is still in flight (live != 0, i.e. not yet drained), else 0. */
+static int _cl_any_live(struct _ze_command_list_obj_data *cl_data) {
+  _ZE_FOREACH_SLOT (cl_data, s)
+    if (s->live)
+      return 1;
+  return 0;
+}
+
+/* Immediate cls only. When no slot of the cl is live any more, raw-Reset the
+ * user's cl so the L0 driver reclaims its per-QKT storage (it piles up on a
+ * long-lived reused immediate cl — see bench/mem_persistent_cl), then drop
+ * our own slot bookkeeping via _cl_data_reset. The raw *_PTR call is untraced
+ * and only safe with nothing in flight. Runs at the tail of sync-drain paths. */
+static void _imm_reset_if_drained(struct _ze_command_list_obj_data *cl_data) {
+  if (cl_data == NULL || !cl_data->is_immediate)
+    return;
+  if (_cl_any_live(cl_data))
+    return;
+  ZE_COMMAND_LIST_RESET_PTR((ze_command_list_handle_t)cl_data->ptr);
+  _cl_data_reset(cl_data);
 }
 
-static void _event_cleanup() {
-  struct _ze_event_h *ze_event = NULL;
-  struct _ze_event_h *tmp = NULL;
-  HASH_ITER(hh, _ze_events, ze_event, tmp) {
-    HASH_DEL(_ze_events, ze_event);
-    _dispose_event_wrapper(ze_event, 1);
+/* Reclaim one chunk while its cl is torn down (reset or single-cl destroy,
+ * ctx alive). Every slot first gives up its resources (events back to the
+ * pool, waits, preds, latest-signaled entry). If no slot is still referenced
+ * (refs == 0 everywhere) the chunk is freed outright. Otherwise some slot is
+ * still a pred of a live slot in ANOTHER cl, so the chunk is DETACHED: it is
+ * unlinked from cl_data->chunks, its slab is freed, each slot's owner is
+ * nulled and n_pinned records how many slots remain referenced. The drains
+ * that later drop those refs free the struct (_slot_release, detached case). */
+static void _cl_chunk_reclaim(struct _ze_command_list_obj_data *cl_data, struct _ze_slab_chunk *c) {
+  uint32_t n_referenced = 0;
+  for (uint32_t idx = 0; idx < c->n_used; ++idx) {
+    struct _ze_slot *slot = &c->slots[idx];
+    _slot_dispose_resources(slot, _ZE_DISPOSE_POOL);
+    n_referenced += (slot->refs != 0);
+  }
+  if (n_referenced == 0) {
+    /* Nobody points at these slots: the whole chunk can go. */
+    _cl_chunk_free(cl_data, c, /*free_slab=*/1);
+    return;
+  }
+  /* Detach: the bare struct must outlive the cl for the referenced slots. */
+  DL_DELETE(cl_data->chunks, c);
+  if (c->slab != NULL) {
+    ZE_MEM_FREE_PTR(c->slab_ctx, c->slab);
+    c->slab = NULL;
   }
+  for (uint32_t idx = 0; idx < c->n_used; ++idx)
+    c->slots[idx].owner = NULL;
+  c->n_pinned = n_referenced;
 }
 
-static void _on_destroy_context(ze_context_handle_t context) {
-  struct _ze_event_h *ze_event = NULL;
-  struct _ze_event_h *tmp = NULL;
-  pthread_mutex_lock(&_ze_events_mutex);
-  HASH_ITER(hh, _ze_events, ze_event, tmp) {
-    if (ze_event->context == context) {
-      HASH_DEL(_ze_events, ze_event);
-      _dispose_event_wrapper(ze_event, 1);
-    }
+/* Drop every chunk of slot state a cl holds while leaving cl_data itself
+ * registered and empty, ready for reuse. Used by the zeCommandListReset hook. */
+static void _cl_data_reset(struct _ze_command_list_obj_data *cl_data) {
+  struct _ze_slab_chunk *chunk = NULL, *next = NULL;
+  DL_FOREACH_SAFE (cl_data->chunks, chunk, next)
+    _cl_chunk_reclaim(cl_data, chunk);
+  _cl_index_clear(cl_data);
+  cl_data->in_flight_fence = NULL;
+  cl_data->in_flight_q = NULL;
+}
+
+/* Release everything cl_data owns and free cl_data itself. Caller has
+ * already removed cl_data from _ze_cls (single-cl: _cl_find_and_del;
+ * per-ctx sweep: HASH_DEL inside the iter). When ctx is dying we just
+ * recycle wrapper structs (the L0 event/pool will be destroyed in
+ * _on_destroy_context step 3) and skip zeMemFree on the slab (the
+ * driver reclaims, and zeMemFree on a doomed ctx is racy); no slot can
+ * outlive the ctx, so no detach is needed. When the ctx is alive a slot
+ * may still be referenced cross-cl, so we reclaim per-chunk (detaching
+ * referenced chunks) just like reset. ctx_dying != 0 picks the dying path. */
+static void _cl_data_destroy(struct _ze_command_list_obj_data *cl_data, int ctx_dying) {
+  struct _ze_slab_chunk *c, *tmp;
+  /* Unlink from the in-flight indexes before the struct is freed, or a later
+   * queue/fence sync would walk a dangling cl. (When ctx_dying the whole index
+   * is torn down separately, but unlinking here is still correct and cheap.) */
+  _cl_index_clear(cl_data);
+  if (!ctx_dying) {
+    DL_FOREACH_SAFE (cl_data->chunks, c, tmp)
+      _cl_chunk_reclaim(cl_data, c);
+    free(cl_data); /* cl_data is dangling for the caller from here on */
+    return;
   }
-  pthread_mutex_unlock(&_ze_events_mutex);
-  pthread_mutex_lock(&_ze_event_pools_mutex);
-  struct _ze_event_pool_entry *pool = NULL;
-  HASH_FIND_PTR(_ze_event_pools, &context, pool);
-  if (pool) {
-    HASH_DEL(_ze_event_pools, pool);
-    struct _ze_event_h *elt = NULL, *tmp = NULL;
-    DL_FOREACH_SAFE(pool->events, elt, tmp) {
-      DL_DELETE(pool->events, elt);
-      /* Wrapper is in the free list — its event was already dumped+reset
-       * by whoever recycled it. Don't dump again, just tear down. */
-      _dispose_event_wrapper(elt, 0);
-    }
-    free(pool);
+  DL_FOREACH_SAFE (cl_data->chunks, c, tmp) {
+    for (uint32_t i = 0; i < c->n_used; ++i)
+      _slot_dispose_resources(&c->slots[i], _ZE_DISPOSE_WRAPPER);
+    _cl_chunk_free(cl_data, c, /*free_slab=*/0);
   }
-  pthread_mutex_unlock(&_ze_event_pools_mutex);
+  free(cl_data); /* cl_data is dangling for the caller from here on */
+}
+
+/* Hook run after zeCommandListDestroy. Per the L0 spec the device no longer
+ * references the cl, so nothing is drained — we only release our own state.
+ * Regular cls give their inj back here (the cl body is going away anyway);
+ * slots of immediate cls have typically been released at drain already. */
+static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
+  pthread_mutex_lock(&_ze_state_mutex);
+  struct _ze_command_list_obj_data *doomed = _cl_find_and_del(command_list);
+  if (doomed != NULL)
+    _cl_data_destroy(doomed, /*ctx_dying=*/0);
+  pthread_mutex_unlock(&_ze_state_mutex);
 }
 
+/* Hook run after zeCommandListReset. Per the L0 spec the user synchronized
+ * before Reset, so the slots should already be drained. For a REGULAR cl,
+ * though, drained does not mean reclaimed: _slot_release skips regular cls
+ * (their inj is baked into the cl body and reused across Executes), so the
+ * slots would linger. Reset wipes that body, hence we reclaim here; stale
+ * slots would otherwise be re-published on the next Execute (over-count)
+ * and their chunks would pile up (leak). We still drain first in case the
+ * user under-synced. The cl stays registered and empty, ready for reuse. */
 static void _on_reset_command_list(ze_command_list_handle_t command_list) {
-  struct _ze_command_list_obj_data *cl_data = NULL;
+  pthread_mutex_lock(&_ze_state_mutex);
+  struct _ze_command_list_obj_data *tracked = _cl_find(command_list);
+  if (tracked != NULL) {
+    _cl_drain(tracked);
+    _cl_data_reset(tracked);
+  }
+  pthread_mutex_unlock(&_ze_state_mutex);
+}
 
-  FIND_AND_DEL_ZE_CL(&command_list, cl_data);
-  if (!cl_data) {
-    THAPI_DBGLOG("Could not get command list: %p", command_list);
-    return;
+/* zeContextDestroy prologue. Three sweeps, all under _ze_state_mutex, to
+ * drop our own L0 objects that live inside this ctx; the user's own
+ * cls/events are their responsibility per the L0 contract. */
+static void _on_destroy_context(ze_context_handle_t hContext) {
+  /* 1) Drop cls bound to this ctx. */
+  pthread_mutex_lock(&_ze_state_mutex);
+  struct _ze_command_list_obj_data *cl_data = NULL, *cl_tmp = NULL;
+  HASH_ITER (hh, _ze_cls, cl_data, cl_tmp) {
+    if (cl_data->cached_context != hContext)
+      continue;
+    HASH_DEL(_ze_cls, cl_data);
+    _cl_data_destroy(cl_data, /*ctx_dying=*/1);
+  }
+
+  /* 2) Shadow cls keyed by (ctx, device). */
+  struct _ze_shadow_cl *sh = NULL, *sh_tmp = NULL;
+  HASH_ITER (hh, _ze_shadow_cls, sh, sh_tmp) {
+    if (sh->key.context != hContext)
+      continue;
+    HASH_DEL(_ze_shadow_cls, sh);
+    if (sh->cl)
+      ZE_COMMAND_LIST_DESTROY_PTR(sh->cl);
+    free(sh);
   }
-  struct _ze_event_h *elt = NULL, *tmp = NULL;
-  DL_FOREACH_SAFE(cl_data->events, elt, tmp) {
-    DL_DELETE(cl_data->events, elt);
-    _unregister_ze_event(elt, cl_data->flags & _ZE_EXECUTED);
+
+  /* 3) Per-ctx event pool freelist: destroy each L0 event and its pool. */
+  struct _ze_event_pool_entry *pe = NULL;
+  HASH_FIND_PTR(_ze_event_pools, &hContext, pe);
+  if (pe) {
+    HASH_DEL(_ze_event_pools, pe);
+    struct _ze_event_h *w, *w_tmp;
+    DL_FOREACH_SAFE (pe->events, w, w_tmp) {
+      if (w->event)
+        ZE_EVENT_DESTROY_PTR(w->event);
+      if (w->event_pool)
+        ZE_EVENT_POOL_DESTROY_PTR(w->event_pool);
+      DL_DELETE(pe->events, w);
+      _put_ze_event_wrapper(w);
+    }
+    free(pe);
   }
-  cl_data->flags &= ~_ZE_EXECUTED;
-  ADD_ZE_CL(cl_data);
+  pthread_mutex_unlock(&_ze_state_mutex);
 }
 
-static void _on_execute_command_lists(uint32_t numCommandLists,
-                                      ze_command_list_handle_t *phCommandLists) {
-  for (uint32_t i = 0; i < numCommandLists; i++) {
-    struct _ze_command_list_obj_data *cl_data = NULL;
-    FIND_AND_DEL_ZE_CL(phCommandLists + i, cl_data);
+/* The four user sync APIs all reduce to "drain the slots the synced anchor
+ * covers". They differ only in how the anchor selects work:
+ *
+ *   _ZE_SYNC_CL     zeCommandListHostSynchronize  -> the one named cl
+ *   _ZE_SYNC_QUEUE  zeCommandQueueSynchronize     -> every cl with in_flight_q == h
+ *   _ZE_SYNC_FENCE  zeFenceHostSynchronize        -> every cl with in_flight_fence == h
+ *   _ZE_SYNC_EVENT  zeEventHostSynchronize        -> the slot that last signaled h,
+ *                                                    walking its pred edges
+ *
+ * QUEUE/FENCE drain exactly the cls a given Execute submitted (matched by the
+ * handle stamped on the cl at Execute); CL/EVENT name their target directly.
+ * CL/EVENT then raw-Reset a fully-drained immediate cl (_imm_reset_if_drained)
+ * to cap the driver's per-QKT leak. _cl_drain already clears in_flight_*; the
+ * EVENT anchor may leave live siblings, so it clears them only when none is live. */
+enum _ze_sync_kind { _ZE_SYNC_CL, _ZE_SYNC_QUEUE, _ZE_SYNC_FENCE, _ZE_SYNC_EVENT };
+static void _on_sync(enum _ze_sync_kind kind, void *h) {
+  pthread_mutex_lock(&_ze_state_mutex);
+  if (kind == _ZE_SYNC_EVENT) {
+    struct _ze_slot *s = _event_latest_get((ze_event_handle_t)h);
+    /* Snapshot owner first: _slot_drain may release s and free its chunk. */
+    struct _ze_command_list_obj_data *owner = s ? s->owner : NULL;
+    if (owner) {
+      _slot_drain(s);
+      if (!_cl_any_live(owner)) {
+        _cl_index_clear(owner);
+        owner->in_flight_q = NULL;
+        owner->in_flight_fence = NULL;
+        _imm_reset_if_drained(owner);
+      }
+    }
+  } else if (kind == _ZE_SYNC_CL) {
+    struct _ze_command_list_obj_data *cl_data = _cl_find((ze_command_list_handle_t)h);
     if (cl_data) {
-      /* dump events if they were executed */
-      if (cl_data->flags & _ZE_EXECUTED) {
-        struct _ze_event_h *elt = NULL;
-        DL_FOREACH(cl_data->events, elt) { _dump_and_reset_our_event(elt->event); }
-      } else
-        cl_data->flags |= _ZE_EXECUTED;
-      ADD_ZE_CL(cl_data);
-    } else
-      THAPI_DBGLOG("Could not get command list: %p", phCommandLists[i]);
+      _cl_drain(cl_data);
+      _imm_reset_if_drained(cl_data);
+    }
+  } else { /* _ZE_SYNC_QUEUE / _ZE_SYNC_FENCE: drain just the indexed cls */
+    struct _ze_inflight_bucket *b = NULL;
+    if (kind == _ZE_SYNC_QUEUE)
+      HASH_FIND_PTR(_ze_q_index, &h, b);
+    else
+      HASH_FIND_PTR(_ze_fence_index, &h, b);
+    if (b) {
+      struct _ze_command_list_obj_data *cl_data = NULL, *tmp = NULL;
+      /* SAFE2 because _cl_drain -> _cl_index_clear unlinks cl_data from this
+       * very bucket (and may free the bucket on the last unlink). */
+      if (kind == _ZE_SYNC_QUEUE) {
+        DL_FOREACH_SAFE2 (b->cls, cl_data, tmp, q_next)
+          _cl_drain(cl_data);
+      } else {
+        DL_FOREACH_SAFE2 (b->cls, cl_data, tmp, f_next)
+          _cl_drain(cl_data);
+      }
+    }
   }
+  pthread_mutex_unlock(&_ze_state_mutex);
 }
 
-static void _on_destroy_command_list(ze_command_list_handle_t command_list) {
-  struct _ze_command_list_obj_data *cl_data = NULL;
+/* Hook run after zeEventQueryKernelTimestamp. The user's event carries the
+ * QKT/barrier op timing (its signal was swapped for inj at Append), while the
+ * caller wants the KERNEL timing that drain stashed for this event. If such
+ * a result exists it is written over *dstptr. Returns _event_kts_get's result
+ * (1 when a stashed result was served), or 0 for a NULL hEvent or dstptr. */
+static int _on_query_kernel_timestamp(ze_event_handle_t hEvent,
+                                      ze_kernel_timestamp_result_t *dstptr) {
+  int served = 0;
+  if (hEvent && dstptr) {
+    pthread_mutex_lock(&_ze_state_mutex);
+    served = _event_kts_get(hEvent, dstptr);
+    pthread_mutex_unlock(&_ze_state_mutex);
+  }
+  return served;
+}
+
+/* Hook run after a SUCCESSFUL zeEventDestroy. Our per-event state is keyed
+ * by the event handle's address, and the L0 driver may hand that same address
+ * to a later-created event. If the entry stayed, the newcomer would inherit
+ * the dead event's data:
+ *   .kts:    zeEventQueryKernelTimestamp on a never-signaled event would be
+ *            answered with the previous event's stale timing;
+ *   .latest: a wait on the reused address would resolve to a freed slot,
+ *            i.e. a use-after-free during the pred walk.
+ * Dropping the entry here keeps the map limited to live events. The caller
+ * gates this on success because a failed destroy leaves the event, and thus
+ * its address and data, alive. */
+static void _on_destroy_event(ze_event_handle_t hEvent) {
+  pthread_mutex_lock(&_ze_state_mutex);
+  _event_state_del(hEvent);
+  pthread_mutex_unlock(&_ze_state_mutex);
+}
 
-  FIND_AND_DEL_ZE_CL(&command_list, cl_data);
+/* Execute-epilogue handler for ONE cl (no-op if the cl is untracked). Runs
+ * AFTER L0 Execute returned, with the user cl in flight. Three phases:
+ *
+ *   1) If in_flight_q is set (prior Execute by another thread),
+ *      force-sync that queue and drain before we overwrite it.
+ *      Regression test: inorder_reg_Event_multithreaded_01.
+ *   2) Publish each not-yet-live slot (_slot_publish): shadow-path slots
+ *      Append a fresh Query on the per-(ctx,device) shadow cl, then every
+ *      slot is instantiated into the dep graph. The Append must run AFTER
+ *      L0 Execute — appending earlier deadlocks if the shadow shares an
+ *      engine with the user cl (tests/bugs/query_on_separate_cl_regular_user_cl).
+ *      Inline-path cls bake the QKT into the cl body at Append, so their
+ *      publish is instantiate-only.
+ *   3) Stamp in_flight_q = hQueue and in_flight_fence = hFence (the fence
+ *      the user passed to this Execute, or NULL). */
+static void _on_execute_one_cl(ze_command_queue_handle_t hQueue,
+                               ze_fence_handle_t hFence,
+                               ze_command_list_handle_t command_list) {
+  pthread_mutex_lock(&_ze_state_mutex);
+  struct _ze_command_list_obj_data *cl_data = _cl_find(command_list);
   if (!cl_data) {
-    THAPI_DBGLOG("Could not get command list: %p", command_list);
+    pthread_mutex_unlock(&_ze_state_mutex);
     return;
   }
-  if (_do_profile) {
-    struct _ze_event_h *elt = NULL, *tmp = NULL;
-    DL_FOREACH_SAFE(cl_data->events, elt, tmp) {
-      DL_DELETE(cl_data->events, elt);
-      _unregister_ze_event(elt, cl_data->flags & _ZE_EXECUTED);
+
+  if (cl_data->in_flight_q) {
+    _ZE_MUST(ZE_COMMAND_QUEUE_SYNCHRONIZE_PTR(cl_data->in_flight_q, UINT64_MAX));
+    _cl_drain(cl_data);
+  }
+  /* Shadow cl is resolved lazily on first shadow-path slot. Inline-only cls
+   * never trigger the lookup. */
+  struct _ze_shadow_cl *sh = NULL;
+  int sh_resolved = 0;
+  struct _ze_slab_chunk *c;
+  DL_FOREACH (cl_data->chunks, c) {
+    for (uint32_t j = 0; j < c->n_used; ++j) {
+      struct _ze_slot *slot = &c->slots[j];
+      if (!slot->inj)
+        continue;
+      /* Already-live slots have nothing left to do this Execute: their
+       * dep-graph entry from Append-time _slot_instantiate is still valid,
+       * and (inline path) their QKT is baked into the cl body and re-fires
+       * automatically. Only fresh / drained slots need work here. */
+      if (slot->live)
+        continue;
+      if (slot->shadow_done && !sh_resolved) {
+        /* cached_context was published by _universal_record_append before any
+         * shadow_done slot could exist, so it's always set here — no need
+         * for an L0 round-trip to recover it. */
+        ze_context_handle_t ctx = cl_data->cached_context;
+        ze_device_handle_t dev = NULL;
+        _ZE_MUST(ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(command_list, &dev));
+        sh = ctx ? _get_shadow_cl(ctx, dev) : NULL;
+        sh_resolved = 1;
+      }
+      if (slot->shadow_done && !sh) /* no shadow cl available: leave unpublished */
+        continue;
+      _slot_publish(cl_data, slot, sh);
     }
   }
-  free(cl_data);
+  cl_data->in_flight_q = hQueue;
+  cl_data->in_flight_fence = hFence;
+  /* Index this cl under its queue (and fence) so a later queue/fence sync
+   * drains it without scanning every live cl. The force-sync+drain above
+   * already unlinked any prior in-flight membership, so no double-link. */
+  _cl_index_set(cl_data, hQueue, hFence);
+
+  pthread_mutex_unlock(&_ze_state_mutex);
+}
+
+static void _on_execute_command_lists_epilogue(ze_command_queue_handle_t hQueue,
+                                               ze_fence_handle_t hFence,
+                                               uint32_t numCommandLists,
+                                               ze_command_list_handle_t *phCommandLists) {
+  for (uint32_t left = numCommandLists; left > 0; --left, ++phCommandLists)
+    _on_execute_one_cl(hQueue, hFence, *phCommandLists); /* in submission order */
 }
 
+/* ========================================================================
+ * Property/info dumping + tracer init
+ *
+ * Separate concern from the slot/drain engine above: read device/driver/
+ * kernel/memory properties and emit the lttng_ust_ze_properties / _build
+ * tracepoints, plus one-time loader/symbol init. Self-contained — the
+ * engine never calls into this section, and the only external callers are
+ * ze_model.rb hooks (_do_state, _dump_memory_info,
+ * _dump_command_list_device_timer, _in_loader_init) and gen_ze.rb
+ * (_init_tracer / _init_tracer_dump).
+ * ======================================================================== */
+
 static pthread_once_t _init = PTHREAD_ONCE_INIT;
 static __thread volatile int _in_init = 0;
 static volatile unsigned int _in_loader_init = 0;
@@ -524,13 +1572,6 @@ static inline int _do_state() {
          tracepoint_enabled(lttng_ust_ze_properties, memory_info_range);
 }
 
-static void THAPI_ATTRIBUTE_DESTRUCTOR _lib_cleanup() {
-  if (_do_cleanup) {
-    if (_do_profile)
-      _event_cleanup();
-  }
-}
-
 static void _dump_driver_subdevice_properties(ze_driver_handle_t hDriver,
                                               ze_device_handle_t hDevice) {
   if (!tracepoint_enabled(lttng_ust_ze_properties, subdevice))
@@ -666,24 +1707,6 @@ static inline void _dump_memory_info(ze_command_list_handle_t hCommandList, cons
     _dump_memory_info_ctx(hContext, ptr);
 }
 
-////////////////////////////////////////////
-#define _ZE_ERROR_MSG(NAME, RES)                                                                   \
-  do {                                                                                             \
-    fprintf(stderr, "%s() failed at %d(%s): res=%x\n", (NAME), __LINE__, __FILE__, (RES));         \
-  } while (0)
-#define _ZE_ERROR_MSG_NOTERMINATE(NAME, RES)                                                       \
-  do {                                                                                             \
-    fprintf(stderr, "%s() error at %d(%s): res=%x\n", (NAME), __LINE__, __FILE__, (RES));          \
-  } while (0)
-#define _ERROR_MSG(MSG)                                                                            \
-  {                                                                                                \
-    perror((MSG)) do {                                                                             \
-      {                                                                                            \
-        perror((MSG));                                                                             \
-        fprintf(stderr, "errno=%d at %d(%s)", errno, __LINE__, __FILE__);                          \
-      }                                                                                            \
-      while (0)
-
 static void _load_tracer(void) {
   char *s = NULL;
   void *handle = NULL;
@@ -736,12 +1759,6 @@ static void _load_tracer(void) {
   s = getenv("LTTNG_UST_ZE_PARANOID_MEMORY_LOCATION");
   if (s)
     _do_paranoid_memory_location = 1;
-
-  _do_cleanup = 1;
-
-#ifndef THAPI_USE_DESTRUCTORS
-  atexit(_lib_cleanup);
-#endif
 }
 
 static void _load_tracer_dump(void) {
diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb
index ec664445a..7341d253b 100644
--- a/backends/ze/ze_model.rb
+++ b/backends/ze/ze_model.rb
@@ -139,39 +139,107 @@ def upper_snake_case(str)
 
 register_epilogue 'zeCommandListCreate', <<EOF
   if (_do_state()) {
-    if (_retval == ZE_RESULT_SUCCESS && phCommandList && *phCommandList) {
-      _on_create_command_list(*phCommandList, 0);
+    if (_retval == ZE_RESULT_SUCCESS && phCommandList && *phCommandList && desc) {
+      int _io = (desc->flags & ZE_COMMAND_LIST_FLAG_IN_ORDER) ? 1 : 0;
+      _on_create_command_list(*phCommandList, hDevice, desc->commandQueueGroupOrdinal,
+                              /*immediate=*/0, _io);
     }
   }
 EOF
 
 register_epilogue 'zeCommandListCreateImmediate', <<EOF
   if (_do_state()) {
-    if (_retval == ZE_RESULT_SUCCESS && phCommandList && *phCommandList) {
-      _on_create_command_list(*phCommandList, 1);
+    if (_retval == ZE_RESULT_SUCCESS && phCommandList && *phCommandList && altdesc) {
+      int _io = (altdesc->flags & ZE_COMMAND_QUEUE_FLAG_IN_ORDER) ? 1 : 0;
+      _on_create_command_list(*phCommandList, hDevice, altdesc->ordinal,
+                              /*immediate=*/1, _io);
     }
   }
 EOF
 
+# Reset hook: the L0 spec
+# (https://oneapi-src.github.io/level-zero-spec/level-zero/latest/core/api.html#zecommandlistreset)
+# says the user must have synchronized first, so our slots are drained — but
+# for a REGULAR cl "drained" is not "reclaimed" (_slot_release is a no-op for
+# regular cls; their inj is baked into the cl body for reuse across Executes).
+# Reset wipes that body, so we reclaim the slots/chunks/events now. Without it
+# the stale slots are re-published on the next Execute (over-count) and chunks
+# leak. The cl stays registered, empty for reuse.
 register_epilogue 'zeCommandListReset', <<EOF
-  if (_do_profile && hCommandList)
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && hCommandList)
     _on_reset_command_list(hCommandList);
 EOF
 
+# Destroy hook: the same spec rule applies for the GPU side (no in-flight
+# work on the cl), but we still need to clean up OUR host-side state —
+# slot/slab chunks, per-slot waits, and tracer-owned events that haven't
+# already gone back to the pool. Otherwise every cl create/destroy cycle
+# leaks all of the above.
 register_epilogue 'zeCommandListDestroy', <<EOF
-  if (_do_state()) {
-    if (_retval == ZE_RESULT_SUCCESS && hCommandList) {
-      _on_destroy_command_list(hCommandList);
-    }
-  }
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && hCommandList)
+    _on_destroy_command_list(hCommandList);
+EOF
+
+# zeContextDestroy prologue: tear down our own L0 objects that live
+# inside this context (shadow cls, per-ctx event pools/events) BEFORE the
+# user destroys the context. The L0 spec says the user has ensured the
+# device is no longer referencing the context, so all user-side cls/events
+# are already done — we just need to not leak our allocations.
+register_prologue 'zeContextDestroy', <<EOF
+  if (_do_profile && hContext)
+    _on_destroy_context(hContext);
 EOF
 
+# Epilogue runs after L0's actual submission has returned. ALL the
+# tracer's bookkeeping for Execute happens here (no prologue) so that
+# concurrent Executes / Syncs from other threads observe in_flight_q
+# atomically — the force-sync-prior + Append-Query + claim-in_flight_q
+# are one critical section.
+#
+# The Append-Query specifically MUST run after L0 submit, not before:
+# the shadow cl can share the engine with the user cl, and a pending
+# shadow Query op holds the engine, deadlocking the user cl.
 register_epilogue 'zeCommandQueueExecuteCommandLists', <<EOF
-  if (_do_profile) {
-    if (_retval == ZE_RESULT_SUCCESS && numCommandLists > 0) {
-      _on_execute_command_lists(numCommandLists, phCommandLists);
-    }
-  }
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && numCommandLists > 0 && phCommandLists)
+    _on_execute_command_lists_epilogue(hCommandQueue, hFence, numCommandLists, phCommandLists);
+EOF
+
+# Sync hooks: walk dependency edges from the synced anchor and drain
+# everything reachable. Each sync API has a different anchor.
+register_epilogue 'zeCommandQueueSynchronize', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS)
+    _on_sync(_ZE_SYNC_QUEUE, hCommandQueue);
+EOF
+
+register_epilogue 'zeEventHostSynchronize', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && hEvent)
+    _on_sync(_ZE_SYNC_EVENT, hEvent);
+EOF
+
+register_epilogue 'zeCommandListHostSynchronize', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && hCommandList)
+    _on_sync(_ZE_SYNC_CL, hCommandList);
+EOF
+
+# The Append prologue swaps the user's signal event for our injected event, so
+# the user's own event ends up carrying the QKT/barrier op timing, not the
+# kernel's. If the user queries their event's kernel timestamp themselves,
+# serve back the kernel result we stashed at drain so they see kernel timing.
+register_epilogue 'zeEventQueryKernelTimestamp', <<EOF
+  if (_do_profile && hEvent && dstptr &&
+      _on_query_kernel_timestamp(hEvent, dstptr))
+    _retval = ZE_RESULT_SUCCESS;
+EOF
+
+# Fence sync: the fence the user passed to Execute is stamped onto each cl
+# (in_flight_fence), so a fence wait drains exactly the cls that Execute
+# submitted. zeFenceQueryStatus is NOT hooked: it's a non-blocking poll, so
+# a SUCCESS return means the work is done but we can't assume the user is
+# finished issuing — draining there could race a still-building reuse. The
+# blocking zeFenceHostSynchronize is the safe anchor.
+register_epilogue 'zeFenceHostSynchronize', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && hFence)
+    _on_sync(_ZE_SYNC_FENCE, hFence);
 EOF
 
 register_prologue 'zeEventPoolCreate', <<EOF
@@ -194,22 +262,14 @@ def upper_snake_case(str)
   }
 EOF
 
-register_prologue 'zeEventDestroy', <<EOF
-  if (_do_profile && hEvent) {
+# Evict our per-event state once the destroy SUCCEEDS: the driver recycles handle
+# addresses, so a fresh event can reuse this one's. Without eviction, the new event
+# would inherit the dead one's stashed kernel timing and a dangling latest-signaled
+# slot pointer. The epilogue is gated on _retval: a failed destroy (e.g. a bad handle)
+# leaves the event alive, so its address cannot be recycled and its data must stay.
+register_epilogue 'zeEventDestroy', <<EOF
+  if (_do_profile && _retval == ZE_RESULT_SUCCESS && hEvent)
     _on_destroy_event(hEvent);
-  }
-EOF
-
-register_prologue 'zeEventHostReset', <<EOF
-  if (_do_profile && hEvent) {
-    _on_reset_event(hEvent);
-  }
-EOF
-
-register_epilogue 'zeContextDestroy', <<EOF
-  if (_do_profile && hContext) {
-    _on_destroy_context(hContext);
-  }
 EOF
 
 # Dump memory info if required
@@ -260,28 +320,49 @@ def upper_snake_case(str)
 # WARNING: there seems to be no way to profile if
 # zeCommandListAppendEventReset is used or at least
 # not very cleanly is used....
+# Universal scheme (see project_ze_universal_scheme):
+#   prologue: always inject _ewrapper. Save user's signal (may be NULL).
+#             Swap user's signal -> our injected event.
+#   epilogue: on success, call _universal_record_append which inserts
+#             a QueryKernelTimestamps(wait=inj, signal=user_sig) into
+#             the cmdlist and records the slot for drain.
+#             The event_profiling tracepoint is attributed to the
+#             user's original signal (or inj when user passed NULL).
+#   on sync (queue/event/fence/cl-host): drain the slabs.
 profiling_prologue = lambda { |event_name|
   <<EOF
+  ze_event_handle_t _user_signal = #{event_name};
   struct _ze_event_h * _ewrapper = NULL;
-  if (_do_profile && !#{event_name}) {
-    _ewrapper = _get_profiling_event(hCommandList);
-    if (_ewrapper)
-      #{event_name} = _ewrapper->event;
+  /* Fetched once per profiled Append and threaded to both
+   * _get_profiling_event (prologue) and _universal_record_append (epilogue)
+   * so the tracer issues exactly one zeCommandListGetContextHandle per
+   * Append instead of three. */
+  ze_context_handle_t _ctx = NULL;
+  if (_do_profile) {
+    if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(hCommandList, &_ctx) == ZE_RESULT_SUCCESS && _ctx) {
+      pthread_mutex_lock(&_ze_state_mutex);
+      _ewrapper = _get_profiling_event(_ctx);
+      pthread_mutex_unlock(&_ze_state_mutex);
+      if (_ewrapper)
+        #{event_name} = _ewrapper->event;
+    }
+    /* If injection failed, fall through with the user's signal unchanged;
+     * we won't be able to time this Append, but it still runs. */
   }
 EOF
 }
 
-profiling_epilogue = lambda { |event_name|
+profiling_epilogue = lambda { |_event_name, waits_expr = 'phWaitEvents', n_waits_expr = 'numWaitEvents'|
   <<EOF
-  if (_do_profile && #{event_name}) {
+  if (_do_profile && _ewrapper) {
     if (_retval == ZE_RESULT_SUCCESS) {
-      if (_ewrapper)
-        _register_our_event(_ewrapper, hCommandList);
-      else
-        _register_user_event(#{event_name}, hCommandList);
-      tracepoint(lttng_ust_ze_profiling, event_profiling, #{event_name});
-    } else if (_ewrapper)
-      PUT_ZE_EVENT(_ewrapper);
+      ze_event_handle_t _attr = _user_signal ? _user_signal : _ewrapper->event;
+      _universal_record_append(hCommandList, _ctx, _ewrapper, _user_signal,
+                               #{waits_expr}, #{n_waits_expr});
+      tracepoint(lttng_ust_ze_profiling, event_profiling, _attr);
+    } else {
+      _put_ze_event(_ewrapper);
+    }
   }
 EOF
 }
@@ -319,7 +400,7 @@ def upper_snake_case(str)
 
 ['zeCommandListAppendSignalEvent'].each do |c|
   register_prologue c, profiling_prologue.call('hEvent')
-  register_epilogue c, profiling_epilogue.call('hEvent')
+  register_epilogue c, profiling_epilogue.call('hEvent', 'NULL', '0')
 end
 
 # WARNING
diff --git a/utils/thapi_log_to_bt_source_component.rb b/utils/thapi_log_to_bt_source_component.rb
index 3e16753f7..f27fe4120 100755
--- a/utils/thapi_log_to_bt_source_component.rb
+++ b/utils/thapi_log_to_bt_source_component.rb
@@ -146,7 +146,10 @@ def parse_event(model, line, exclude_fields)
 
 def parse_log(model, input_path, exclude_fields)
   File.open(input_path, 'r') do |file|
-    file.each_line.map do |line|
+    file.each_line.filter_map do |line|
+      stripped = line.strip
+      next if stripped.empty? || stripped.start_with?('#')
+
       parse_event(model, line, exclude_fields)
     end
   end