diff --git a/.clang-format b/.clang-format index 99557dc73..cb9c4a123 100644 --- a/.clang-format +++ b/.clang-format @@ -1,2 +1,8 @@ BinPackParameters: false ColumnLimit: 100 +ForEachMacros: + - _ZE_FOREACH_SLOT + - DL_FOREACH + - DL_FOREACH_SAFE + - DL_FOREACH_SAFE2 + - HASH_ITER diff --git a/backends/ze/Makefile.am b/backends/ze/Makefile.am index 2445e1d61..942c0947b 100644 --- a/backends/ze/Makefile.am +++ b/backends/ze/Makefile.am @@ -278,9 +278,12 @@ TRACE_COMMON = \ tests/interval_profiling_normal.thapi_text_pretty \ tests/interval_profiling_multithread.thapi_text_pretty \ tests/interval_profiling_API_call.thapi_text_pretty \ - tests/interval_profiling_fast.thapi_text_pretty \ tests/interval_profiling_interleave_process.thapi_text_pretty \ - tests/interval_profiling_ignore.thapi_text_pretty + tests/interval_profiling_ignore.thapi_text_pretty \ + tests/interval_profiling_shared_event.thapi_text_pretty \ + tests/interval_profiling_resubmit_event.thapi_text_pretty \ + tests/interval_profiling_shared_event_resubmit.thapi_text_pretty \ + tests/interval_profiling_shared_event_xphase.thapi_text_pretty BTX_ZE_GENERATED_SOURCE_TEST = \ btx_source_ze_test/metababel/metababel.h \ diff --git a/backends/ze/btx_zeinterval_callbacks.cpp b/backends/ze/btx_zeinterval_callbacks.cpp index c6355fdb0..2a1980e8f 100644 --- a/backends/ze/btx_zeinterval_callbacks.cpp +++ b/backends/ze/btx_zeinterval_callbacks.cpp @@ -559,6 +559,20 @@ static void hSignalEvent_rest_entry_callback(void *btx_handle, hCommandList, name, ts, btx_event_t::OTHER, {}}; } +static void zeCommandListAppendSignalEvent_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *hostname, + int64_t vpid, + uint64_t vtid, + ze_command_list_handle_t hCommandList, + ze_event_handle_t hEvent) { + (void)hEvent; + auto *data = static_cast(usr_data); + data->threadToLastLaunchInfo[{hostname, vpid, vtid}] = { + hCommandList, "zeCommandListAppendSignalEvent", ts, btx_event_t::SIGNAL, {}}; +} + /* * _ _ _ 
* _ _ / _ ._ _ ._ _ _. ._ _| / \ _ _ |_ _ _ _|_ _ @@ -584,9 +598,11 @@ zeCommandQueueExecuteCommandLists_entry_callback(void *btx_handle, const auto commandQueueDesc = data->commandQueueToDesc[{hostname, vpid, hCommandQueue}]; for (size_t i = 0; i < _phCommandLists_vals_length; i++) { for (auto &hEvent : data->commandListToEvents[{hostname, vpid, phCommandLists_vals[i]}]) { - auto &h = data->eventToBtxDesct[{hostname, vpid, hEvent}]; - std::get(h) = commandQueueDesc; - std::get(h) = ts; + auto &ring = data->eventToBtxDesct[{hostname, vpid, hEvent}]; + for (auto &h : ring.entries) { + std::get(h) = commandQueueDesc; + std::get(h) = ts; + } } } } @@ -825,11 +841,16 @@ static void event_profiling_callback(void *btx_handle, } // If not IMM will be commandQueueDesc overwrited latter - data->eventToBtxDesct[{hostname, vpid, hEvent}] = {vtid, commandQueueDesc, - hCommandList, hCommandListIsImmediate, - hDevice, commandName, - ts_min, clockLttngDevice, - type, ptr}; + // Push onto the per-event ring. If the cursor has advanced (we've + // already consumed at least one result for this event), the prior + // ring belongs to a finished build phase — clear and start fresh. + auto &ring = data->eventToBtxDesct[{hostname, vpid, hEvent}]; + if (ring.cursor > 0) { + ring.entries.clear(); + ring.cursor = 0; + } + ring.entries.push_back({vtid, commandQueueDesc, hCommandList, hCommandListIsImmediate, hDevice, + commandName, ts_min, clockLttngDevice, type, ptr}); // Prepare job for non IMM if (!hCommandListIsImmediate) data->commandListToEvents[{hostname, vpid, hCommandList}].insert(hEvent); @@ -880,14 +901,17 @@ static void event_profiling_result_callback(void *btx_handle, auto *data = static_cast(usr_data); - // TODO: Should we always find the eventToBtxDesct? - // We didn't find the partial payload, that mean we should ignore it + // Read the current ring slot for this event; advance the cursor; + // wrap to 0 on overflow. Resubmits re-cycle through the same ring. 
const auto it_p = data->eventToBtxDesct.find({hostname, vpid, hEvent}); - if (it_p == data->eventToBtxDesct.cend()) + if (it_p == data->eventToBtxDesct.cend() || it_p->second.entries.empty()) return; - // We don't erase, may have one entry for multiple result + auto &ring = it_p->second; + if (ring.cursor >= ring.entries.size()) + ring.cursor = 0; const auto &[vtid_submission, commandQueueDesc, hCommandList, hCommandListIsImmediate, device, - commandName, lltngMin, clockLttngDevice, type, ptr] = it_p->second; + commandName, lltngMin, clockLttngDevice, type, ptr] = ring.entries[ring.cursor]; + ring.cursor++; std::string metadata = ""; { std::stringstream ss_metadata; @@ -901,6 +925,13 @@ static void event_profiling_result_callback(void *btx_handle, if (!hCommandListIsImmediate) data->commandListToEvents[{hostname, vpid, hCommandList}].erase(hEvent); + /* AppendSignalEvent is a host-side signal with no GPU work to time. + * We pushed a ring entry to keep state consistent (so a future + * profiling_results lookup doesn't walk a stale prior entry), but + * suppress the device-side tally emission here. */ + if (type == btx_event_t::SIGNAL) + return; + if ((type == btx_event_t::TRAFFIC) && (status == ZE_RESULT_SUCCESS)) { auto &[ts, size] = std::get(ptr); btx_push_message_lttng_traffic(btx_handle, hostname, vpid, vtid, ts, BACKEND_ZE, @@ -1400,6 +1431,12 @@ void btx_register_usr_callbacks(void *btx_handle) { REGISTER_ASSOCIATED_CALLBACK(eventMemory_without_hSignalEvent_exit); REGISTER_ASSOCIATED_CALLBACK(hSignalEvent_rest_entry); + /* zeCommandListAppendSignalEvent doesn't match the hSignalEvent_* sets + * (payload is `hEvent`, not `hSignalEvent`), so it needs its own entry + * callback to keep threadToLastLaunchInfo from going stale. 
*/ + btx_register_callbacks_lttng_ust_ze_zeCommandListAppendSignalEvent_entry( + btx_handle, &zeCommandListAppendSignalEvent_entry_callback); + /* Remove Memory */ REGISTER_ASSOCIATED_CALLBACK(memFree_entry); REGISTER_ASSOCIATED_CALLBACK(memFree_exit); diff --git a/backends/ze/btx_zeinterval_callbacks.hpp b/backends/ze/btx_zeinterval_callbacks.hpp index 80c2dc119..a6cdb0e0f 100644 --- a/backends/ze/btx_zeinterval_callbacks.hpp +++ b/backends/ze/btx_zeinterval_callbacks.hpp @@ -55,7 +55,9 @@ using btx_kernel_group_size_t = std::tuple; using btx_kernel_desct_t = std::tuple; -enum class btx_event_t { TRAFFIC, KERNEL, OTHER }; +// SIGNAL = zeCommandListAppendSignalEvent. Ring entry is created so state +// stays consistent, but filtered out of the device tally (no GPU work). +enum class btx_event_t { TRAFFIC, KERNEL, SIGNAL, OTHER }; using btx_additional_info_traffic_t = std::tuple; using btx_additional_info_kernel_t = std::string /*metadata*/; using btx_additional_info = @@ -93,7 +95,18 @@ struct data_s { std::unordered_map commandQueueToDesc; std::unordered_map threadToLastLaunchInfo; - std::unordered_map eventToBtxDesct; + + /* Per-event metadata ring. An hEvent can be the signal event of N + * Appends in one build phase, and the cl can be resubmitted M times, + * yielding M*N result events. We store the N Appends as a vector and + * advance `cursor` per result, wrapping at the end. A new push that + * arrives after the cursor advanced indicates a new build phase — + * we clear and start over so the ring tracks only the current phase. 
*/ + struct event_ring_t { + std::vector entries; + size_t cursor = 0; + }; + std::unordered_map eventToBtxDesct; // Require for non IMM std::unordered_map> commandListToEvents; diff --git a/backends/ze/gen_ze.rb b/backends/ze/gen_ze.rb index 12dc75dfc..9e662b952 100644 --- a/backends/ze/gen_ze.rb +++ b/backends/ze/gen_ze.rb @@ -8,6 +8,7 @@ #include #include #include + #include #include #include #include diff --git a/backends/ze/tests/interval_profiling_fast.bt_text_pretty b/backends/ze/tests/interval_profiling_resubmit_event.bt_text_pretty similarity index 57% rename from backends/ze/tests/interval_profiling_fast.bt_text_pretty rename to backends/ze/tests/interval_profiling_resubmit_event.bt_text_pretty index 3403ebcdb..68e12d805 100644 --- a/backends/ze/tests/interval_profiling_fast.bt_text_pretty +++ b/backends/ze/tests/interval_profiling_resubmit_event.bt_text_pretty @@ -1,2 +1,3 @@ +lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false } lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } -lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, err = false } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } diff --git a/backends/ze/tests/interval_profiling_fast.thapi_text_pretty b/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty similarity index 50% rename from backends/ze/tests/interval_profiling_fast.thapi_text_pretty rename to backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty index 
fb6f10a79..b4c3ca9b4 100644 --- a/backends/ze/tests/interval_profiling_fast.thapi_text_pretty +++ b/backends/ze/tests/interval_profiling_resubmit_event.thapi_text_pretty @@ -1,4 +1,8 @@ +# 1 Append, but the underlying cl is Executed twice in a real run, so +# 2 results arrive for the same hEvent. Both are attributed to that one +# Append. 12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 } -12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x1000000000000000 } -12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x1000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 } -12:00:00.030000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS } +12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 } +12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS } +12:00:00.100000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 } +12:00:00.200000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 100, globalEnd: 130, contextStart: 100, contextEnd: 130 } diff --git 
a/backends/ze/tests/interval_profiling_shared_event.bt_text_pretty b/backends/ze/tests/interval_profiling_shared_event.bt_text_pretty new file mode 100644 index 000000000..2205557d8 --- /dev/null +++ b/backends/ze/tests/interval_profiling_shared_event.bt_text_pretty @@ -0,0 +1,8 @@ +lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false } +lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false } +lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000200, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false } +lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000300, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000200, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 40, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000300, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 50, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } diff --git a/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty new file mode 100644 index 
000000000..64199d25b --- /dev/null +++ b/backends/ze/tests/interval_profiling_shared_event.thapi_text_pretty @@ -0,0 +1,18 @@ +# 4 Appends share one hEvent. Each Append's result is attributed back to +# its own Append, in submission order. +12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 } +12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 } +12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS } +12:00:00.100000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 } +12:00:00.110000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 } +12:00:00.120000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS } +12:00:00.200000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 } +12:00:00.210000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 } +12:00:00.220000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS } +12:00:00.300000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 
0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 } +12:00:00.310000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 } +12:00:00.320000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS } +12:00:00.400000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 } +12:00:00.410000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 100, globalEnd: 130, contextStart: 100, contextEnd: 130 } +12:00:00.420000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 200, globalEnd: 240, contextStart: 200, contextEnd: 240 } +12:00:00.430000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 300, globalEnd: 350, contextStart: 300, contextEnd: 350 } diff --git a/backends/ze/tests/interval_profiling_shared_event_resubmit.bt_text_pretty b/backends/ze/tests/interval_profiling_shared_event_resubmit.bt_text_pretty new file mode 100644 index 000000000..25a5c77b2 --- /dev/null +++ b/backends/ze/tests/interval_profiling_shared_event_resubmit.bt_text_pretty @@ -0,0 +1,6 @@ +lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false } +lttng:host: { hostname = 
"testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 40, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 50, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } diff --git a/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty new file mode 100644 index 000000000..fb64b5d7a --- /dev/null +++ b/backends/ze/tests/interval_profiling_shared_event_resubmit.thapi_text_pretty @@ -0,0 +1,13 @@ +# 2 Appends share one hEvent, then the underlying cl is Executed twice, +# so 4 results arrive. Each submission's pair of results is attributed to +# the two Appends in submission order. 
+12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 } +12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 } +12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS } +12:00:00.100000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 } +12:00:00.110000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 } +12:00:00.120000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS } +12:00:00.200000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 } +12:00:00.210000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 100, globalEnd: 130, contextStart: 100, contextEnd: 130 } +12:00:00.300000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 200, globalEnd: 240, contextStart: 200, contextEnd: 240 } +12:00:00.310000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 
0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 300, globalEnd: 350, contextStart: 300, contextEnd: 350 } diff --git a/backends/ze/tests/interval_profiling_shared_event_xphase.bt_text_pretty b/backends/ze/tests/interval_profiling_shared_event_xphase.bt_text_pretty new file mode 100644 index 000000000..ebbbc90a5 --- /dev/null +++ b/backends/ze/tests/interval_profiling_shared_event_xphase.bt_text_pretty @@ -0,0 +1,10 @@ +lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false } +lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 30, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000000, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 40, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000100, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 50, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } +lttng:host: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000400, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, err = false } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000400, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 10, did = 
0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000400, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 20, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } +lttng:device: { hostname = "testhost", vpid = 10, vtid = 1, ts = 1704110400000000400, backend = 1 }, { name = "zeCommandListAppendBarrier", dur = 40, did = 0, sdid = 0, err = false, metadata = "{ordinal: 0, index: 0}" } diff --git a/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty b/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty new file mode 100644 index 000000000..e9d336d89 --- /dev/null +++ b/backends/ze/tests/interval_profiling_shared_event_xphase.thapi_text_pretty @@ -0,0 +1,21 @@ +# Two build phases on the same cl, both reusing the same hEvent. +# Phase 1: 2 Appends, cl Executed twice -> 4 results. +# Phase 2: cl is Reset, then 1 Append, cl Executed three times -> 3 results. +# Results from a phase are attributed only to that phase's Appends; the +# phase-1 Appends do not bleed into the phase-2 results. 
+12:00:00.000000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 } +12:00:00.010000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 } +12:00:00.020000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS } +12:00:00.100000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 } +12:00:00.110000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 } +12:00:00.120000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS } +12:00:00.200000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 0, globalEnd: 10, contextStart: 0, contextEnd: 10 } +12:00:00.210000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 100, globalEnd: 130, contextStart: 100, contextEnd: 130 } +12:00:00.300000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 200, globalEnd: 240, contextStart: 200, contextEnd: 240 } +12:00:00.310000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 
0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 300, globalEnd: 350, contextStart: 300, contextEnd: 350 } +12:00:00.400000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_entry: { hCommandList: 0x1000000000000000, hSignalEvent: 0x0000000000000000, numWaitEvents: 0, phWaitEvents: 0x0000000000000000, _phWaitEvents_vals_length: 0, phWaitEvents_vals: 0x0000000000000000 } +12:00:00.410000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling: { hEvent: 0x4000000000000000 } +12:00:00.420000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze:zeCommandListAppendBarrier_exit: { zeResult: ZE_RESULT_SUCCESS } +12:00:00.500000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 400, globalEnd: 410, contextStart: 400, contextEnd: 410 } +12:00:00.510000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 500, globalEnd: 520, contextStart: 500, contextEnd: 520 } +12:00:00.520000000 - testhost - vpid: 10, vtid: 1 - lttng_ust_ze_profiling:event_profiling_results: { hEvent: 0x4000000000000000, status: ZE_RESULT_SUCCESS, timestampStatus: ZE_RESULT_SUCCESS, globalStart: 600, globalEnd: 640, contextStart: 600, contextEnd: 640 } diff --git a/backends/ze/tracer_ze_helpers.include.c b/backends/ze/tracer_ze_helpers.include.c index 8cfe31d7a..10693c1ac 100644 --- a/backends/ze/tracer_ze_helpers.include.c +++ b/backends/ze/tracer_ze_helpers.include.c @@ -1,30 +1,139 @@ -#ifdef THAPI_DEBUG -#define TAHPI_LOG stderr -#define THAPI_DBGLOG(fmt, ...) 
\ - do { \ - fprintf(TAHPI_LOG, "THAPI(%s:%d): " fmt "\n", __func__, __LINE__, __VA_ARGS__); \ - } while (0) -#define THAPI_DBGLOG_NO_ARGS(fmt) \ +/* Algorithm + * ========= + * + * On profiled Append (cl, sig=user_sig, waits=user_waits): + * - allocate inj from per-context pool; swap user_sig -> inj + * - place a Query (see "QKT placement" below) + * - allocate a slot {inj, attr=user_sig, off, waits=copy(user_waits)} + * - immediate cl: instantiate(slot) inline + * + * instantiate(s): + * - s.preds = [event_latest_signaled[w] for w in s.waits if live] + * + previous live slot in same cl (if cl is in-order) + * - s.live = true; event_latest_signaled[s.attr] = &s + * + * On Execute(q, cl) prologue: + * - if cl.in_flight_q: Synchronize(in_flight_q); drain_cl(cl) + * - shadow-path slots: re-Append Query on shadow cl (in the epilogue; see "QKT placement") + * inline-path slots: nothing (Query is baked into cl body) + * - instantiate every slot in cl + * - cl.in_flight_q = q; index cl under q (and its fence) for sync lookup + * + * On Sync (the synced anchor tells us what to drain): + * - Sync(ev): drain(event_latest_signaled[ev]) + * - Sync(q): drain_cl(cl) for every cl in the q-index bucket for q + * (O(matching cls), not a scan of every live cl) + * - Sync(cl): drain_cl(cl) + * + * drain(s): + * - for p in s.preds: drain(p) + * - shadow-path: host-sync on shadow_done, reset, decrement live_queries + * - read slab[s.off], emit tracepoint(s.attr or inj) + * - clear event_latest_signaled[s.attr] (if it still points at s) + * - clear s.live and s.preds + * (Build-time fields inj, attr, off, waits stay so the next Execute + * can re-instantiate without re-Appending.) + * + * Concurrency + * =========== + * + * One global mutex (_ze_state_mutex) covers all tracer state: the cl + * registry, every cl's chunks/slots/preds, the event freelist + pool + * registry, the latest-signaled map, the shadow cl registry, the + * qgroup cache. Append / Execute / Drain / Destroy all take it. 
+ * + * Per-cl mutexes don't work because drain follows cross-cl pred edges + * (event_latest_signaled[ev] can point at a slot in any cl) and + * mutates the pred's chunk via _slot_release. Any per-cl scheme has + * to acquire multiple cl mutexes with cross-cl ordering rules. One + * global mutex sidesteps that entirely. + * + * Perf: Append on different cls and freelist accesses serialize + * through one lock. The L0 calls inside the critical section + * (AppendBarrier, AppendQueryKernelTimestamps) just queue work on + * the GPU — the GPU executes asynchronously, so the held region is + * short. Drain is host-blocking (zeEventHostSynchronize on shadow + * fence events) and was effectively serial anyway. + * + * QKT placement + * ============= + * + * AppendQueryKernelTimestamps (the device-side timestamp read) lives + * in one of two places, picked at cl create from the queue group's + * COMPUTE flag and stored in cl_data->is_compute. Both paths share the + * slot/drain/dep-graph machinery; they only differ in where the QKT is + * Appended and how the drain knows it has fired. + * + * INLINE (user cl is on a COMPUTE queue group): + * + * Kernel(sig=inj) ──> QKT(wait=inj, sig=user_signal) [on user cl] + * + * One Append. user_signal IS the QKT-done edge — any user-level + * sync (event/queue/cl) that covers user_signal also covers the + * QKT. No tracer fence event, no host-sync at drain. For regular + * cls the QKT is baked into the cl body once and re-fires on every + * Execute. + * + * SHADOW (user cl is copy-only, or queue group unknown): + * + * ┌─> Barrier(wait=inj, sig=user_signal) [on user cl] + * Kernel(sig=inj) ──┤ + * └─> QKT(wait=inj, sig=shadow_done) [on shadow cl] + * + * Two Appends. The shadow cl is a per-(context, device) tracer-owned + * immediate compute cl; QKT goes there because copy queue groups + * reject AppendQueryKernelTimestamps. 
shadow_done is a tracer-owned + * fence event that drain host-syncs on — required because the + * shadow cl's completion isn't implied by any user-level sync. For + * regular cls the shadow QKT is (re-)Appended in the Execute + * epilogue (the user cl is in flight by then, so Appending the + * Query won't deadlock on a shared engine). + */ + +/* Always-on tracer log. Prefixes THAPI(func:line) so messages are + * grep-able across the bench/test harness which often interleaves + * tracer and user output. GCC's `, ##__VA_ARGS__` extension swallows + * the leading comma when the variadic list is empty. fflush so the + * line lands even if we abort() right after. */ +#define _THAPI_LOG(fmt, ...) \ do { \ - fprintf(TAHPI_LOG, "THAPI(%s:%d): " fmt "\n", __func__, __LINE__); \ + fprintf(stderr, "THAPI(%s:%d): " fmt "\n", __func__, __LINE__, ##__VA_ARGS__); \ + fflush(stderr); \ } while (0) + +#ifdef THAPI_DEBUG +#define THAPI_DBGLOG(fmt, ...) _THAPI_LOG(fmt, ##__VA_ARGS__) #else #define THAPI_DBGLOG(...) \ do { \ } while (0) -#define THAPI_DBGLOG_NO_ARGS(fmt) \ +#endif + +/* Tracer invariant check: print + abort. Unconditional (not gated on + * NDEBUG) — silently dropping the check would let the bug ship bad + * data instead of crashing. Use for "this can never happen" preconditions + * inside the tracer, not for user-input validation. */ +#define _THAPI_ASSERT(cond, fmt, ...) \ do { \ + if (!(cond)) { \ + _THAPI_LOG("assertion failed: %s — " fmt, #cond, ##__VA_ARGS__); \ + abort(); \ + } \ } while (0) -#endif -#ifdef THAPI_USE_DESTRUCTORS -#define THAPI_ATTRIBUTE_DESTRUCTOR __attribute__((destructor)) -#else -#define THAPI_ATTRIBUTE_DESTRUCTOR -#endif +/* Wrap a tracer-issued L0 call whose failure means we'd either hang the + * user (sync chain Barrier) or produce a non-self-consistent trace + * (QKT, event create, ...). Defensive: print + abort so the bug surfaces + * under sanitizers/CI rather than ship bad data. 
NOT for driver query + * calls (Get*Handle, GetCommandQueueGroupProperties) — those can fail + * transiently during teardown and have graceful fallbacks. */ +#define _ZE_MUST(call) \ + do { \ + ze_result_t _r = (call); \ + _THAPI_ASSERT(_r == ZE_RESULT_SUCCESS, "%s = 0x%x", #call, _r); \ + } while (0) static int _do_profile = 0; -static int _do_cleanup = 0; static int _do_chained_structs = 0; static int _do_paranoid_drift = 0; static int _do_paranoid_memory_location = 0; @@ -43,110 +152,425 @@ struct ze_closure { struct ze_closure *ze_closures = NULL; -typedef enum _ze_command_list_flag { _ZE_EXECUTED = ZE_BIT(0) } _ze_command_list_flag_t; -typedef _ze_command_list_flag_t _ze_command_list_flags_t; - struct _ze_event_h; +struct _ze_slot; +struct _ze_slab_chunk; + +/* Dependency-tracking slot: one per profiled Append. Slots carry the + * happens-before edges the user established (via cl in-order semantics + * and via phWaitEvents). At sync time we walk these edges from the + * synced anchor and drain everything reachable. Drain is pop semantics: + * after emit, the slot is dropped from the cl's list. */ +struct _ze_slot { + struct _ze_command_list_obj_data *owner; /* cl_data this slot lives in */ + struct _ze_slab_chunk *chunk; /* chunk this slot lives in (==> .slab to read at drain) */ + /* Shadow path only: shadow cl the Query was Appended to. Inline-path + * slots leave this NULL — their Query lives in the user cl body and + * the dep-graph walk that triggers drain already implies it has run. */ + struct _ze_shadow_cl *sh; + struct _ze_event_h *inj; /* tracer-owned event the Query waits on */ + /* Shadow path only: tracer-owned fence event the Query signals; drain + * host-syncs on it. Inline-path slots leave this NULL. 
*/ + struct _ze_event_h *shadow_done; + ze_event_handle_t attr; /* user's original signal event (NULL => inj->event) */ + size_t off; /* byte offset within chunk->slab */ + /* User wait events copied at Append time (stable across rebuilds); + * preds[] is computed at instantiate from waits[] by looking up + * event_latest_signaled[w] for each w. */ + ze_event_handle_t *waits; + uint32_t n_waits; + struct _ze_slot **preds; /* points at slots whose drain must come first (may be in another cl) */ + uint32_t n_preds; + unsigned char live; /* in-flight (instantiated, not drained) */ + /* Incoming pred edges: count of downstream slots whose preds[] points + * here AND that have not yet been drained. Incremented at downstream + * _slot_instantiate, decremented at downstream _slot_drain. Slot is + * reclaimable iff live==0 AND refs==0. */ + uint32_t refs; +}; + +#define _ZE_SLAB_CHUNK_SLOTS 64 + +/* Slot + slab storage in fixed-size chunks; cl_data->chunks is a utlist + * DL of these. Imm cls allocate new chunks as needed (no cap); regular + * cls stop at one chunk — the inj events (and on the inline path, the + * QKT itself) are baked into the closed cl body, so adding a chunk + * after Close would create slots the body doesn't address. + * + * Within a chunk, slots[i].off is i * sizeof(timestamp) into slab. The + * chunk frees itself when n_held drops to 0 AND it is not the tail + * (new Appends still want to land on the tail). */ +struct _ze_slab_chunk { + void *slab; /* _ZE_SLAB_CHUNK_SLOTS * sizeof(ze_kernel_timestamp_result_t) */ + ze_context_handle_t slab_ctx; /* context the slab was allocated against (zeMemFree target) */ + uint32_t n_used; /* slots ever assigned in this chunk (monotonic until chunk free) */ + uint32_t n_held; /* unreleased slots (n_used minus _slot_release calls) */ + /* Nonzero only on a DETACHED chunk: one whose owning cl was torn down + * (reset/destroy) while >=1 slot was still referenced as a pred by a live + * slot in ANOTHER cl. 
The chunk is removed from cl_data->chunks, its slots' + * resources are already released and owner==NULL — only the struct survives + * so the referrers' preds[] pointers stay valid. n_pinned counts those + * surviving referenced slots; the downstream drain that drops the last ref + * frees the struct. 0 for normal attached chunks. */ + uint32_t n_pinned; + struct _ze_slab_chunk *next, *prev; + struct _ze_slot slots[_ZE_SLAB_CHUNK_SLOTS]; +}; + +/* Iterate every used slot in a cl, oldest-to-newest (chunk DL order, then + * slot order within a chunk) — the natural time order. Binds `s` to each + * `struct _ze_slot *`. Only for read/dispose passes that do NOT free chunks + * mid-walk; the drain path bumps n_held by hand and uses DL_FOREACH_SAFE. */ +#define _ZE_FOREACH_SLOT(cl_data, s) \ + for (struct _ze_slab_chunk *_c = (cl_data)->chunks; _c; _c = _c->next) \ + for (struct _ze_slot *s = _c->slots, *_se = _c->slots + _c->n_used; s < _se; ++s) struct _ze_command_list_obj_data { - void *ptr; /* the ze_command_list_handle_t this entry tracks */ + void *ptr; UT_hash_handle hh; - _ze_command_list_flags_t flags; - struct _ze_event_h *events; + + struct _ze_slab_chunk *chunks; /* utlist DL_ head; tail = chunks->prev (circular) */ + + /* in_flight_q is the queue this cl was last Executed on AND not yet + * drained. NULL means "not in flight" — safe to Execute without a + * force-sync. Set on Execute, cleared on drain. + * + * Held only for regular cls; immediate cls never Execute. */ + ze_command_queue_handle_t in_flight_q; + /* The fence (if any) passed to that same Execute. NULL when the user + * Executed without a fence. Lets a fence-only sync find which cls to + * drain — the fence signals when all cls in its Execute complete, so + * zeFenceHostSynchronize(f) drains every cl whose in_flight_fence == f. + * Set on Execute alongside in_flight_q, cleared together on drain. 
*/ + ze_fence_handle_t in_flight_fence; + unsigned char is_immediate; + unsigned char is_in_order; + /* 1 if this cl's queue group exposes COMPUTE — its body can host + * AppendQueryKernelTimestamps directly, so we skip the per-(ctx,device) + * shadow cl and bake QKT into the user cl itself. See the placement + * diagram at the top of this file. 0 for copy-only cls and for any cl + * whose group flags we couldn't determine. Set at create; immutable. */ + unsigned char is_compute; + + /* Cached on first use: context handle for this cl. Immutable for the + * cl's lifetime. Load-bearing for _on_destroy_context's sweep: lets it + * associate cls back to their ctx without an L0 roundtrip per cl. */ + ze_context_handle_t cached_context; + + /* Membership in the per-queue / per-fence in-flight indexes (see + * _ze_q_index / _ze_fence_index below). A cl in flight is linked into both + * its queue's bucket (q_prev/q_next) and, if Executed with a fence, its + * fence's bucket (f_prev/f_next), so a queue/fence sync drains exactly the + * matching cls without scanning every live cl. Linked at Execute via + * _cl_index_set, unlinked at drain via _cl_index_clear. */ + struct _ze_command_list_obj_data *q_prev, *q_next; + struct _ze_command_list_obj_data *f_prev, *f_next; }; struct _ze_command_list_obj_data *_ze_cls = NULL; -pthread_mutex_t _ze_cls_mutex = PTHREAD_MUTEX_INITIALIZER; -#define FIND_ZE_CL(key, val) \ - do { \ - pthread_mutex_lock(&_ze_cls_mutex); \ - HASH_FIND_PTR(_ze_cls, key, val); \ - pthread_mutex_unlock(&_ze_cls_mutex); \ - } while (0) +/* The single mutex covering all tracer state — see the "Concurrency" + * section in the file header for rationale. Every static helper in this + * file that touches tracer state assumes the caller holds it.
*/ +pthread_mutex_t _ze_state_mutex = PTHREAD_MUTEX_INITIALIZER; -#define ADD_ZE_CL(val) \ - do { \ - pthread_mutex_lock(&_ze_cls_mutex); \ - HASH_ADD_PTR(_ze_cls, ptr, val); \ - pthread_mutex_unlock(&_ze_cls_mutex); \ - } while (0) +/* Pure HASH wrappers. */ +static struct _ze_command_list_obj_data *_cl_find(ze_command_list_handle_t command_list) { + struct _ze_command_list_obj_data *cl = NULL; + HASH_FIND_PTR(_ze_cls, &command_list, cl); + return cl; +} -#define FIND_AND_DEL_ZE_CL(key, val) \ - do { \ - pthread_mutex_lock(&_ze_cls_mutex); \ - HASH_FIND_PTR(_ze_cls, key, val); \ - if (val) { \ - HASH_DEL(_ze_cls, val); \ - } \ - pthread_mutex_unlock(&_ze_cls_mutex); \ - } while (0) +static void _cl_add(struct _ze_command_list_obj_data *cl) { HASH_ADD_PTR(_ze_cls, ptr, cl); } -static inline void _on_create_command_list(ze_command_list_handle_t command_list, int immediate) { - struct _ze_command_list_obj_data *cl_data = NULL; +static struct _ze_command_list_obj_data *_cl_find_and_del(ze_command_list_handle_t command_list) { + struct _ze_command_list_obj_data *cl = _cl_find(command_list); + if (cl) + HASH_DEL(_ze_cls, cl); + return cl; +} - FIND_ZE_CL(&command_list, cl_data); - if (cl_data) { - THAPI_DBGLOG("Command list already registered: %p", command_list); +/* In-flight indexes: queue handle -> the cls currently in flight on that queue, + * and fence handle -> the cls in flight under that fence. A queue/fence sync + * completes exactly the cls of the matching Execute, so these let _on_sync + * drain just those cls instead of scanning every live cl (which is O(live cls) + * per sync — see bench/sync_scaling). Buckets are created lazily at Execute and + * freed when they go empty at drain. 
*/ +struct _ze_inflight_bucket { + void *key; /* ze_command_queue_handle_t or ze_fence_handle_t */ + struct _ze_command_list_obj_data *cls; /* DL via q_prev/q_next or f_prev/f_next */ + UT_hash_handle hh; +}; +static struct _ze_inflight_bucket *_ze_q_index = NULL; +static struct _ze_inflight_bucket *_ze_fence_index = NULL; + +static void _index_link(struct _ze_inflight_bucket **index, + void *key, + struct _ze_command_list_obj_data *cl, + int is_fence) { + if (!key) return; + struct _ze_inflight_bucket *b = NULL; + HASH_FIND_PTR(*index, &key, b); + if (!b) { + b = (struct _ze_inflight_bucket *)calloc(1, sizeof(*b)); + if (!b) + return; + b->key = key; + HASH_ADD_PTR(*index, key, b); } + if (is_fence) + DL_APPEND2(b->cls, cl, f_prev, f_next); + else + DL_APPEND2(b->cls, cl, q_prev, q_next); +} - cl_data = (struct _ze_command_list_obj_data *)calloc(1, sizeof(*cl_data)); - if (!cl_data) { - THAPI_DBGLOG_NO_ARGS("Failed to allocate memory"); +static void _index_unlink(struct _ze_inflight_bucket **index, + void *key, + struct _ze_command_list_obj_data *cl, + int is_fence) { + if (!key) + return; + struct _ze_inflight_bucket *b = NULL; + HASH_FIND_PTR(*index, &key, b); + if (!b) return; + if (is_fence) + DL_DELETE2(b->cls, cl, f_prev, f_next); + else + DL_DELETE2(b->cls, cl, q_prev, q_next); + if (!b->cls) { + HASH_DEL(*index, b); + free(b); } +} - cl_data->ptr = (void *)command_list; - /* Immediate cls have no Execute step; their appends run on the device the - * moment they're submitted. Treat them as already-executed so drainers - * (Reset/Destroy hooks) query their events via _ZE_EXECUTED uniformly. */ - if (immediate) - cl_data->flags = _ZE_EXECUTED; +/* Link cl into the queue (and, if non-NULL, fence) in-flight indexes. Called + * once per Execute, after in_flight_q/in_flight_fence are stamped. 
*/ +static void _cl_index_set(struct _ze_command_list_obj_data *cl, + ze_command_queue_handle_t q, + ze_fence_handle_t f) { + _index_link(&_ze_q_index, q, cl, /*is_fence=*/0); + _index_link(&_ze_fence_index, f, cl, /*is_fence=*/1); +} + +/* Remove cl from both in-flight indexes. Uses cl's own in_flight_q/_fence as + * the keys, so it MUST run before those are cleared. Idempotent: a cl not in + * flight has NULL keys and is a no-op. */ +static void _cl_index_clear(struct _ze_command_list_obj_data *cl) { + _index_unlink(&_ze_q_index, cl->in_flight_q, cl, /*is_fence=*/0); + _index_unlink(&_ze_fence_index, cl->in_flight_fence, cl, /*is_fence=*/1); +} + +/* Per-device cache of the queue-group flag bitmap. The lookup is + * read-mostly: scan zeDeviceGetCommandQueueGroupProperties once, + * remember the per-ordinal flags. An entry is only inserted after a + * successful query that returned at least one group, so a cached entry + * never has flags==NULL; failed or empty queries are not cached and are + * retried on the next lookup. Used by two readers: + * _get_compute_ordinal(dev) -> first COMPUTE ord, or -1 + * _ordinal_is_compute(dev, ord) -> 1 if ord is COMPUTE on dev */ +struct _ze_qgroup_cache_entry { + ze_device_handle_t device; + ze_command_queue_group_property_flags_t *flags; /* owned; n_groups entries */ + uint32_t n_groups; + UT_hash_handle hh; +}; +static struct _ze_qgroup_cache_entry *_ze_qgroup_cache = NULL; + +/* Populate (or return cached) flag bitmap for device. The cache lives + * for process lifetime. First-touch L0 queries happen under the state + * mutex; cost is bounded since lookups are once per device.
*/ +static struct _ze_qgroup_cache_entry *_qgroup_cache_get(ze_device_handle_t device) { + struct _ze_qgroup_cache_entry *e = NULL; + HASH_FIND_PTR(_ze_qgroup_cache, &device, e); + if (e) + return e; + + uint32_t n_groups = 0; + if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, NULL) != + ZE_RESULT_SUCCESS || + n_groups == 0) + return NULL; + ze_command_queue_group_properties_t *groups = + (ze_command_queue_group_properties_t *)calloc(n_groups, sizeof(*groups)); + if (!groups) + return NULL; + for (uint32_t i = 0; i < n_groups; ++i) + groups[i].stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES; + if (ZE_DEVICE_GET_COMMAND_QUEUE_GROUP_PROPERTIES_PTR(device, &n_groups, groups) != + ZE_RESULT_SUCCESS) { + free(groups); + return NULL; + } + ze_command_queue_group_property_flags_t *flags = + (ze_command_queue_group_property_flags_t *)calloc(n_groups, sizeof(*flags)); + if (!flags) { + free(groups); + return NULL; + } + for (uint32_t i = 0; i < n_groups; ++i) + flags[i] = groups[i].flags; + free(groups); - ADD_ZE_CL(cl_data); + e = (struct _ze_qgroup_cache_entry *)calloc(1, sizeof(*e)); + if (!e) { + free(flags); + return NULL; + } + e->device = device; + e->flags = flags; + e->n_groups = n_groups; + HASH_ADD_PTR(_ze_qgroup_cache, device, e); + return e; } -typedef enum _ze_event_flag { _ZE_IMMEDIATE_CMD = ZE_BIT(0) } _ze_event_flag_t; -typedef _ze_event_flag_t _ze_event_flags_t; +/* Returns the first COMPUTE queue group ordinal for device, or (uint32_t)-1 + * if the device exposes no compute group (fatal — caller should bail). */ +static uint32_t _get_compute_ordinal(ze_device_handle_t device) { + struct _ze_qgroup_cache_entry *e = _qgroup_cache_get(device); + if (!e) + return (uint32_t)-1; + for (uint32_t i = 0; i < e->n_groups; ++i) + if (e->flags[i] & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) + return i; + return (uint32_t)-1; +} +/* 1 iff `ordinal` on `device` is a COMPUTE queue group. 
Returns 0 on any + * uncertainty (unknown device, OOB ordinal, driver error) — callers + * should treat the cl as non-compute and use the shadow-cl QKT path. */ +static int _ordinal_is_compute(ze_device_handle_t device, uint32_t ordinal) { + if (!device) + return 0; + struct _ze_qgroup_cache_entry *e = _qgroup_cache_get(device); + return e && ordinal < e->n_groups && + (e->flags[ordinal] & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) + ? 1 + : 0; +} + +/* Per-(context, device) tracer-owned immediate OOO compute cl used by + * the SHADOW path to host AppendQueryKernelTimestamps. Copy queue + * groups reject QKT, so the shadow cl exists to give those user cls + * somewhere compute-capable to put their Query. Compute user cls take + * the INLINE path and never touch a shadow cl — see the QKT placement + * diagram at the top of this file. */ +struct _ze_shadow_key { + ze_context_handle_t context; + ze_device_handle_t device; +}; +struct _ze_shadow_cl { + struct _ze_shadow_key key; + ze_command_list_handle_t cl; + uint32_t live_queries; /* QKTs appended but not yet host-synced */ + UT_hash_handle hh; +}; +static struct _ze_shadow_cl *_ze_shadow_cls = NULL; + +/* Returns the shadow cl for (context, device), creating it lazily on + * first use (first-touch L0 zeCommandListCreateImmediate runs under + * the state mutex; cost bounded). Returns NULL if the device has no + * compute group (fatal: log to stderr) or if creation fails. */ +static struct _ze_shadow_cl *_get_shadow_cl(ze_context_handle_t context, + ze_device_handle_t device) { + struct _ze_shadow_key key = {context, device}; + struct _ze_shadow_cl *sh = NULL; + HASH_FIND(hh, _ze_shadow_cls, &key, sizeof(key), sh); + if (sh) + return sh; + + uint32_t ord = _get_compute_ordinal(device); + if (ord == (uint32_t)-1) { + fprintf(stderr, + "THAPI: device %p has no COMPUTE queue group; " + "cannot create shadow cl. 
Profiling disabled for " + "command lists on this device.\n", + (void *)device); + return NULL; + } + /* ASYNCHRONOUS mode is critical: with SYNCHRONOUS (the DEFAULT), + * each AppendQueryKernelTimestamps on this immediate cl blocks until + * the Query completes — which it can't, because Query is waiting on + * inj, and inj is signaled by the user cl's kernel that hasn't been + * submitted yet (we're called from the user's Execute prologue). + * Deadlock. ASYNCHRONOUS lets the Append return immediately and the + * Query run device-side at its own pace. */ + ze_command_queue_desc_t qd = { + ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, NULL, ord, 0, 0, ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; + ze_command_list_handle_t new_cl = NULL; + if (ZE_COMMAND_LIST_CREATE_IMMEDIATE_PTR(context, device, &qd, &new_cl) != ZE_RESULT_SUCCESS || + !new_cl) { + fprintf(stderr, + "THAPI: failed to create shadow cl for " + "context=%p device=%p\n", + (void *)context, (void *)device); + return NULL; + } + sh = (struct _ze_shadow_cl *)calloc(1, sizeof(*sh)); + if (!sh) { + ZE_COMMAND_LIST_DESTROY_PTR(new_cl); + return NULL; + } + sh->key = key; + sh->cl = new_cl; + HASH_ADD(hh, _ze_shadow_cls, key, sizeof(sh->key), sh); + return sh; +} + +/* Append AppendQueryKernelTimestamps on the shadow cl: wait on inj, + * signal shadow_done, write timestamps into slab[*off]. The state + * mutex also serializes the not-thread-safe-per-cl-handle L0 Append + * on the shared shadow cl. Aborts on L0 failure (defensive — a missing + * Query would silently drop this kernel's timing). 
*/ +static void _shadow_append_query(struct _ze_shadow_cl *sh, + ze_event_handle_t inj_event, + void *slab, + size_t *off, + ze_event_handle_t shadow_done_event) { + sh->live_queries++; + _ZE_MUST(ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR(sh->cl, 1, &inj_event, slab, off, + /*hSignalEvent=*/shadow_done_event, + /*numWaitEvents=*/1, &inj_event)); +} + +static inline void _on_create_command_list(ze_command_list_handle_t command_list, + ze_device_handle_t device, + uint32_t ordinal, + int immediate, + int in_order) { + struct _ze_command_list_obj_data *cl_data = + (struct _ze_command_list_obj_data *)calloc(1, sizeof(*cl_data)); + if (!cl_data) { + THAPI_DBGLOG("Failed to allocate memory"); + return; + } + cl_data->ptr = (void *)command_list; + cl_data->is_immediate = immediate ? 1 : 0; + cl_data->is_in_order = in_order ? 1 : 0; + + pthread_mutex_lock(&_ze_state_mutex); + /* _ordinal_is_compute touches the qgroup cache (state-mutex-protected). */ + cl_data->is_compute = _ordinal_is_compute(device, ordinal) ? 1 : 0; + if (_cl_find(command_list)) { + pthread_mutex_unlock(&_ze_state_mutex); + THAPI_DBGLOG("Command list already registered: %p", command_list); + free(cl_data); + return; + } + _cl_add(cl_data); + pthread_mutex_unlock(&_ze_state_mutex); +} + +/* Wrapper around an injected event we own. Lives either in the per-context + * free pool (between uses) or referenced from a slot (its inj / shadow_done + * field) in one of cl_data->chunks (in flight).
*/ struct _ze_event_h { ze_event_handle_t event; - UT_hash_handle hh; ze_event_pool_handle_t event_pool; ze_context_handle_t context; - _ze_event_flags_t flags; - /* to remember events in command lists */ + /* doubly-linked list pointers used by the per-context free pool */ struct _ze_event_h *next, *prev; }; -static struct _ze_event_h *_ze_events = NULL; -static pthread_mutex_t _ze_events_mutex = PTHREAD_MUTEX_INITIALIZER; - -#define FIND_ZE_EVENT(key, val) \ - do { \ - pthread_mutex_lock(&_ze_events_mutex); \ - HASH_FIND_PTR(_ze_events, key, val); \ - pthread_mutex_unlock(&_ze_events_mutex); \ - } while (0) - -#define ADD_ZE_EVENT(val) \ - do { \ - pthread_mutex_lock(&_ze_events_mutex); \ - HASH_ADD_PTR(_ze_events, event, val); \ - pthread_mutex_unlock(&_ze_events_mutex); \ - } while (0) - -#define FIND_AND_DEL_ZE_EVENT(key, val) \ - do { \ - pthread_mutex_lock(&_ze_events_mutex); \ - HASH_FIND_PTR(_ze_events, key, val); \ - if (val) { \ - HASH_DEL(_ze_events, val); \ - } \ - pthread_mutex_unlock(&_ze_events_mutex); \ - } while (0) - struct _ze_event_pool_entry { ze_context_handle_t context; UT_hash_handle hh; @@ -154,166 +578,196 @@ struct _ze_event_pool_entry { }; struct _ze_event_pool_entry *_ze_event_pools = NULL; -static pthread_mutex_t _ze_event_pools_mutex = PTHREAD_MUTEX_INITIALIZER; -#define GET_ZE_EVENT(key, val) \ - do { \ - struct _ze_event_pool_entry *pool = NULL; \ - pthread_mutex_lock(&_ze_event_pools_mutex); \ - HASH_FIND_PTR(_ze_event_pools, key, pool); \ - if (pool && pool->events) { \ - val = pool->events; \ - DL_DELETE(pool->events, val); \ - } else \ - val = NULL; \ - pthread_mutex_unlock(&_ze_event_pools_mutex); \ - } while (0) +/* Per-event tracer state, keyed by the user's event handle. Two facts live + * here, both populated around drain and both bound to the event's lifetime, so + * they share one uthash entry (one lookup, one alloc, one eviction): + * + * latest -> the most recent slot whose attr==ev. 
Resolves happens-before + * edges: when a new Append waits on ev, that slot becomes a pred. + * Set at instantiate; cleared at drain/dispose only if it still + * points at the draining slot (a newer Append may have overwritten + * it — don't clobber that). + * kts -> last kernel-timestamp result we drained for ev. The Append + * prologue swaps the user's signal for our inj, so the user's event + * carries QKT/barrier op timing, not the kernel's. At drain we read + * the real kernel result from the slab and stash it here so the + * user's own zeEventQueryKernelTimestamp can be served kernel + * timing; re-signaling overwrites. + * + * The whole entry is evicted by _on_destroy_event so a recycled handle address + * (the L0 driver reuses freed event addresses) never serves a dead event's + * latest slot (a dangling pred -> UAF) or stale kts. The value stays inline in + * the entry — no per-set heap box. */ +struct _ze_event_state_entry { + ze_event_handle_t ev; /* key */ + struct _ze_slot *latest; + ze_kernel_timestamp_result_t kts; + unsigned char has_kts; + UT_hash_handle hh; +}; +static struct _ze_event_state_entry *_ze_event_state = NULL; -#define PUT_ZE_EVENT(val) \ - do { \ - struct _ze_event_pool_entry *pool = NULL; \ - pthread_mutex_lock(&_ze_event_pools_mutex); \ - HASH_FIND_PTR(_ze_event_pools, &(val->context), pool); \ - if (!pool) { \ - pool = (struct _ze_event_pool_entry *)calloc(1, sizeof(struct _ze_event_pool_entry)); \ - if (!pool) { \ - THAPI_DBGLOG_NO_ARGS("Failed to allocate memory"); \ - pthread_mutex_unlock(&_ze_event_pools_mutex); \ - if (val->event_pool) { \ - if (val->event) \ - ZE_EVENT_DESTROY_PTR(val->event); \ - ZE_EVENT_POOL_DESTROY_PTR(val->event_pool); \ - } \ - free(val); \ - break; \ - } \ - pool->context = val->context; \ - HASH_ADD_PTR(_ze_event_pools, context, pool); \ - } \ - val->flags = 0; \ - ZE_EVENT_HOST_RESET_PTR(val->event); \ - DL_PREPEND(pool->events, val); \ - pthread_mutex_unlock(&_ze_event_pools_mutex); \ - } while (0) 
+/* Find-or-create the entry for ev. NULL only on ev==NULL or OOM. */ +static struct _ze_event_state_entry *_event_state_get_or_add(ze_event_handle_t ev) { + if (!ev) + return NULL; + struct _ze_event_state_entry *e = NULL; + HASH_FIND_PTR(_ze_event_state, &ev, e); + if (!e) { + e = (struct _ze_event_state_entry *)calloc(1, sizeof(*e)); + if (!e) + return NULL; + e->ev = ev; + HASH_ADD_PTR(_ze_event_state, ev, e); + } + return e; +} -struct _ze_event_h *_ze_event_wrappers = NULL; -static pthread_mutex_t _ze_event_wrappers_mutex = PTHREAD_MUTEX_INITIALIZER; +/* Drop the entry if it carries nothing worth keeping (no latest slot, no + * stashed kts) — keeps the map bounded as facts are cleared. */ +static inline void _event_state_gc(struct _ze_event_state_entry *e) { + if (e && !e->latest && !e->has_kts) { + HASH_DEL(_ze_event_state, e); + free(e); + } +} -#define GET_ZE_EVENT_WRAPPER(val) \ - do { \ - pthread_mutex_lock(&_ze_event_wrappers_mutex); \ - if (_ze_event_wrappers) { \ - val = _ze_event_wrappers; \ - DL_DELETE(_ze_event_wrappers, val); \ - } else { \ - val = calloc(1, sizeof(struct _ze_event_h)); \ - } \ - pthread_mutex_unlock(&_ze_event_wrappers_mutex); \ - } while (0) +static inline struct _ze_slot *_event_latest_get(ze_event_handle_t ev) { + struct _ze_event_state_entry *e = NULL; + HASH_FIND_PTR(_ze_event_state, &ev, e); + return e ? e->latest : NULL; +} -#define PUT_ZE_EVENT_WRAPPER(val) \ - do { \ - memset(val, 0, sizeof(struct _ze_event_h)); \ - pthread_mutex_lock(&_ze_event_wrappers_mutex); \ - DL_PREPEND(_ze_event_wrappers, val); \ - pthread_mutex_unlock(&_ze_event_wrappers_mutex); \ - } while (0) +static inline void _event_latest_set(ze_event_handle_t ev, struct _ze_slot *slot) { + struct _ze_event_state_entry *e = _event_state_get_or_add(ev); + if (e) + e->latest = slot; +} -/* Snapshot context + immediate-flag from cmdlist into the event wrapper. 
- * The immediate flag is read at register time (not at _on_reset_event - * time) because by reset time the cmdlist may already be destroyed and - * zeCommandListIsImmediate would dereference a freed handle. */ -static inline void _tag_event_from_cl(struct _ze_event_h *_ze_event, - ze_command_list_handle_t command_list) { - ze_context_handle_t context = NULL; - ze_result_t res = ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &context); - if (res == ZE_RESULT_SUCCESS && context) - _ze_event->context = context; - else - THAPI_DBGLOG("zeCommandListGetContextHandle failed with %d for command list: %p", res, - command_list); - - ze_bool_t is_immediate = 0; - if (ZE_COMMAND_LIST_IS_IMMEDIATE_PTR(command_list, &is_immediate) == ZE_RESULT_SUCCESS && - is_immediate) - _ze_event->flags |= _ZE_IMMEDIATE_CMD; -} - -/* Append an event wrapper we own to its cmdlist's events list, under the - * cl-hash lock (the FIND_AND_DEL/ADD pattern guards cl_data against a - * concurrent free in _on_destroy_command_list). */ -static inline void _attach_event_to_cl(struct _ze_event_h *_ze_event, - ze_command_list_handle_t command_list) { - struct _ze_command_list_obj_data *cl_data = NULL; - FIND_AND_DEL_ZE_CL(&command_list, cl_data); - if (!cl_data) { - THAPI_DBGLOG("Could not get command list associated to event: %p", _ze_event->event); +/* Clear latest iff it still points at `slot` (a newer Append may own it now). */ +static inline void _event_latest_clear_if(ze_event_handle_t ev, struct _ze_slot *slot) { + if (!ev) return; + struct _ze_event_state_entry *e = NULL; + HASH_FIND_PTR(_ze_event_state, &ev, e); + if (e && e->latest == slot) { + e->latest = NULL; + _event_state_gc(e); } - DL_APPEND(cl_data->events, _ze_event); - ADD_ZE_CL(cl_data); } -/* Register an injected (tracer-owned) event. Caller has already populated - * _ze_event->event and _ze_event->event_pool via _get_profiling_event. 
*/ -static inline void _register_our_event(struct _ze_event_h *_ze_event, - ze_command_list_handle_t command_list) { - _tag_event_from_cl(_ze_event, command_list); - _attach_event_to_cl(_ze_event, command_list); - ADD_ZE_EVENT(_ze_event); +static inline void _event_kts_set(ze_event_handle_t ev, ze_kernel_timestamp_result_t val) { + struct _ze_event_state_entry *e = _event_state_get_or_add(ev); + if (e) { + e->kts = val; + e->has_kts = 1; + } } -/* Register a user event (we don't own its lifetime). Look up or create the - * wrapper; users are responsible for reset/destroy, so we don't attach it - * to the cl's events list. */ -static inline void _register_user_event(ze_event_handle_t event, - ze_command_list_handle_t command_list) { - struct _ze_event_h *_ze_event = NULL; - FIND_ZE_EVENT(&event, _ze_event); - if (_ze_event) - return; /* already tracked, nothing more to do */ +static inline int _event_kts_get(ze_event_handle_t ev, ze_kernel_timestamp_result_t *out) { + struct _ze_event_state_entry *e = NULL; + HASH_FIND_PTR(_ze_event_state, &ev, e); + if (!e || !e->has_kts) + return 0; + *out = e->kts; + return 1; +} - GET_ZE_EVENT_WRAPPER(_ze_event); - if (!_ze_event) { - THAPI_DBGLOG("Could not get event wrapper for: %p", event); +/* Evict the whole entry (both facts) — called when the event is destroyed. */ +static inline void _event_state_del(ze_event_handle_t ev) { + if (!ev) return; + struct _ze_event_state_entry *e = NULL; + HASH_FIND_PTR(_ze_event_state, &ev, e); + if (e) { + HASH_DEL(_ze_event_state, e); + free(e); } - /* GET_ZE_EVENT_WRAPPER returns a fully-zeroed wrapper (calloc on first use, - * memset by PUT_ZE_EVENT_WRAPPER on recycle), so event_pool and flags are - * already 0 — only set the fields we actually want non-zero. 
*/ - _ze_event->event = event; - - _tag_event_from_cl(_ze_event, command_list); - ADD_ZE_EVENT(_ze_event); } -static struct _ze_event_h *_get_profiling_event(ze_command_list_handle_t command_list) { - struct _ze_event_h *e_w; - - ze_context_handle_t context = NULL; - ze_result_t res = ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(command_list, &context); - if (res != ZE_RESULT_SUCCESS || !context) { - THAPI_DBGLOG("zeCommandListGetContextHandle failed with %d, for command list: %p", res, - command_list); +/* Pop one recycled event wrapper from the per-context freelist; NULL + * if none cached (caller falls back to creating a fresh L0 event). */ +static struct _ze_event_h *_get_ze_event(ze_context_handle_t context) { + struct _ze_event_pool_entry *pool = NULL; + HASH_FIND_PTR(_ze_event_pools, &context, pool); + if (!pool || !pool->events) return NULL; + struct _ze_event_h *e = pool->events; + DL_DELETE(pool->events, e); + return e; +} + +/* Return an event wrapper to its per-context freelist. On total failure + * (no bucket can be allocated), destroy the backing L0 objects and free + * the wrapper — we'd rather leak nothing than poison the freelist. */ +static void _put_ze_event(struct _ze_event_h *val) { + _ZE_MUST(ZE_EVENT_HOST_RESET_PTR(val->event)); + struct _ze_event_pool_entry *pool = NULL; + HASH_FIND_PTR(_ze_event_pools, &val->context, pool); + if (!pool) { + pool = (struct _ze_event_pool_entry *)calloc(1, sizeof(*pool)); + if (!pool) { + THAPI_DBGLOG("Failed to allocate memory"); + if (val->event_pool) { + if (val->event) + ZE_EVENT_DESTROY_PTR(val->event); + ZE_EVENT_POOL_DESTROY_PTR(val->event_pool); + } + free(val); + return; + } + pool->context = val->context; + HASH_ADD_PTR(_ze_event_pools, context, pool); } - GET_ZE_EVENT(&context, e_w); + DL_PREPEND(pool->events, val); +} + +struct _ze_event_h *_ze_event_wrappers = NULL; + +/* Get a zeroed event wrapper struct: pop from the global recycle list if + * any, else calloc a fresh one. 
The wrapper is context-agnostic — only + * the backing L0 event + pool inside it bind to a specific ctx. */ +static struct _ze_event_h *_get_ze_event_wrapper(void) { + struct _ze_event_h *e = _ze_event_wrappers; + if (e) + DL_DELETE(_ze_event_wrappers, e); + else + e = (struct _ze_event_h *)calloc(1, sizeof(*e)); + return e; +} + +/* Return a wrapper struct to the recycle list. Used in two situations: + * 1) wrapper construction failed, no L0 objects ever attached; + * 2) the wrapper's context is being destroyed — caller has already + * arranged for the L0 event/pool inside to be released (or left + * them to die with the context). + * We zero before publishing so a future _get_ze_event_wrapper returns + * something equivalent to a fresh calloc. */ +static void _put_ze_event_wrapper(struct _ze_event_h *val) { + memset(val, 0, sizeof(*val)); + DL_PREPEND(_ze_event_wrappers, val); +} + +/* Caller-supplied ctx avoids a redundant zeCommandListGetContextHandle + * (the prologue already fetched it). L0 event/pool create runs under + * the state mutex; cold path, bounded cost. 
*/ +static struct _ze_event_h *_get_profiling_event(ze_context_handle_t context) { + struct _ze_event_h *e_w = _get_ze_event(context); if (e_w) return e_w; - - GET_ZE_EVENT_WRAPPER(e_w); + e_w = _get_ze_event_wrapper(); if (!e_w) { - THAPI_DBGLOG("Could not create a new event wrapper for command list: %p", command_list); + THAPI_DBGLOG("Could not create a new event wrapper for context: %p", context); return NULL; } ze_event_pool_desc_t desc = { ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, NULL, ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP | ZE_EVENT_POOL_FLAG_HOST_VISIBLE, 1}; - res = ZE_EVENT_POOL_CREATE_PTR(context, &desc, 0, NULL, &e_w->event_pool); + ze_result_t res = ZE_EVENT_POOL_CREATE_PTR(context, &desc, 0, NULL, &e_w->event_pool); if (res != ZE_RESULT_SUCCESS) { - THAPI_DBGLOG("zeEventPoolCreate failed with %d, for command list: %p, context: %p", res, - command_list, context); + THAPI_DBGLOG("zeEventPoolCreate failed with %d, for context: %p", res, context); goto cleanup_wrapper; } ze_event_desc_t e_desc = {ZE_STRUCTURE_TYPE_EVENT_DESC, NULL, 0, ZE_EVENT_SCOPE_FLAG_HOST, @@ -328,188 +782,782 @@ static struct _ze_event_h *_get_profiling_event(ze_command_list_handle_t command cleanup_ep: ZE_EVENT_POOL_DESTROY_PTR(e_w->event_pool); cleanup_wrapper: - PUT_ZE_EVENT_WRAPPER(e_w); + _put_ze_event_wrapper(e_w); return NULL; } -static void _profile_event_results(ze_event_handle_t event) { - ze_kernel_timestamp_result_t res = {0}; - ze_result_t status; - ze_result_t timestamp_status; +/* Unlink chunk c from cl_data->chunks and free its slab + struct. + * `free_slab` controls whether to issue zeMemFree on the slab — false when + * the chunk's context is being destroyed (driver reclaims; zeMemFree on a + * doomed ctx is at best racy). Slot-side cleanup (events, waits, preds) + * is the caller's responsibility — this helper only owns the chunk + * envelope and the slab. 
*/ +static void +_cl_chunk_free(struct _ze_command_list_obj_data *cl_data, struct _ze_slab_chunk *c, int free_slab) { + DL_DELETE(cl_data->chunks, c); + if (free_slab && c->slab) + ZE_MEM_FREE_PTR(c->slab_ctx, c->slab); + free(c); +} - if (tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results)) { - status = ZE_EVENT_QUERY_STATUS_PTR(event); - timestamp_status = ZE_EVENT_QUERY_KERNEL_TIMESTAMP_PTR(event, &res); - do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, event, status, timestamp_status, - res.global.kernelStart, res.global.kernelEnd, res.context.kernelStart, - res.context.kernelEnd); +/* Allocate a new chunk and append it to cl_data->chunks. */ +static struct _ze_slab_chunk *_cl_chunk_alloc(struct _ze_command_list_obj_data *cl_data, + ze_context_handle_t ctx) { + struct _ze_slab_chunk *c = (struct _ze_slab_chunk *)calloc(1, sizeof(*c)); + if (!c) + return NULL; + size_t bytes = (size_t)_ZE_SLAB_CHUNK_SLOTS * sizeof(ze_kernel_timestamp_result_t); + ze_host_mem_alloc_desc_t hd = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, NULL, 0}; + if (ZE_MEM_ALLOC_HOST_PTR(ctx, &hd, bytes, sizeof(uint64_t), &c->slab) != ZE_RESULT_SUCCESS || + !c->slab) { + free(c); + return NULL; } + memset(c->slab, 0, bytes); + c->slab_ctx = ctx; + DL_APPEND(cl_data->chunks, c); + return c; } -static inline void _on_destroy_event(ze_event_handle_t event) { - struct _ze_event_h *ze_event = NULL; +/* Allocate one new slot at the tail of cl_data->chunks. Grows by one + * chunk for imm cls; regular cls stay at one chunk and return NULL when + * full (their inj events are baked into the closed cl body, so storage + * must keep addressing them via the same (slab, off) pair). */ +static struct _ze_slot *_cl_slot_append(struct _ze_command_list_obj_data *cl_data, + ze_context_handle_t ctx, + struct _ze_event_h *inj, + struct _ze_event_h *shadow_done, + ze_event_handle_t attr, + ze_event_handle_t *waits, + uint32_t n_waits) { + struct _ze_slab_chunk *tail = cl_data->chunks ? 
cl_data->chunks->prev : NULL; + if (!tail || tail->n_used >= _ZE_SLAB_CHUNK_SLOTS) { + if (tail && !cl_data->is_immediate) { + /* Regular cl is capped at one chunk (inj events are baked into the + * closed cl body, so storage can't move). Past the cap we drop the + * Append's profiling silently — warn once so the data loss is at + * least visible. Called under _ze_state_mutex, so the guard is safe. */ + static int warned = 0; + if (!warned) { + warned = 1; + _THAPI_LOG("warning: regular command list %p exceeded %d profiled " + "Appends in one build; further Appends will not be timed", + (void *)cl_data->ptr, _ZE_SLAB_CHUNK_SLOTS); + } + return NULL; + } + tail = _cl_chunk_alloc(cl_data, ctx); + if (!tail) + return NULL; + } + uint32_t idx = tail->n_used; + struct _ze_slot *s = &tail->slots[idx]; + /* Chunk memory is calloc'd, so all other slot fields are already zero. */ + s->owner = cl_data; + s->chunk = tail; + s->inj = inj; + s->shadow_done = shadow_done; + s->attr = attr; + s->off = (size_t)idx * sizeof(ze_kernel_timestamp_result_t); + if (n_waits) { + s->waits = (ze_event_handle_t *)malloc(n_waits * sizeof(ze_event_handle_t)); + if (s->waits) { + memcpy(s->waits, waits, n_waits * sizeof(ze_event_handle_t)); + s->n_waits = n_waits; + } + } + tail->n_used++; + tail->n_held++; + return s; +} - FIND_AND_DEL_ZE_EVENT(&event, ze_event); - if (!ze_event) { - return; +/* Compute s->preds from s->waits via the global event_latest_signaled + * map, plus the previous live slot on this cl if the cl is in-order. + * Marks s live and publishes s as the new event_latest_signaled[attr]. */ +static void _slot_instantiate(struct _ze_command_list_obj_data *cl_data, struct _ze_slot *s) { + /* Slot must be inert: live=0, preds NULL. Re-instantiating a live slot + * would overwrite preds[] (leaking the prior pred refs) and let the + * in-order pred walk pick up later-appended live siblings as predecessors, + * forming cycles that infinite-loop _slot_drain. 
*/ + _THAPI_ASSERT(!s->live, "slot %p already live (double _slot_instantiate)", (void *)s); + s->live = 1; + uint32_t cap = s->n_waits + 1; /* +1 for in-order prev */ + s->preds = (struct _ze_slot **)calloc(cap, sizeof(struct _ze_slot *)); + s->n_preds = 0; + for (uint32_t i = 0; i < s->n_waits; ++i) { + struct _ze_slot *p = _event_latest_get(s->waits[i]); + if (p && p->live) + s->preds[s->n_preds++] = p; + } + if (cl_data->is_in_order) { + /* Walk chunks newest-to-oldest, slots high-to-low, stop at the first + * live slot strictly before s. Chunks are appended in time order + * (DL_APPEND) and slots within a chunk in time order, so reverse-walk + * yields reverse time order. Skip s itself; s might still have + * live=0 here but the !=s guard is safe and clearer. */ + struct _ze_slab_chunk *c; + struct _ze_slot *prev = NULL; + for (c = cl_data->chunks ? cl_data->chunks->prev : NULL; c && !prev; + c = (c == cl_data->chunks) ? NULL : c->prev) { + for (int32_t i = (int32_t)c->n_used - 1; i >= 0; --i) { + if (&c->slots[i] == s) + continue; + if (c->slots[i].live) { + prev = &c->slots[i]; + break; + } + } + } + if (prev) + s->preds[s->n_preds++] = prev; } + /* Each new pred edge holds a ref on its target. */ + for (uint32_t i = 0; i < s->n_preds; ++i) + s->preds[i]->refs++; + if (s->attr) + _event_latest_set(s->attr, s); +} - _profile_event_results(event); - PUT_ZE_EVENT_WRAPPER(ze_event); +/* Publish a fresh slot: shadow path appends a Query on the per-(ctx,device) + * shadow cl; inline path is a no-op here (its QKT is baked into the user cl + * body at Append). Then instantiate in the dep graph. `s->shadow_done` is + * the single source of truth for "shadow vs inline" — no is_compute branch + * at the call site. 
*/ +static void _slot_publish(struct _ze_command_list_obj_data *cl_data, + struct _ze_slot *s, + struct _ze_shadow_cl *sh) { + if (s->shadow_done) { + _THAPI_ASSERT(sh, "shadow-path slot needs a shadow cl"); + _shadow_append_query(sh, s->inj->event, s->chunk->slab, &s->off, s->shadow_done->event); + s->sh = sh; + } + _slot_instantiate(cl_data, s); } -/* Caller already holds the wrapper (e.g. iterating cl_data->events) and - * has removed it from any per-cl list. Drops it from the global events - * hash, optionally emits its timestamp tracepoint, and recycles. */ -static inline void _unregister_ze_event(struct _ze_event_h *ze_event, int get_results) { - struct _ze_event_h *evicted = NULL; - FIND_AND_DEL_ZE_EVENT(&ze_event->event, evicted); - /* evicted should be == ze_event; if not, our hash bookkeeping is corrupt. */ +/* INLINE path: bake the QKT into the user cl body (wait=inj, sig=user_signal). + * Fires when Appended for immediate cls and on every Execute for regular cls + * (it is now part of the cl body). The QKT signaling user_signal IS the + * user_signal chain — no separate barrier needed. */ +static void _append_inline_query(ze_command_list_handle_t command_list, + struct _ze_slot *s, + ze_event_handle_t inj_event, + ze_event_handle_t user_signal) { + _ZE_MUST(ZE_COMMAND_LIST_APPEND_QUERY_KERNEL_TIMESTAMPS_PTR( + command_list, 1, &inj_event, s->chunk->slab, &s->off, user_signal, 1, &inj_event)); +} - if (get_results) - _profile_event_results(ze_event->event); - if (ze_event->event_pool) - PUT_ZE_EVENT(ze_event); - else - PUT_ZE_EVENT_WRAPPER(ze_event); +/* Chain the user's signal event off our inj on the user cl: the prologue + * swapped user_signal for inj, so without this the user's Sync(user_signal) + * would hang forever. No-op (returns 0) when the user passed no signal; + * returns 1 when the barrier was appended. 
Mutex-agnostic — it issues an + * L0 Append on the user cl and touches no tracer state, so it is correct + * both inside the critical section (shadow path) and outside it (the + * failure-path compensation). Aborts on L0 failure (a silent hang is worse). */ +static int _chain_user_signal(ze_command_list_handle_t command_list, + ze_event_handle_t inj_event, + ze_event_handle_t user_signal) { + if (!user_signal) + return 0; + _ZE_MUST(ZE_COMMAND_LIST_APPEND_BARRIER_PTR(command_list, user_signal, 1, &inj_event)); + return 1; } -static inline void _on_reset_event(ze_event_handle_t event) { - struct _ze_event_h *ze_event = NULL; +/* Roll back the slot just handed out by _cl_slot_append. We were the last to + * touch the tail chunk and hold _ze_state_mutex, so decrementing n_used/n_held + * and zeroing the slot is safe; if the chunk was freshly allocated only for + * this Append (n_used now 0), free it back so a slot-append failure doesn't + * leak a chunk. */ +static void _slot_append_rollback(struct _ze_command_list_obj_data *cl_data, struct _ze_slot *s) { + free(s->waits); + struct _ze_slab_chunk *c = s->chunk; + c->n_used--; + c->n_held--; + memset(s, 0, sizeof(*s)); + if (c->n_used == 0) + _cl_chunk_free(cl_data, c, /*free_slab=*/1); +} - FIND_AND_DEL_ZE_EVENT(&event, ze_event); - if (!ze_event) { - THAPI_DBGLOG("Could not find event: %p", event); +/* Append-time hook from profiling_epilogue. The prologue swapped user's + * hSignalEvent for inj->event; user_signal is the original (possibly NULL), + * user_waits is the user's wait list, ctx is the cl's context (fetched + * once in the prologue, threaded in). Forks on cl_data->is_compute to + * pick the QKT placement — see "QKT placement" in the file header. 
*/
static void _universal_record_append(ze_command_list_handle_t command_list,
                                     ze_context_handle_t ctx,
                                     struct _ze_event_h *inj,
                                     ze_event_handle_t user_signal,
                                     ze_event_handle_t *user_waits,
                                     uint32_t user_n_waits) {
  if (!inj || !ctx)
    return;
  /* Every local is declared (and initialised) before the first
   * `goto fail_locked`, so no jump skips an initialisation. */
  struct _ze_command_list_obj_data *cl_data = NULL;
  struct _ze_event_h *shadow_done = NULL;
  struct _ze_slot *s = NULL;
  int barrier_chained = 0;
  int inline_path = 0;
  /* Snapshot the injected event handle while we still own the wrapper: the
   * failure path hands `inj` back to the pool and drops the mutex BEFORE it
   * chains user_signal, so `inj->event` must not be read at that point. */
  const ze_event_handle_t inj_event = inj->event;

  inj->context = ctx;

  pthread_mutex_lock(&_ze_state_mutex);
  cl_data = _cl_find(command_list);
  if (!cl_data)
    goto fail_locked;
  inline_path = cl_data->is_compute;

  /* Shadow path needs a fence event (Query lives on the shadow cl;
   * drain host-syncs on it). Inline path uses user_signal as the fence
   * via the dep graph, no extra event needed. */
  if (!inline_path) {
    shadow_done = _get_profiling_event(ctx);
    if (!shadow_done)
      goto fail_locked;
    shadow_done->context = ctx;
  }

  /* Publish the cl->ctx mapping. _on_execute_one_cl reads it directly
   * (no fallback fetch) when resolving the shadow cl, and
   * _on_destroy_context's per-cl sweep matches against it. */
  cl_data->cached_context = ctx;

  s = _cl_slot_append(cl_data, ctx, inj, shadow_done, user_signal, user_waits, user_n_waits);
  if (!s)
    goto fail_locked;

  if (inline_path) {
    _append_inline_query(command_list, s, inj_event, user_signal);
    barrier_chained = 1; /* user_signal chained via the QKT itself */
    _slot_instantiate(cl_data, s);
    pthread_mutex_unlock(&_ze_state_mutex);
    return;
  }

  /* Shadow path: chain user_signal off inj on the user cl, then place
   * the Query on the shadow cl (immediate cls only — regular cls defer
   * to the Execute epilogue, see _on_execute_one_cl). */
  barrier_chained = _chain_user_signal(command_list, inj_event, user_signal);
  if (cl_data->is_immediate) {
    ze_device_handle_t dev = NULL;
    struct _ze_shadow_cl *sh = NULL;
    _ZE_MUST(ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(command_list, &dev));
    sh = _get_shadow_cl(ctx, dev);
    if (!sh)
      goto fail_locked;
    _slot_publish(cl_data, s, sh);
  }
  pthread_mutex_unlock(&_ze_state_mutex);
  return;

fail_locked:
  /* Give back whatever was acquired so far: the slot (if one was handed
   * out), the shadow fence (if one was fetched), and the injected event. */
  if (s)
    _slot_append_rollback(cl_data, s);
  if (shadow_done)
    _put_ze_event(shadow_done);
  _put_ze_event(inj);
  pthread_mutex_unlock(&_ze_state_mutex);
  /* Compensate outside the state mutex: if we bailed before chaining
   * user_signal off inj, do it now or the user's Sync(user_signal) hangs.
   * Uses the handle snapshot, not the wrapper we just returned to the pool. */
  if (!barrier_chained)
    _chain_user_signal(command_list, inj_event, user_signal);
}

/* Dispose the per-slot resources shared by every teardown path: the inj and
 * shadow_done events, the waits[] copy, the preds[] array, and the slot's
 * entry in event_latest_signaled. The event-disposal target differs by caller:
 *   _ZE_DISPOSE_POOL    -> _put_ze_event (ctx alive: events recycle to the pool)
 *   _ZE_DISPOSE_WRAPPER -> _put_ze_event_wrapper (ctx dying: only recycle the
 *                          wrapper struct; the L0 event/pool die with the ctx)
 * Deliberately does NOT touch chunk accounting (n_held / n_pinned), refs,
 * owner, or live — those are caller-specific and stay at the call site.
 * Every field is nulled so the call is idempotent (safe to re-run on a slot
 * whose preds/latest-signaled were already cleared during drain).
*/ +enum _ze_slot_dispose_mode { _ZE_DISPOSE_POOL, _ZE_DISPOSE_WRAPPER }; +static void _slot_dispose_resources(struct _ze_slot *s, enum _ze_slot_dispose_mode mode) { + if (s->inj) { + if (mode == _ZE_DISPOSE_WRAPPER) + _put_ze_event_wrapper(s->inj); + else + _put_ze_event(s->inj); + s->inj = NULL; + } + if (s->shadow_done) { + if (mode == _ZE_DISPOSE_WRAPPER) + _put_ze_event_wrapper(s->shadow_done); + else + _put_ze_event(s->shadow_done); + s->shadow_done = NULL; + } + free(s->waits); + s->waits = NULL; + s->n_waits = 0; + free(s->preds); + s->preds = NULL; + s->n_preds = 0; + _event_latest_clear_if(s->attr, s); + s->attr = NULL; +} - FIND_AND_DEL_ZE_EVENT(&event, ze_event); - if (!ze_event) { - THAPI_DBGLOG("Could not find event: %p", event); +/* Reclaim a slot: PUT events back to the per-context pool, free waits, + * decrement chunk n_held; if the chunk hits 0 AND isn't the active + * tail, unlink and free it. Regular cls are skipped (their inj is + * baked into the cl body — reclaim happens at cl destroy instead). */ +static void _slot_release(struct _ze_slot *s) { + if (!s) + return; + /* Detached slot: its owning cl was torn down (reset/destroy) while this + * slot was still a pred of a live slot elsewhere. Its resources were freed + * at reclaim and owner was nulled; the chunk struct was kept alive only to + * keep this slot's refs addressable. We are the downstream drain dropping + * the last ref — drop the chunk's pin and free the bare struct at zero. */ + if (!s->owner && s->chunk && s->chunk->n_pinned) { + struct _ze_slab_chunk *c = s->chunk; + if (--c->n_pinned == 0) + free(c); return; } + if (!s->owner || !s->owner->is_immediate) + return; + /* Reached only from _slot_drain, which already freed s->preds and cleared + * event_latest_signaled[s->attr]; the primitive re-running those is a no-op + * (free(NULL); _clear_if on a missing/overwritten key does nothing). 
*/ + _slot_dispose_resources(s, _ZE_DISPOSE_POOL); + + struct _ze_slab_chunk *c = s->chunk; + struct _ze_command_list_obj_data *cl = s->owner; + if (!c) + return; + c->n_held--; + if (c->n_held == 0 && c != cl->chunks->prev) + _cl_chunk_free(cl, c, /*free_slab=*/1); +} - _profile_event_results(event); - ZE_EVENT_HOST_RESET_PTR(event); - ADD_ZE_EVENT(ze_event); +/* Drain one slot. Recurses on its preds, emits the slot's tracepoint, + * drops one ref on each pred (releasing fully-drained-and-unreferenced + * preds), then releases s if its own refs hit 0. Safe to call on an + * already-drained (live=0) slot. Slab read uses s->chunk->slab — preds + * may live in another cl, so we can't use the caller's slab. + * + * No cycle guard: preds come from in-order prev (strictly earlier slot + * in the same cl, DAG) and from event_latest_signaled[wait_event] (a + * slot published BEFORE us). A cycle would need user-declared mutual + * waits, which L0 itself deadlocks on. */ +static void _slot_drain(struct _ze_slot *s) { + if (!s || !s->live) + return; + for (uint32_t i = 0; i < s->n_preds; ++i) + _slot_drain(s->preds[i]); + s->live = 0; + /* Shadow-path only: block until the Query has fired, then reset + * shadow_done so the next Execute round starts with a clean event. + * The user's own sync doesn't cover the Query because it runs on the + * shadow cl. Inline-path slots have shadow_done==NULL — their QKT + * lives in the user cl body and the dep-graph walk that brought us + * here already implies it has run. */ + if (s->shadow_done && s->shadow_done->event) { + _ZE_MUST(ZE_EVENT_HOST_SYNCHRONIZE_PTR(s->shadow_done->event, UINT64_MAX)); + _ZE_MUST(ZE_EVENT_HOST_RESET_PTR(s->shadow_done->event)); + /* QKT completed device-side. Drop the live ref; if nothing else on + * this shadow cl is in flight, Reset it: the L0 driver leaks ~10 KB + * per AppendQueryKernelTimestamps and only reclaims at Reset/Destroy. 
*/ + if (s->sh) { + s->sh->live_queries--; + if (s->sh->live_queries == 0) + _ZE_MUST(ZE_COMMAND_LIST_RESET_PTR(s->sh->cl)); + } + } + ze_event_handle_t attr = s->attr ? s->attr : (s->inj ? s->inj->event : NULL); + if (s->chunk && s->chunk->slab && attr) { + ze_kernel_timestamp_result_t r = + *(ze_kernel_timestamp_result_t *)((char *)s->chunk->slab + s->off); + /* Stash the kernel result under the user's own event so the user's + * zeEventQueryKernelTimestamp returns kernel timing, not the QKT/barrier + * op timing their event actually carries (we swapped it for inj). Only + * when the user supplied an event (s->attr); inj is ours, not queryable. */ + if (s->attr) + _event_kts_set(s->attr, r); + if (tracepoint_enabled(lttng_ust_ze_profiling, event_profiling_results)) + do_tracepoint(lttng_ust_ze_profiling, event_profiling_results, attr, ZE_RESULT_SUCCESS, + ZE_RESULT_SUCCESS, r.global.kernelStart, r.global.kernelEnd, + r.context.kernelStart, r.context.kernelEnd); + } + _event_latest_clear_if(s->attr, s); + /* Drop refs on preds; release any that hit 0 and are already drained. */ + for (uint32_t i = 0; i < s->n_preds; ++i) { + struct _ze_slot *p = s->preds[i]; + if (--p->refs == 0 && !p->live) + _slot_release(p); + } + free(s->preds); + s->preds = NULL; + s->n_preds = 0; + if (s->refs == 0) + _slot_release(s); } -/* Tear down a wrapper: optionally emit its timestamp tracepoint, then - * destroy the injected event+pool if we own them, then recycle the - * wrapper. Caller must have already removed it from any list/hash that - * references it. 
*/ -static inline void _dispose_event_wrapper(struct _ze_event_h *ze_event, int do_dump) { - if (do_dump && ze_event->event) - _profile_event_results(ze_event->event); - if (ze_event->event_pool) { - if (ze_event->event) - ZE_EVENT_DESTROY_PTR(ze_event->event); - ZE_EVENT_POOL_DESTROY_PTR(ze_event->event_pool); +/* Drain every live slot in a cl (walk chunks oldest-to-newest, slots + * low-to-high — natural time order for emission). */ +static void _cl_drain(struct _ze_command_list_obj_data *cl_data) { + struct _ze_slab_chunk *c, *tmp; + DL_FOREACH_SAFE (cl_data->chunks, c, tmp) { + /* Bump refcount during traversal so the last _slot_drain doesn't + * free c out from under the inner loop. Drop after, free here. */ + c->n_held++; + for (uint32_t i = 0; i < c->n_used; ++i) + _slot_drain(&c->slots[i]); + c->n_held--; + if (c->n_held == 0 && c != cl_data->chunks->prev) + _cl_chunk_free(cl_data, c, /*free_slab=*/1); } - PUT_ZE_EVENT_WRAPPER(ze_event); + _cl_index_clear(cl_data); + cl_data->in_flight_q = NULL; + cl_data->in_flight_fence = NULL; +} + +static void _cl_data_reset(struct _ze_command_list_obj_data *cl_data); /* fwd */ + +/* 1 if any slot in the cl is still in flight (instantiated, not yet drained). */ +static int _cl_any_live(struct _ze_command_list_obj_data *cl_data) { + _ZE_FOREACH_SLOT (cl_data, s) + if (s->live) + return 1; + return 0; +} + +/* Immediate cls only: once every slot in the cl is drained, raw-Reset the + * user's cl so the L0 driver reclaims its per-QKT storage (it accumulates + * otherwise on a long-lived reused immediate cl — see bench/mem_persistent_cl), + * then reclaim our own slot bookkeeping (the baked state is gone after the + * driver reset, exactly like a user zeCommandListReset on a regular cl). + * Raw *_PTR = untraced; safe only when no slot is still live (no in-flight + * work). Called at the tail of every sync-drain path that can touch an imm cl. 
*/ +static void _imm_reset_if_drained(struct _ze_command_list_obj_data *cl_data) { + if (!cl_data || !cl_data->is_immediate || _cl_any_live(cl_data)) + return; + ZE_COMMAND_LIST_RESET_PTR((ze_command_list_handle_t)cl_data->ptr); + _cl_data_reset(cl_data); } -static void _event_cleanup() { - struct _ze_event_h *ze_event = NULL; - struct _ze_event_h *tmp = NULL; - HASH_ITER(hh, _ze_events, ze_event, tmp) { - HASH_DEL(_ze_events, ze_event); - _dispose_event_wrapper(ze_event, 1); +/* Reclaim one chunk during cl teardown (reset or single-cl destroy, ctx + * alive). Releases every slot's resources (events to pool, waits, preds, + * clears latest-signaled), then either frees the chunk or — if any slot is + * still referenced as a pred by a live slot in ANOTHER cl (refs>0) — DETACHES + * it: unlink from cl_data->chunks, null each slot's owner, and keep the bare + * struct alive with n_pinned = #referenced slots. The downstream drains that + * drop those refs free the struct (see _slot_release's detached branch). + * Without this, freeing the chunk here would dangle the referrers' preds[]. */ +static void _cl_chunk_reclaim(struct _ze_command_list_obj_data *cl_data, struct _ze_slab_chunk *c) { + uint32_t pinned = 0; + for (uint32_t i = 0; i < c->n_used; ++i) { + struct _ze_slot *s = &c->slots[i]; + _slot_dispose_resources(s, _ZE_DISPOSE_POOL); + if (s->refs) + pinned++; + } + if (pinned == 0) { + _cl_chunk_free(cl_data, c, /*free_slab=*/1); + return; + } + /* Detach: keep the struct alive for the surviving referenced slots. 
*/ + DL_DELETE(cl_data->chunks, c); + if (c->slab) { + ZE_MEM_FREE_PTR(c->slab_ctx, c->slab); + c->slab = NULL; } + for (uint32_t i = 0; i < c->n_used; ++i) + c->slots[i].owner = NULL; + c->n_pinned = pinned; } -static void _on_destroy_context(ze_context_handle_t context) { - struct _ze_event_h *ze_event = NULL; - struct _ze_event_h *tmp = NULL; - pthread_mutex_lock(&_ze_events_mutex); - HASH_ITER(hh, _ze_events, ze_event, tmp) { - if (ze_event->context == context) { - HASH_DEL(_ze_events, ze_event); - _dispose_event_wrapper(ze_event, 1); - } +/* Reclaim all of a regular cl's slot state, keeping cl_data registered and + * empty for reuse. Used by the zeCommandListReset hook. */ +static void _cl_data_reset(struct _ze_command_list_obj_data *cl_data) { + struct _ze_slab_chunk *c, *tmp; + DL_FOREACH_SAFE (cl_data->chunks, c, tmp) + _cl_chunk_reclaim(cl_data, c); + _cl_index_clear(cl_data); + cl_data->in_flight_q = NULL; + cl_data->in_flight_fence = NULL; +} + +/* Release everything cl_data owns and free cl_data itself. Caller has + * already removed cl_data from _ze_cls (single-cl: _cl_find_and_del; + * per-ctx sweep: HASH_DEL inside the iter). When ctx is dying we just + * recycle wrapper structs (the L0 event/pool will be destroyed in + * _on_destroy_context step 3) and skip zeMemFree on the slab (the + * driver reclaims, and zeMemFree on a doomed ctx is racy); no slot can + * outlive the ctx, so no detach is needed. When the ctx is alive a slot + * may still be referenced cross-cl, so we reclaim per-chunk (detaching + * referenced chunks) just like reset. */ +static void _cl_data_destroy(struct _ze_command_list_obj_data *cl_data, int ctx_dying) { + struct _ze_slab_chunk *c, *tmp; + /* Unlink from the in-flight indexes before the struct is freed, or a later + * queue/fence sync would walk a dangling cl. (When ctx_dying the whole index + * is torn down separately, but unlinking here is still correct and cheap.) 
*/ + _cl_index_clear(cl_data); + if (!ctx_dying) { + DL_FOREACH_SAFE (cl_data->chunks, c, tmp) + _cl_chunk_reclaim(cl_data, c); + free(cl_data); + return; } - pthread_mutex_unlock(&_ze_events_mutex); - pthread_mutex_lock(&_ze_event_pools_mutex); - struct _ze_event_pool_entry *pool = NULL; - HASH_FIND_PTR(_ze_event_pools, &context, pool); - if (pool) { - HASH_DEL(_ze_event_pools, pool); - struct _ze_event_h *elt = NULL, *tmp = NULL; - DL_FOREACH_SAFE(pool->events, elt, tmp) { - DL_DELETE(pool->events, elt); - /* Wrapper is in the free list — its event was already dumped+reset - * by whoever recycled it. Don't dump again, just tear down. */ - _dispose_event_wrapper(elt, 0); - } - free(pool); + DL_FOREACH_SAFE (cl_data->chunks, c, tmp) { + for (uint32_t i = 0; i < c->n_used; ++i) + _slot_dispose_resources(&c->slots[i], _ZE_DISPOSE_WRAPPER); + _cl_chunk_free(cl_data, c, /*free_slab=*/0); } - pthread_mutex_unlock(&_ze_event_pools_mutex); + free(cl_data); +} + +/* zeCommandListDestroy epilogue. Per L0 spec the device is no longer + * referencing the cl, so we don't drain — just release our state. + * Regular cls recycle inj here (cl body is about to die anyway); + * immediate cls' slots are typically already released at drain. */ +static void _on_destroy_command_list(ze_command_list_handle_t command_list) { + pthread_mutex_lock(&_ze_state_mutex); + struct _ze_command_list_obj_data *cl_data = _cl_find_and_del(command_list); + if (cl_data) + _cl_data_destroy(cl_data, /*ctx_dying=*/0); + pthread_mutex_unlock(&_ze_state_mutex); } +/* zeCommandListReset epilogue. The L0 spec requires the user to have + * synchronized before Reset, so our slots are drained — but for a REGULAR cl + * "drained" is not "reclaimed": _slot_release is a no-op for regular cls + * (their inj is baked into the cl body, kept for reuse across Executes), so + * the slots linger. 
Reset wipes that body, so we must reclaim now; otherwise + * the stale slots are re-published on the next Execute (massive over-count) + * and their chunks accumulate (leak). We drain defensively first in case the + * user under-synced, then reclaim. The cl stays registered, empty for reuse. */ static void _on_reset_command_list(ze_command_list_handle_t command_list) { - struct _ze_command_list_obj_data *cl_data = NULL; + pthread_mutex_lock(&_ze_state_mutex); + struct _ze_command_list_obj_data *cl_data = _cl_find(command_list); + if (cl_data) { + _cl_drain(cl_data); + _cl_data_reset(cl_data); + } + pthread_mutex_unlock(&_ze_state_mutex); +} - FIND_AND_DEL_ZE_CL(&command_list, cl_data); - if (!cl_data) { - THAPI_DBGLOG("Could not get command list: %p", command_list); - return; +/* zeContextDestroy prologue. Three sweeps to drop our own L0 objects + * that live inside this ctx; the user's own cls/events are their + * responsibility per the L0 contract. */ +static void _on_destroy_context(ze_context_handle_t hContext) { + /* 1) Drop cls bound to this ctx. */ + pthread_mutex_lock(&_ze_state_mutex); + struct _ze_command_list_obj_data *cl_data = NULL, *cl_tmp = NULL; + HASH_ITER (hh, _ze_cls, cl_data, cl_tmp) { + if (cl_data->cached_context != hContext) + continue; + HASH_DEL(_ze_cls, cl_data); + _cl_data_destroy(cl_data, /*ctx_dying=*/1); + } + + /* 2) Shadow cls keyed by (ctx, device). */ + struct _ze_shadow_cl *sh = NULL, *sh_tmp = NULL; + HASH_ITER (hh, _ze_shadow_cls, sh, sh_tmp) { + if (sh->key.context != hContext) + continue; + HASH_DEL(_ze_shadow_cls, sh); + if (sh->cl) + ZE_COMMAND_LIST_DESTROY_PTR(sh->cl); + free(sh); } - struct _ze_event_h *elt = NULL, *tmp = NULL; - DL_FOREACH_SAFE(cl_data->events, elt, tmp) { - DL_DELETE(cl_data->events, elt); - _unregister_ze_event(elt, cl_data->flags & _ZE_EXECUTED); + + /* 3) Per-ctx event pool freelist. 
*/ + struct _ze_event_pool_entry *pe = NULL; + HASH_FIND_PTR(_ze_event_pools, &hContext, pe); + if (pe) { + HASH_DEL(_ze_event_pools, pe); + struct _ze_event_h *w, *w_tmp; + DL_FOREACH_SAFE (pe->events, w, w_tmp) { + if (w->event) + ZE_EVENT_DESTROY_PTR(w->event); + if (w->event_pool) + ZE_EVENT_POOL_DESTROY_PTR(w->event_pool); + DL_DELETE(pe->events, w); + _put_ze_event_wrapper(w); + } + free(pe); } - cl_data->flags &= ~_ZE_EXECUTED; - ADD_ZE_CL(cl_data); + pthread_mutex_unlock(&_ze_state_mutex); } -static void _on_execute_command_lists(uint32_t numCommandLists, - ze_command_list_handle_t *phCommandLists) { - for (uint32_t i = 0; i < numCommandLists; i++) { - struct _ze_command_list_obj_data *cl_data = NULL; - FIND_AND_DEL_ZE_CL(phCommandLists + i, cl_data); +/* The four user sync APIs all reduce to "drain the slots the synced anchor + * covers". They differ only in how the anchor selects work: + * + * _ZE_SYNC_CL zeCommandListHostSynchronize -> the one named cl + * _ZE_SYNC_QUEUE zeCommandQueueSynchronize -> every cl with in_flight_q == h + * _ZE_SYNC_FENCE zeFenceHostSynchronize -> every cl with in_flight_fence == h + * _ZE_SYNC_EVENT zeEventHostSynchronize -> the slot that last signaled h, + * walking its pred edges + * + * QUEUE/FENCE share one rule: a queue/fence wait completes exactly the cls a + * given Execute submitted, identified by the handle stamped on the cl at + * Execute. CL/EVENT name their target directly. After draining, a fully-drained + * immediate cl is raw-Reset to cap the driver's per-QKT storage leak + * (_imm_reset_if_drained); for the cl/queue/fence anchors _cl_drain already + * cleared in_flight_*, while the event anchor may leave live siblings, so it + * clears in_flight_* only once the cl has no slot left in flight. 
*/ +enum _ze_sync_kind { _ZE_SYNC_CL, _ZE_SYNC_QUEUE, _ZE_SYNC_FENCE, _ZE_SYNC_EVENT }; +static void _on_sync(enum _ze_sync_kind kind, void *h) { + pthread_mutex_lock(&_ze_state_mutex); + if (kind == _ZE_SYNC_EVENT) { + struct _ze_slot *s = _event_latest_get((ze_event_handle_t)h); + if (s && s->owner) { + _slot_drain(s); + if (!_cl_any_live(s->owner)) { + _cl_index_clear(s->owner); + s->owner->in_flight_q = NULL; + s->owner->in_flight_fence = NULL; + _imm_reset_if_drained(s->owner); + } + } + } else if (kind == _ZE_SYNC_CL) { + struct _ze_command_list_obj_data *cl_data = _cl_find((ze_command_list_handle_t)h); if (cl_data) { - /* dump events if they were executed */ - if (cl_data->flags & _ZE_EXECUTED) { - struct _ze_event_h *elt = NULL; - DL_FOREACH(cl_data->events, elt) { _dump_and_reset_our_event(elt->event); } - } else - cl_data->flags |= _ZE_EXECUTED; - ADD_ZE_CL(cl_data); - } else - THAPI_DBGLOG("Could not get command list: %p", phCommandLists[i]); + _cl_drain(cl_data); + _imm_reset_if_drained(cl_data); + } + } else { /* _ZE_SYNC_QUEUE / _ZE_SYNC_FENCE: drain just the indexed cls */ + struct _ze_inflight_bucket *b = NULL; + if (kind == _ZE_SYNC_QUEUE) + HASH_FIND_PTR(_ze_q_index, &h, b); + else + HASH_FIND_PTR(_ze_fence_index, &h, b); + if (b) { + struct _ze_command_list_obj_data *cl_data = NULL, *tmp = NULL; + /* SAFE2 because _cl_drain -> _cl_index_clear unlinks cl_data from this + * very bucket (and may free the bucket on the last unlink). */ + if (kind == _ZE_SYNC_QUEUE) { + DL_FOREACH_SAFE2 (b->cls, cl_data, tmp, q_next) + _cl_drain(cl_data); + } else { + DL_FOREACH_SAFE2 (b->cls, cl_data, tmp, f_next) + _cl_drain(cl_data); + } + } } + pthread_mutex_unlock(&_ze_state_mutex); } -static void _on_destroy_command_list(ze_command_list_handle_t command_list) { - struct _ze_command_list_obj_data *cl_data = NULL; +/* zeEventQueryKernelTimestamp epilogue. 
If we drained a kernel result for + * this user event, overwrite *dstptr with it: the user's event carries the + * QKT/barrier op timing (we swapped their signal for inj at Append), but the + * caller wants the KERNEL timing, which we stashed at drain. Returns 1 if it + * served a stashed result. */ +static int _on_query_kernel_timestamp(ze_event_handle_t hEvent, + ze_kernel_timestamp_result_t *dstptr) { + if (!hEvent || !dstptr) + return 0; + pthread_mutex_lock(&_ze_state_mutex); + int found = _event_kts_get(hEvent, dstptr); + pthread_mutex_unlock(&_ze_state_mutex); + return found; +} + +/* zeEventDestroy epilogue (success only). The per-event state entry is keyed by + * the event's HANDLE ADDRESS, which the L0 driver recycles: a fresh event + * created after this one is destroyed can land on the same address. Without + * eviction the new event inherits the dead one's entry — + * .kts: a never-signaled event's zeEventQueryKernelTimestamp would be + * served the prior event's stale timing; + * .latest: a wait on the reused address would resolve to a freed slot, a + * use-after-free in the pred walk. + * Evicting the entry at destroy bounds the map to live events and closes the + * recycled-address reads. Gated on a successful destroy by the caller: a failed + * destroy leaves the event (and its address) alive, so its data stays. */ +static void _on_destroy_event(ze_event_handle_t hEvent) { + pthread_mutex_lock(&_ze_state_mutex); + _event_state_del(hEvent); + pthread_mutex_unlock(&_ze_state_mutex); +} - FIND_AND_DEL_ZE_CL(&command_list, cl_data); +/* Execute-epilogue handler for ONE cl. Runs AFTER L0 Execute returned, + * with the user cl in flight. Three phases: + * + * 1) If in_flight_q is set (prior Execute by another thread), + * force-sync that queue and drain before we overwrite it. + * Regression test: inorder_reg_Event_multithreaded_01. 
+ * 2) Publish each not-yet-live slot (_slot_publish): shadow-path slots + * Append a fresh Query on the per-(ctx,device) shadow cl, then every + * slot is instantiated into the dep graph. The Append must run AFTER + * L0 Execute — appending earlier deadlocks if the shadow shares an + * engine with the user cl (tests/bugs/query_on_separate_cl_regular_user_cl). + * Inline-path cls bake the QKT into the cl body at Append, so their + * publish is instantiate-only. + * 3) Stamp in_flight_q = hQueue and in_flight_fence = hFence (the fence + * the user passed to this Execute, or NULL). */ +static void _on_execute_one_cl(ze_command_queue_handle_t hQueue, + ze_fence_handle_t hFence, + ze_command_list_handle_t command_list) { + pthread_mutex_lock(&_ze_state_mutex); + struct _ze_command_list_obj_data *cl_data = _cl_find(command_list); if (!cl_data) { - THAPI_DBGLOG("Could not get command list: %p", command_list); + pthread_mutex_unlock(&_ze_state_mutex); return; } - if (_do_profile) { - struct _ze_event_h *elt = NULL, *tmp = NULL; - DL_FOREACH_SAFE(cl_data->events, elt, tmp) { - DL_DELETE(cl_data->events, elt); - _unregister_ze_event(elt, cl_data->flags & _ZE_EXECUTED); + + if (cl_data->in_flight_q) { + _ZE_MUST(ZE_COMMAND_QUEUE_SYNCHRONIZE_PTR(cl_data->in_flight_q, UINT64_MAX)); + _cl_drain(cl_data); + } + /* Shadow cl is resolved lazily on first shadow-path slot. Inline-only cls + * never trigger the lookup. */ + struct _ze_shadow_cl *sh = NULL; + int sh_resolved = 0; + struct _ze_slab_chunk *c; + DL_FOREACH (cl_data->chunks, c) { + for (uint32_t j = 0; j < c->n_used; ++j) { + struct _ze_slot *slot = &c->slots[j]; + if (!slot->inj) + continue; + /* Already-live slots have nothing left to do this Execute: their + * dep-graph entry from Append-time _slot_instantiate is still valid, + * and (inline path) their QKT is baked into the cl body and re-fires + * automatically. Only fresh / drained slots need work here. 
*/ + if (slot->live) + continue; + if (slot->shadow_done && !sh_resolved) { + /* cached_context was published by _universal_record_append before any + * shadow_done slot could exist, so it's always set here — no need + * for an L0 round-trip to recover it. */ + ze_context_handle_t ctx = cl_data->cached_context; + ze_device_handle_t dev = NULL; + _ZE_MUST(ZE_COMMAND_LIST_GET_DEVICE_HANDLE_PTR(command_list, &dev)); + sh = ctx ? _get_shadow_cl(ctx, dev) : NULL; + sh_resolved = 1; + } + if (slot->shadow_done && !sh) + continue; + _slot_publish(cl_data, slot, sh); } } - free(cl_data); + cl_data->in_flight_q = hQueue; + cl_data->in_flight_fence = hFence; + /* Index this cl under its queue (and fence) so a later queue/fence sync + * drains it without scanning every live cl. The force-sync+drain above + * already unlinked any prior in-flight membership, so no double-link. */ + _cl_index_set(cl_data, hQueue, hFence); + + pthread_mutex_unlock(&_ze_state_mutex); +} + +static void _on_execute_command_lists_epilogue(ze_command_queue_handle_t hQueue, + ze_fence_handle_t hFence, + uint32_t numCommandLists, + ze_command_list_handle_t *phCommandLists) { + for (uint32_t i = 0; i < numCommandLists; ++i) + _on_execute_one_cl(hQueue, hFence, phCommandLists[i]); } +/* ======================================================================== + * Property/info dumping + tracer init + * + * Separate concern from the slot/drain engine above: read device/driver/ + * kernel/memory properties and emit the lttng_ust_ze_properties / _build + * tracepoints, plus one-time loader/symbol init. Self-contained — the + * engine never calls into this section, and the only external callers are + * ze_model.rb hooks (_do_state, _dump_memory_info, + * _dump_command_list_device_timer, _in_loader_init) and gen_ze.rb + * (_init_tracer / _init_tracer_dump). 
+ * ======================================================================== */ + static pthread_once_t _init = PTHREAD_ONCE_INIT; static __thread volatile int _in_init = 0; static volatile unsigned int _in_loader_init = 0; @@ -524,13 +1572,6 @@ static inline int _do_state() { tracepoint_enabled(lttng_ust_ze_properties, memory_info_range); } -static void THAPI_ATTRIBUTE_DESTRUCTOR _lib_cleanup() { - if (_do_cleanup) { - if (_do_profile) - _event_cleanup(); - } -} - static void _dump_driver_subdevice_properties(ze_driver_handle_t hDriver, ze_device_handle_t hDevice) { if (!tracepoint_enabled(lttng_ust_ze_properties, subdevice)) @@ -666,24 +1707,6 @@ static inline void _dump_memory_info(ze_command_list_handle_t hCommandList, cons _dump_memory_info_ctx(hContext, ptr); } -//////////////////////////////////////////// -#define _ZE_ERROR_MSG(NAME, RES) \ - do { \ - fprintf(stderr, "%s() failed at %d(%s): res=%x\n", (NAME), __LINE__, __FILE__, (RES)); \ - } while (0) -#define _ZE_ERROR_MSG_NOTERMINATE(NAME, RES) \ - do { \ - fprintf(stderr, "%s() error at %d(%s): res=%x\n", (NAME), __LINE__, __FILE__, (RES)); \ - } while (0) -#define _ERROR_MSG(MSG) \ - { \ - perror((MSG)) do { \ - { \ - perror((MSG)); \ - fprintf(stderr, "errno=%d at %d(%s)", errno, __LINE__, __FILE__); \ - } \ - while (0) - static void _load_tracer(void) { char *s = NULL; void *handle = NULL; @@ -736,12 +1759,6 @@ static void _load_tracer(void) { s = getenv("LTTNG_UST_ZE_PARANOID_MEMORY_LOCATION"); if (s) _do_paranoid_memory_location = 1; - - _do_cleanup = 1; - -#ifndef THAPI_USE_DESTRUCTORS - atexit(_lib_cleanup); -#endif } static void _load_tracer_dump(void) { diff --git a/backends/ze/ze_model.rb b/backends/ze/ze_model.rb index ec664445a..7341d253b 100644 --- a/backends/ze/ze_model.rb +++ b/backends/ze/ze_model.rb @@ -139,39 +139,107 @@ def upper_snake_case(str) register_epilogue 'zeCommandListCreate', <flags & ZE_COMMAND_LIST_FLAG_IN_ORDER) ? 
1 : 0; + _on_create_command_list(*phCommandList, hDevice, desc->commandQueueGroupOrdinal, + /*immediate=*/0, _io); } } EOF register_epilogue 'zeCommandListCreateImmediate', <flags & ZE_COMMAND_QUEUE_FLAG_IN_ORDER) ? 1 : 0; + _on_create_command_list(*phCommandList, hDevice, altdesc->ordinal, + /*immediate=*/1, _io); } } EOF +# Reset hook: the L0 spec +# (https://oneapi-src.github.io/level-zero-spec/level-zero/latest/core/api.html#zecommandlistreset) +# says the user must have synchronized first, so our slots are drained — but +# for a REGULAR cl "drained" is not "reclaimed" (_slot_release is a no-op for +# regular cls; their inj is baked into the cl body for reuse across Executes). +# Reset wipes that body, so we reclaim the slots/chunks/events now. Without it +# the stale slots are re-published on the next Execute (over-count) and chunks +# leak. The cl stays registered, empty for reuse. register_epilogue 'zeCommandListReset', < 0) { - _on_execute_command_lists(numCommandLists, phCommandLists); - } - } + if (_do_profile && _retval == ZE_RESULT_SUCCESS && numCommandLists > 0 && phCommandLists) + _on_execute_command_lists_epilogue(hCommandQueue, hFence, numCommandLists, phCommandLists); +EOF + +# Sync hooks: walk dependency edges from the synced anchor and drain +# everything reachable. Each sync API has a different anchor. +register_epilogue 'zeCommandQueueSynchronize', < our injected event. +# epilogue: on success, call _universal_record_append which inserts +# a QueryKernelTimestamps(wait=inj, signal=user_sig) into +# the cmdlist and records the slot for drain. +# The event_profiling tracepoint is attributed to the +# user's original signal (or inj when user passed NULL). +# on sync (queue/event/fence/cl-host): drain the slabs. 
profiling_prologue = lambda { |event_name| <event; + /* Fetched once per profiled Append and threaded to both + * _get_profiling_event (prologue) and _universal_record_append (epilogue) + * so the tracer issues exactly one zeCommandListGetContextHandle per + * Append instead of three. */ + ze_context_handle_t _ctx = NULL; + if (_do_profile) { + if (ZE_COMMAND_LIST_GET_CONTEXT_HANDLE_PTR(hCommandList, &_ctx) == ZE_RESULT_SUCCESS && _ctx) { + pthread_mutex_lock(&_ze_state_mutex); + _ewrapper = _get_profiling_event(_ctx); + pthread_mutex_unlock(&_ze_state_mutex); + if (_ewrapper) + #{event_name} = _ewrapper->event; + } + /* If injection failed, fall through with the user's signal unchanged; + * we won't be able to time this Append, but it still runs. */ } EOF } -profiling_epilogue = lambda { |event_name| +profiling_epilogue = lambda { |_event_name, waits_expr = 'phWaitEvents', n_waits_expr = 'numWaitEvents'| <event; + _universal_record_append(hCommandList, _ctx, _ewrapper, _user_signal, + #{waits_expr}, #{n_waits_expr}); + tracepoint(lttng_ust_ze_profiling, event_profiling, _attr); + } else { + _put_ze_event(_ewrapper); + } } EOF } @@ -319,7 +400,7 @@ def upper_snake_case(str) ['zeCommandListAppendSignalEvent'].each do |c| register_prologue c, profiling_prologue.call('hEvent') - register_epilogue c, profiling_epilogue.call('hEvent') + register_epilogue c, profiling_epilogue.call('hEvent', 'NULL', '0') end # WARNING diff --git a/utils/thapi_log_to_bt_source_component.rb b/utils/thapi_log_to_bt_source_component.rb index 3e16753f7..f27fe4120 100755 --- a/utils/thapi_log_to_bt_source_component.rb +++ b/utils/thapi_log_to_bt_source_component.rb @@ -146,7 +146,10 @@ def parse_event(model, line, exclude_fields) def parse_log(model, input_path, exclude_fields) File.open(input_path, 'r') do |file| - file.each_line.map do |line| + file.each_line.filter_map do |line| + stripped = line.strip + next if stripped.empty? 
|| stripped.start_with?('#') + parse_event(model, line, exclude_fields) end end