diff --git a/backends/mpi/Makefile.am b/backends/mpi/Makefile.am
index f3d0ea28..681bed98 100644
--- a/backends/mpi/Makefile.am
+++ b/backends/mpi/Makefile.am
@@ -189,8 +189,23 @@ EXTRA_DIST += \
 	$(top_srcdir)/xprof/btx_interval_model.yaml \
 	btx_mpimatching_model.yaml
 
-$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_model.yaml
-	$(METABABEL) -u btx_mpi_model.yaml -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include
+# The MPI interval filter needs to receive Level Zero allocation/free
+# events in addition to MPI events so that btx_mpiinterval_callbacks.cpp
+# can tag GPU-aware MPI calls via address-range lookup. metababel's bare
+# `-u model_a,model_b` union fails when both backend models declare the
+# same environment field (hostname), so we generate a merged upstream
+# model that copies only the ze events we need into MPI's stream class.
+# This adds a build-order dependency mpi -> ze.
+BTX_ZE_MODEL = $(top_builddir)/backends/ze/btx_ze_model.yaml
+
+btx_mpi_with_ze_model.yaml: $(srcdir)/gen_btx_mpi_with_ze_model.rb btx_mpi_model.yaml $(BTX_ZE_MODEL)
+	$(RUBY) $(srcdir)/gen_btx_mpi_with_ze_model.rb btx_mpi_model.yaml $(BTX_ZE_MODEL) > $@
+
+CLEANFILES += btx_mpi_with_ze_model.yaml
+EXTRA_DIST += gen_btx_mpi_with_ze_model.rb
+
+$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_with_ze_model.yaml
+	$(METABABEL) -u btx_mpi_with_ze_model.yaml -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include
 
 CLEANFILES += \
 	$(BTX_MPI_GENERATED) \
diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp
index 65f1cedd..38d39caf 100644
--- a/backends/mpi/btx_mpiinterval_callbacks.cpp
+++ b/backends/mpi/btx_mpiinterval_callbacks.cpp
@@ -1,17 +1,69 @@
 #include "xprof_utils.hpp"
 #include <iostream>
+#include <map>
 #include <metababel/metababel.h>
 #include <mpi.h.include>
 #include <sstream>
 #include <string>
 #include <tuple>
+#include <unordered_set>
+
+// ---------------------------------------------------------------------------
+// GPU-aware MPI tagging
+//
+// We maintain a per-process address-range map populated from
+// lttng_ust_ze:zeMemAlloc*_exit / zeMemFree_entry events emitted by the ze
+// backend (joined into our upstream model via backends/mpi/Makefile.am).
+// Each MPI buffer-bearing entry event looks up its buffer pointer in that
+// map; if the pointer falls inside a known GPU allocation, the
+// {hostname,vpid,vtid} key is recorded in gpu_aware_calls. The matching
+// exit callback consumes the key and prepends '*' to the MPI call name
+// before pushing it to the aggregator, producing a separate '*MPI_Foo'
+// row in the tally.
+//
+// When the user enables only --backend mpi, no ze events flow in, the
+// range-map stays empty, and no '*' ever appears (CPU-only profile).
+// ---------------------------------------------------------------------------
+
+// Per-process address-range map: base -> (base + size).
+typedef std::map<uintptr_t, uintptr_t> memory_interval_t;
 
 struct data_s {
   EntryState entry_state;
+  // GPU allocation ranges, keyed on {hostname, vpid}. Mirrors the
+  // rangeset_memory_{host,device,shared} pattern in
+  // backends/ze/btx_zeinterval_callbacks.cpp.
+  std::unordered_map<hp_t, memory_interval_t> rangeset_memory_device;
+  std::unordered_map<hp_t, memory_interval_t> rangeset_memory_shared;
+  // {hostname, vpid, vtid} of MPI calls observed to carry at least one
+  // device/shared GPU buffer. Populated in mpi_*_buf_entry_callback,
+  // consumed (and cleared) in send_host_message.
+  std::unordered_set<hpt_t> gpu_aware_calls;
 };
 
 typedef struct data_s data_t;
 
+// True iff `ptr` lies inside any allocation tracked in `m`.
+static bool memory_includes(const memory_interval_t &m, uintptr_t ptr) {
+  const auto it = m.upper_bound(ptr);
+  if (it == m.cbegin())
+    return false;
+  return ptr < std::prev(it)->second;
+}
+
+// True iff `ptr` lies inside a tracked GPU (device or shared) allocation
+// for the given {hostname, vpid}. Host USM is intentionally NOT tagged
+// as GPU-aware here (it behaves like host memory from MPI's perspective).
+static bool is_gpu_pointer(const data_t *data, const hp_t &hp, uintptr_t ptr) {
+  auto itd = data->rangeset_memory_device.find(hp);
+  if (itd != data->rangeset_memory_device.end() && memory_includes(itd->second, ptr))
+    return true;
+  auto its = data->rangeset_memory_shared.find(hp);
+  if (its != data->rangeset_memory_shared.end() && memory_includes(its->second, ptr))
+    return true;
+  return false;
+}
+
 static void send_host_message(void *btx_handle,
                               void *usr_data,
                               int64_t ts,
@@ -21,9 +73,16 @@ static void send_host_message(void *btx_handle,
                               uint64_t vtid,
                               bool err) {
 
+  auto *data = static_cast<data_t *>(usr_data);
+  const hpt_t hpt{hostname, vpid, vtid};
+
   std::string event_class_name_striped = strip_event_class_name_exit(event_class_name);
-  const int64_t entry_ts =
-      static_cast<data_t *>(usr_data)->entry_state.get_ts({hostname, vpid, vtid});
+  // GPU-aware MPI calls get a leading '*' so they sort and aggregate as a
+  // distinct row from their CPU-only counterparts in the tally output.
+  if (data->gpu_aware_calls.erase(hpt) > 0)
+    event_class_name_striped.insert(0, 1, '*');
+
+  const int64_t entry_ts = data->entry_state.get_ts(hpt);
 
   btx_push_message_lttng_host(btx_handle, hostname, vpid, vtid, entry_ts, BACKEND_MPI,
                               event_class_name_striped.c_str(), (ts - entry_ts), err);
@@ -198,6 +257,166 @@ static void traffic_int_entry_callback(void *btx_handle,
                                  count * size, oss.str().c_str());
 }
 
+// --- ze allocation / free tracking ----------------------------------------
+// Bind allocation size at entry; the matching exit callback uses it together
+// with the freshly-returned pointer to populate the range-map. The
+// event_class_name suffix (Device vs Shared vs Host) selects which rangeset
+// the allocation lands in.
+static void ze_alloc_entry_callback(void *btx_handle,
+                                    void *usr_data,
+                                    int64_t ts,
+                                    const char *event_class_name,
+                                    const char *hostname,
+                                    int64_t vpid,
+                                    uint64_t vtid,
+                                    size_t size) {
+  static_cast<data_t *>(usr_data)->entry_state.set_data({hostname, vpid, vtid}, size);
+}
+
+static void ze_alloc_exit_callback(void *btx_handle,
+                                   void *usr_data,
+                                   int64_t ts,
+                                   const char *event_class_name,
+                                   const char *hostname,
+                                   int64_t vpid,
+                                   uint64_t vtid,
+                                   void *pptr_val) {
+  auto *data = static_cast<data_t *>(usr_data);
+  auto size = data->entry_state.get_data<size_t>({hostname, vpid, vtid});
+  // We don't see zeResult in the matching model to avoid pulling in
+  // ze_api.h; a failed allocation has pptr_val == NULL.
+  if (pptr_val == nullptr)
+    return;
+
+  const std::string ev{event_class_name};
+  // Host USM is reachable from the host CPU so MPI sees it as a normal host
+  // pointer; classifying it as GPU-aware would produce false positives.
+  // Device and Shared (managed) USM live (at least partly) on the device,
+  // so MPI calls on them are GPU-aware.
+  std::unordered_map<hp_t, memory_interval_t> *mi = nullptr;
+  if (ev.find("zeMemAllocDevice") != std::string::npos)
+    mi = &data->rangeset_memory_device;
+  else if (ev.find("zeMemAllocShared") != std::string::npos)
+    mi = &data->rangeset_memory_shared;
+  else
+    return; // host USM, ignored
+
+  const uintptr_t base = reinterpret_cast<uintptr_t>(pptr_val);
+  (*mi)[{hostname, vpid}][base] = base + size;
+}
+
+static void ze_free_entry_callback(void *btx_handle,
+                                   void *usr_data,
+                                   int64_t ts,
+                                   const char *event_class_name,
+                                   const char *hostname,
+                                   int64_t vpid,
+                                   uint64_t vtid,
+                                   void *ptr) {
+  auto *data = static_cast<data_t *>(usr_data);
+  if (ptr == nullptr)
+    return;
+  const uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
+  const hp_t hp{hostname, vpid};
+  // The freed pointer may have been classified as device OR shared at
+  // allocation; try both.
+  auto itd = data->rangeset_memory_device.find(hp);
+  if (itd != data->rangeset_memory_device.end())
+    itd->second.erase(base);
+  auto its = data->rangeset_memory_shared.find(hp);
+  if (its != data->rangeset_memory_shared.end())
+    its->second.erase(base);
+}
+
+// --- MPI buffer classification --------------------------------------------
+static void
+tag_if_gpu(data_t *data, const char *hostname, int64_t vpid, uint64_t vtid, const void *ptr) {
+  if (ptr == nullptr)
+    return;
+  // MPICH sentinels: MPI_IN_PLACE / MPI_BOTTOM. These would never match a
+  // real allocation but checking explicitly keeps the intent clear.
+  if (ptr == MPI_IN_PLACE || ptr == MPI_BOTTOM)
+    return;
+  if (is_gpu_pointer(data, {hostname, vpid}, reinterpret_cast<uintptr_t>(ptr)))
+    data->gpu_aware_calls.insert({hostname, vpid, vtid});
+}
+
+// metababel strips qualifiers when generating callback typedefs, so buffer
+// parameters declared `const void *` in the matching model become `void *`
+// in the generated mpi_*_entry_callback_f typedefs. Match exactly.
+static void mpi_1buf_entry_callback(void *btx_handle,
+                                    void *usr_data,
+                                    int64_t ts,
+                                    const char *event_class_name,
+                                    const char *hostname,
+                                    int64_t vpid,
+                                    uint64_t vtid,
+                                    void *buf) {
+  tag_if_gpu(static_cast<data_t *>(usr_data), hostname, vpid, vtid, buf);
+}
+
+// MPI_Sendrecv, MPI_Allreduce, MPI_Reduce, MPI_Gather, ... — the bulk.
+static void mpi_sendrecv_entry_callback(void *btx_handle,
+                                        void *usr_data,
+                                        int64_t ts,
+                                        const char *event_class_name,
+                                        const char *hostname,
+                                        int64_t vpid,
+                                        uint64_t vtid,
+                                        void *sendbuf,
+                                        void *recvbuf) {
+  auto *data = static_cast<data_t *>(usr_data);
+  tag_if_gpu(data, hostname, vpid, vtid, sendbuf);
+  tag_if_gpu(data, hostname, vpid, vtid, recvbuf);
+}
+
+// MPI_Reduce_local(inbuf, inoutbuf, ...)
+static void mpi_reduce_local_entry_callback(void *btx_handle,
+                                            void *usr_data,
+                                            int64_t ts,
+                                            const char *event_class_name,
+                                            const char *hostname,
+                                            int64_t vpid,
+                                            uint64_t vtid,
+                                            void *inbuf,
+                                            void *inoutbuf) {
+  auto *data = static_cast<data_t *>(usr_data);
+  tag_if_gpu(data, hostname, vpid, vtid, inbuf);
+  tag_if_gpu(data, hostname, vpid, vtid, inoutbuf);
+}
+
+// MPI_Get_accumulate, MPI_Fetch_and_op (origin_addr + result_addr).
+static void mpi_rma_fetch_entry_callback(void *btx_handle,
+                                         void *usr_data,
+                                         int64_t ts,
+                                         const char *event_class_name,
+                                         const char *hostname,
+                                         int64_t vpid,
+                                         uint64_t vtid,
+                                         void *origin_addr,
+                                         void *result_addr) {
+  auto *data = static_cast<data_t *>(usr_data);
+  tag_if_gpu(data, hostname, vpid, vtid, origin_addr);
+  tag_if_gpu(data, hostname, vpid, vtid, result_addr);
+}
+
+// MPI_Compare_and_swap(origin_addr, compare_addr, result_addr, ...)
+static void mpi_cas_entry_callback(void *btx_handle,
+                                   void *usr_data,
+                                   int64_t ts,
+                                   const char *event_class_name,
+                                   const char *hostname,
+                                   int64_t vpid,
+                                   uint64_t vtid,
+                                   void *origin_addr,
+                                   void *compare_addr,
+                                   void *result_addr) {
+  auto *data = static_cast<data_t *>(usr_data);
+  tag_if_gpu(data, hostname, vpid, vtid, origin_addr);
+  tag_if_gpu(data, hostname, vpid, vtid, compare_addr);
+  tag_if_gpu(data, hostname, vpid, vtid, result_addr);
+}
+
 void btx_register_usr_callbacks(void *btx_handle) {
   btx_register_callbacks_initialize_component(btx_handle, &btx_initialize_component);
   btx_register_callbacks_finalize_component(btx_handle, &btx_finalize_component);
@@ -209,4 +428,16 @@ void btx_register_usr_callbacks(void *btx_handle) {
   btx_register_callbacks_traffic_int_entry(btx_handle, &traffic_int_entry_callback);
 
   btx_register_callbacks_lttng_ust_mpi_type_property(btx_handle, &type_property_callback);
+
+  // GPU-aware MPI: ze alloc/free updates the range-map, MPI buffer entries
+  // look up against it. One callback per MPI buffer-shape (see the matching
+  // model for why we cannot lump them into a single regex).
+  btx_register_callbacks_ze_alloc_entry(btx_handle, &ze_alloc_entry_callback);
+  btx_register_callbacks_ze_alloc_exit(btx_handle, &ze_alloc_exit_callback);
+  btx_register_callbacks_ze_free_entry(btx_handle, &ze_free_entry_callback);
+  btx_register_callbacks_mpi_1buf_entry(btx_handle, &mpi_1buf_entry_callback);
+  btx_register_callbacks_mpi_sendrecv_entry(btx_handle, &mpi_sendrecv_entry_callback);
+  btx_register_callbacks_mpi_reduce_local_entry(btx_handle, &mpi_reduce_local_entry_callback);
+  btx_register_callbacks_mpi_rma_fetch_entry(btx_handle, &mpi_rma_fetch_entry_callback);
+  btx_register_callbacks_mpi_cas_entry(btx_handle, &mpi_cas_entry_callback);
 }
diff --git a/backends/mpi/btx_mpimatching_model.yaml b/backends/mpi/btx_mpimatching_model.yaml
index 829bbedc..79826888 100644
--- a/backends/mpi/btx_mpimatching_model.yaml
+++ b/backends/mpi/btx_mpimatching_model.yaml
@@ -14,11 +14,14 @@
       :field_class:
         :type: integer_unsigned
   :event_classes:
+  # Restrict the bare entry/exit patterns to the MPI provider so that
+  # ze allocation/free events sourced from the ze backend (see below)
+  # do not accidentally match them.
   - :set_id: entries
-    :name: "_entry$"
+    :name: "^lttng_ust_mpi:.*_entry$"
   - :set_id: exits
     :register: false
-    :name: "_exit$"
+    :name: "^lttng_ust_mpi:.*_exit$"
   - :set_id: exits_mpiError_present
     :domain: exits
     :payload_field_class:
@@ -52,3 +55,114 @@
       - :name: ^(datatype|origin_count)$
         :field_class:
           :cast_type: MPI_Datatype
+  # ------------------------------------------------------------------
+  # GPU-aware MPI tagging
+  #
+  # Categories below feed btx_mpiinterval_callbacks.cpp's range-map
+  # lookup. The MPI buffer categories capture the payload pointer(s)
+  # of every buffer-bearing MPI call; the ze alloc/free categories
+  # maintain a {hostname,vpid} -> address-range map. An MPI call whose
+  # buffer falls in the range-map is tagged with a leading '*' in the
+  # tally output. This is only effective when the user enables both
+  # --backend mpi and --backend ze; with only --backend mpi the
+  # range-map stays empty and no '*' ever appears.
+  # ------------------------------------------------------------------
+  # MPI buffer categories. metababel requires each :name: regex to bind
+  # to EXACTLY ONE field per matched event, so we cannot lump all buffer
+  # names into a single regex slot: an event like MPI_Reduce_local has
+  # both `inbuf` and `inoutbuf`, and MPI_Compare_and_swap has three
+  # buffer fields, which would over-match. Define one category per
+  # distinct MPI buffer-shape, then a catch-all for single-buffer calls.
+  # The cast_type accepts both `const void *` (input buffers) and plain
+  # `void *` (output buffers) since MPI prototypes use both.
+  #
+  # 3-buffer: MPI_Compare_and_swap
+  - :set_id: mpi_cas_entry
+    :domain: entries
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^origin_addr$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+      - :name: ^compare_addr$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+      - :name: ^result_addr$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+  # 2-buffer Send/Recv pair (Sendrecv, Allreduce, Reduce, Gather, ...).
+  - :set_id: mpi_sendrecv_entry
+    :domain: entries - mpi_cas_entry
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^sendbuf$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+      - :name: ^recvbuf$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+  # 2-buffer Reduce_local pair (inbuf + inoutbuf).
+  - :set_id: mpi_reduce_local_entry
+    :domain: entries - mpi_cas_entry - mpi_sendrecv_entry
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^inbuf$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+      - :name: ^inoutbuf$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+  # 2-buffer RMA fetch pair (Get_accumulate, Fetch_and_op).
+  - :set_id: mpi_rma_fetch_entry
+    :domain: entries - mpi_cas_entry - mpi_sendrecv_entry - mpi_reduce_local_entry
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^origin_addr$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+      - :name: ^result_addr$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+  # 1-buffer everything else (Send/Recv/Bcast/Put/Get/Accumulate/...).
+  # After excluding the multi-buffer categories above, every remaining
+  # buffer-bearing MPI call carries exactly one of {buf, buffer,
+  # origin_addr}. Pack/Unpack (inbuf + outbuf) is intentionally NOT
+  # included: the buffers never traverse the network, so GPU-aware
+  # tagging is meaningless there.
+  - :set_id: mpi_1buf_entry
+    :domain: entries - mpi_cas_entry - mpi_sendrecv_entry - mpi_reduce_local_entry - mpi_rma_fetch_entry
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^(buf|buffer|origin_addr)$
+        :field_class:
+          :cast_type: ^(const )?void \*$
+  # Level Zero USM allocation / free events sourced from the ze
+  # backend (see backends/ze/btx_zematching_model.yaml for matching
+  # parents). We bind size at entry and the allocated pointer at exit,
+  # mirroring btx_zeinterval_callbacks.cpp:entries_alloc / exits_alloc.
+  - :set_id: ze_alloc_entry
+    :name: "^lttng_ust_ze:zeMemAlloc.*_entry$"
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^size$
+  # We deliberately omit zeResult here so that the consumer callback
+  # signature does not need to depend on ze_api.h. A failed allocation
+  # has pptr_val == NULL, which the callback already guards against.
+  - :set_id: ze_alloc_exit
+    :name: "^lttng_ust_ze:zeMemAlloc.*_exit$"
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^pptr_val$
+  - :set_id: ze_free_entry
+    :name: "^lttng_ust_ze:zeMemFree_entry$"
+    :payload_field_class:
+      :type: structure
+      :members:
+      - :name: ^ptr$
diff --git a/backends/mpi/gen_btx_mpi_with_ze_model.rb b/backends/mpi/gen_btx_mpi_with_ze_model.rb
new file mode 100644
index 00000000..e68530f0
--- /dev/null
+++ b/backends/mpi/gen_btx_mpi_with_ze_model.rb
@@ -0,0 +1,77 @@
+#!/usr/bin/env ruby
+# Generate a merged upstream YAML model for the MPI interval filter.
+#
+# btx_filter_mpi needs to receive Level Zero allocation/free events in
+# addition to MPI events so that btx_mpiinterval_callbacks.cpp can
+# maintain an address-range map and tag GPU-aware MPI calls. Two
+# constraints conspire to make this non-trivial:
+#
+#  1. metababel's bare `-u model_a,model_b` union fails because each
+#     backend model declares :environment: :entries:[{name: hostname}],
+#     causing the matching engine to see an ambiguous member.
+#
+#  2. The MPI bt2 plugin (libMPIInterval) is built with only mpi.h on
+#     the include path; pulling in the full ze event payloads would
+#     drag in <ze_api.h> types (ze_context_handle_t, ze_result_t,
+#     ze_device_mem_alloc_desc_t *, ...) that the MPI consumer cannot
+#     resolve.
+#
+# Side-step both by emitting a single merged model that:
+#   - starts from btx_mpi_model.yaml (single :environment:),
+#   - copies only the ze allocation/free event classes we need into
+#     MPI's stream class,
+#   - keeps only the payload fields the MPI consumer actually uses,
+#     so all C types referenced in the generated upstream typedefs
+#     remain primitive (size_t / void *) and need no ze headers.
+#
+# Usage: ruby gen_btx_mpi_with_ze_model.rb MPI_MODEL.yaml ZE_MODEL.yaml > OUT.yaml
+require 'yaml'
+
+mpi_path = ARGV[0]
+ze_path  = ARGV[1]
+raise 'usage: gen_btx_mpi_with_ze_model.rb MPI_MODEL ZE_MODEL' unless mpi_path && ze_path
+
+# (event name) -> (allow-list of payload field names the MPI consumer
+# needs to see). Fields not in the allow-list are dropped from the
+# copied event, keeping the generated upstream typedefs free of any
+# ze-specific types.
+ZE_EVENT_FIELDS = {
+  'lttng_ust_ze:zeMemAllocHost_entry'    => %w[size],
+  'lttng_ust_ze:zeMemAllocHost_exit'     => %w[pptr_val],
+  'lttng_ust_ze:zeMemAllocDevice_entry'  => %w[size],
+  'lttng_ust_ze:zeMemAllocDevice_exit'   => %w[pptr_val],
+  'lttng_ust_ze:zeMemAllocShared_entry'  => %w[size],
+  'lttng_ust_ze:zeMemAllocShared_exit'   => %w[pptr_val],
+  'lttng_ust_ze:zeMemFree_entry'         => %w[ptr],
+  'lttng_ust_ze:zeMemFree_exit'          => %w[],
+}.freeze
+
+mpi_model = YAML.load_file(mpi_path)
+ze_model  = YAML.load_file(ze_path)
+
+ze_events = ze_model.fetch(:stream_classes, []).flat_map do |sc|
+  sc.fetch(:event_classes, []).map do |ec|
+    wanted = ZE_EVENT_FIELDS[ec[:name]]
+    next nil unless wanted
+
+    # Deep-copy via Marshal so we can mutate freely without aliasing the
+    # original ze model.
+    ec = Marshal.load(Marshal.dump(ec))
+    if (pfc = ec[:payload_field_class]) && (members = pfc[:members])
+      pfc[:members] = members.select { |m| wanted.include?(m[:name].to_s) }
+      ec.delete(:payload_field_class) if pfc[:members].empty?
+    end
+    ec
+  end.compact
+end
+
+if ze_events.empty?
+  warn "WARNING: no ze allocation events found in #{ze_path}; " \
+       'GPU-aware MPI tagging will be inactive.'
+end
+
+# Inject the ze events into the MPI stream class. There is only one
+# stream class per backend model (see utils/gen_babeltrace_model_helper.rb).
+mpi_model[:stream_classes].first[:event_classes].concat(ze_events)
+
+puts mpi_model.to_yaml