diff --git a/backends/mpi/Makefile.am b/backends/mpi/Makefile.am index f3d0ea28..681bed98 100644 --- a/backends/mpi/Makefile.am +++ b/backends/mpi/Makefile.am @@ -189,8 +189,23 @@ EXTRA_DIST += \ $(top_srcdir)/xprof/btx_interval_model.yaml \ btx_mpimatching_model.yaml -$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_model.yaml - $(METABABEL) -u btx_mpi_model.yaml -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include +# The MPI interval filter needs to receive Level Zero allocation/free +# events in addition to MPI events so that btx_mpiinterval_callbacks.cpp +# can tag GPU-aware MPI calls via address-range lookup. metababel's bare +# `-u model_a,model_b` union fails when both backend models declare the +# same environment field (hostname), so we generate a merged upstream +# model that copies only the ze events we need into MPI's stream class. +# This adds a build-order dependency mpi -> ze. +BTX_ZE_MODEL = $(top_builddir)/backends/ze/btx_ze_model.yaml + +btx_mpi_with_ze_model.yaml: $(srcdir)/gen_btx_mpi_with_ze_model.rb btx_mpi_model.yaml $(BTX_ZE_MODEL) + $(RUBY) $(srcdir)/gen_btx_mpi_with_ze_model.rb btx_mpi_model.yaml $(BTX_ZE_MODEL) > $@ + +CLEANFILES += btx_mpi_with_ze_model.yaml +EXTRA_DIST += gen_btx_mpi_with_ze_model.rb + +$(BTX_MPI_GENERATED) &: $(top_srcdir)/xprof/btx_interval_model.yaml btx_mpimatching_model.yaml btx_mpi_with_ze_model.yaml + $(METABABEL) -u btx_mpi_with_ze_model.yaml -d $(top_srcdir)/xprof/btx_interval_model.yaml -t FILTER -o btx_filter_mpi -p mpiinterval -c interval --matching $(srcdir)/btx_mpimatching_model.yaml -i mpi.h.include CLEANFILES += \ $(BTX_MPI_GENERATED) \ diff --git a/backends/mpi/btx_mpiinterval_callbacks.cpp b/backends/mpi/btx_mpiinterval_callbacks.cpp index 65f1cedd..38d39caf 100644 --- a/backends/mpi/btx_mpiinterval_callbacks.cpp +++ b/backends/mpi/btx_mpiinterval_callbacks.cpp @@ -1,17 +1,69 @@ #include "xprof_utils.hpp" #include +#include #include #include #include #include #include +#include + +// --------------------------------------------------------------------------- +// GPU-aware MPI tagging +// +// We maintain a per-process address-range map populated from +// lttng_ust_ze:zeMemAlloc*_exit / zeMemFree_entry events emitted by the ze +// backend (joined into our upstream model via backends/mpi/Makefile.am). +// Each MPI buffer-bearing entry event looks up its buffer pointer in that +// map; if the pointer falls inside a known GPU allocation, the +// {hostname,vpid,vtid} key is recorded in gpu_aware_calls. The matching +// exit callback consumes the key and prepends '*' to the MPI call name +// before pushing it to the aggregator, producing a separate '*MPI_Foo' +// row in the tally. +// +// When the user enables only --backend mpi, no ze events flow in, the +// range-map stays empty, and no '*' ever appears (CPU-only profile). +// --------------------------------------------------------------------------- + +// Per-process address-range map: base -> (base + size). +typedef std::map memory_interval_t; struct data_s { EntryState entry_state; + // GPU allocation ranges, keyed on {hostname, vpid}. Mirrors the + // rangeset_memory_{host,device,shared} pattern in + // backends/ze/btx_zeinterval_callbacks.cpp. + std::unordered_map rangeset_memory_device; + std::unordered_map rangeset_memory_shared; + // {hostname, vpid, vtid} of MPI calls observed to carry at least one + // device/shared GPU buffer. Populated in mpi_*_buf_entry_callback, + // consumed (and cleared) in send_host_message. + std::unordered_set gpu_aware_calls; }; typedef struct data_s data_t; +// True iff `ptr` lies inside any allocation tracked in `m`. +static bool memory_includes(const memory_interval_t &m, uintptr_t ptr) { + const auto it = m.upper_bound(ptr); + if (it == m.cbegin()) + return false; + return ptr < std::prev(it)->second; +} + +// True iff `ptr` lies inside a tracked GPU (device or shared) allocation +// for the given {hostname, vpid}. Host USM is intentionally NOT tagged +// as GPU-aware here (it behaves like host memory from MPI's perspective). +static bool is_gpu_pointer(const data_t *data, const hp_t &hp, uintptr_t ptr) { + auto itd = data->rangeset_memory_device.find(hp); + if (itd != data->rangeset_memory_device.end() && memory_includes(itd->second, ptr)) + return true; + auto its = data->rangeset_memory_shared.find(hp); + if (its != data->rangeset_memory_shared.end() && memory_includes(its->second, ptr)) + return true; + return false; +} + static void send_host_message(void *btx_handle, void *usr_data, int64_t ts, @@ -21,9 +73,16 @@ static void send_host_message(void *btx_handle, uint64_t vtid, bool err) { + auto *data = static_cast(usr_data); + const hpt_t hpt{hostname, vpid, vtid}; + std::string event_class_name_striped = strip_event_class_name_exit(event_class_name); - const int64_t entry_ts = - static_cast(usr_data)->entry_state.get_ts({hostname, vpid, vtid}); + // GPU-aware MPI calls get a leading '*' so they sort and aggregate as a + // distinct row from their CPU-only counterparts in the tally output. + if (data->gpu_aware_calls.erase(hpt) > 0) + event_class_name_striped.insert(0, 1, '*'); + + const int64_t entry_ts = data->entry_state.get_ts(hpt); btx_push_message_lttng_host(btx_handle, hostname, vpid, vtid, entry_ts, BACKEND_MPI, event_class_name_striped.c_str(), (ts - entry_ts), err); @@ -198,6 +257,166 @@ static void traffic_int_entry_callback(void *btx_handle, count * size, oss.str().c_str()); } +// --- ze allocation / free tracking ---------------------------------------- +// Bind allocation size at entry; the matching exit callback uses it together +// with the freshly-returned pointer to populate the range-map. The +// event_class_name suffix (Device vs Shared vs Host) selects which rangeset +// the allocation lands in. +static void ze_alloc_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + size_t size) { + static_cast(usr_data)->entry_state.set_data({hostname, vpid, vtid}, size); +} + +static void ze_alloc_exit_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + void *pptr_val) { + auto *data = static_cast(usr_data); + auto size = data->entry_state.get_data({hostname, vpid, vtid}); + // We don't see zeResult in the matching model to avoid pulling in + // ze_api.h; a failed allocation has pptr_val == NULL. + if (pptr_val == nullptr) + return; + + const std::string ev{event_class_name}; + // Host USM is reachable from the host CPU so MPI sees it as a normal host + // pointer; classifying it as GPU-aware would produce false positives. + // Device and Shared (managed) USM live (at least partly) on the device, + // so MPI calls on them are GPU-aware. + std::unordered_map *mi = nullptr; + if (ev.find("zeMemAllocDevice") != std::string::npos) + mi = &data->rangeset_memory_device; + else if (ev.find("zeMemAllocShared") != std::string::npos) + mi = &data->rangeset_memory_shared; + else + return; // host USM, ignored + + const uintptr_t base = reinterpret_cast(pptr_val); + (*mi)[{hostname, vpid}][base] = base + size; +} + +static void ze_free_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + void *ptr) { + auto *data = static_cast(usr_data); + if (ptr == nullptr) + return; + const uintptr_t base = reinterpret_cast(ptr); + const hp_t hp{hostname, vpid}; + // The freed pointer may have been classified as device OR shared at + // allocation; try both. + auto itd = data->rangeset_memory_device.find(hp); + if (itd != data->rangeset_memory_device.end()) + itd->second.erase(base); + auto its = data->rangeset_memory_shared.find(hp); + if (its != data->rangeset_memory_shared.end()) + its->second.erase(base); +} + +// --- MPI buffer classification -------------------------------------------- +static void +tag_if_gpu(data_t *data, const char *hostname, int64_t vpid, uint64_t vtid, const void *ptr) { + if (ptr == nullptr) + return; + // MPICH sentinels: MPI_IN_PLACE / MPI_BOTTOM. These would never match a + // real allocation but checking explicitly keeps the intent clear. + if (ptr == MPI_IN_PLACE || ptr == MPI_BOTTOM) + return; + if (is_gpu_pointer(data, {hostname, vpid}, reinterpret_cast(ptr))) + data->gpu_aware_calls.insert({hostname, vpid, vtid}); +} + +// metababel strips qualifiers when generating callback typedefs, so buffer +// parameters declared `const void *` in the matching model become `void *` +// in the generated mpi_*_entry_callback_f typedefs. Match exactly. +static void mpi_1buf_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + void *buf) { + tag_if_gpu(static_cast(usr_data), hostname, vpid, vtid, buf); +} + +// MPI_Sendrecv, MPI_Allreduce, MPI_Reduce, MPI_Gather, ... — the bulk. +static void mpi_sendrecv_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + void *sendbuf, + void *recvbuf) { + auto *data = static_cast(usr_data); + tag_if_gpu(data, hostname, vpid, vtid, sendbuf); + tag_if_gpu(data, hostname, vpid, vtid, recvbuf); +} + +// MPI_Reduce_local(inbuf, inoutbuf, ...) +static void mpi_reduce_local_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + void *inbuf, + void *inoutbuf) { + auto *data = static_cast(usr_data); + tag_if_gpu(data, hostname, vpid, vtid, inbuf); + tag_if_gpu(data, hostname, vpid, vtid, inoutbuf); +} + +// MPI_Get_accumulate, MPI_Fetch_and_op (origin_addr + result_addr). +static void mpi_rma_fetch_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + void *origin_addr, + void *result_addr) { + auto *data = static_cast(usr_data); + tag_if_gpu(data, hostname, vpid, vtid, origin_addr); + tag_if_gpu(data, hostname, vpid, vtid, result_addr); +} + +// MPI_Compare_and_swap(origin_addr, compare_addr, result_addr, ...) +static void mpi_cas_entry_callback(void *btx_handle, + void *usr_data, + int64_t ts, + const char *event_class_name, + const char *hostname, + int64_t vpid, + uint64_t vtid, + void *origin_addr, + void *compare_addr, + void *result_addr) { + auto *data = static_cast(usr_data); + tag_if_gpu(data, hostname, vpid, vtid, origin_addr); + tag_if_gpu(data, hostname, vpid, vtid, compare_addr); + tag_if_gpu(data, hostname, vpid, vtid, result_addr); +} + void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_initialize_component(btx_handle, &btx_initialize_component); btx_register_callbacks_finalize_component(btx_handle, &btx_finalize_component); @@ -209,4 +428,16 @@ void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_traffic_int_entry(btx_handle, &traffic_int_entry_callback); btx_register_callbacks_lttng_ust_mpi_type_property(btx_handle, &type_property_callback); + + // GPU-aware MPI: ze alloc/free updates the range-map, MPI buffer entries + // look up against it. One callback per MPI buffer-shape (see the matching + // model for why we cannot lump them into a single regex). + btx_register_callbacks_ze_alloc_entry(btx_handle, &ze_alloc_entry_callback); + btx_register_callbacks_ze_alloc_exit(btx_handle, &ze_alloc_exit_callback); + btx_register_callbacks_ze_free_entry(btx_handle, &ze_free_entry_callback); + btx_register_callbacks_mpi_1buf_entry(btx_handle, &mpi_1buf_entry_callback); + btx_register_callbacks_mpi_sendrecv_entry(btx_handle, &mpi_sendrecv_entry_callback); + btx_register_callbacks_mpi_reduce_local_entry(btx_handle, &mpi_reduce_local_entry_callback); + btx_register_callbacks_mpi_rma_fetch_entry(btx_handle, &mpi_rma_fetch_entry_callback); + btx_register_callbacks_mpi_cas_entry(btx_handle, &mpi_cas_entry_callback); } diff --git a/backends/mpi/btx_mpimatching_model.yaml b/backends/mpi/btx_mpimatching_model.yaml index 829bbedc..79826888 100644 --- a/backends/mpi/btx_mpimatching_model.yaml +++ b/backends/mpi/btx_mpimatching_model.yaml @@ -14,11 +14,14 @@ :field_class: :type: integer_unsigned :event_classes: + # Restrict the bare entry/exit patterns to the MPI provider so that + # ze allocation/free events sourced from the ze backend (see below) + # do not accidentally match them. - :set_id: entries - :name: "_entry$" + :name: "^lttng_ust_mpi:.*_entry$" - :set_id: exits :register: false - :name: "_exit$" + :name: "^lttng_ust_mpi:.*_exit$" - :set_id: exits_mpiError_present :domain: exits :payload_field_class: @@ -52,3 +55,114 @@ - :name: ^(datatype|origin_count)$ :field_class: :cast_type: MPI_Datatype + # ------------------------------------------------------------------ + # GPU-aware MPI tagging + # + # Categories below feed btx_mpiinterval_callbacks.cpp's range-map + # lookup. The MPI buffer categories capture the payload pointer(s) + # of every buffer-bearing MPI call; the ze alloc/free categories + # maintain a {hostname,vpid} -> address-range map. An MPI call whose + # buffer falls in the range-map is tagged with a leading '*' in the + # tally output. This is only effective when the user enables both + # --backend mpi and --backend ze; with only --backend mpi the + # range-map stays empty and no '*' ever appears. + # ------------------------------------------------------------------ + # MPI buffer categories. metababel requires each :name: regex to bind + # to EXACTLY ONE field per matched event, so we cannot lump all buffer + # names into a single regex slot: an event like MPI_Reduce_local has + # both `inbuf` and `inoutbuf`, and MPI_Compare_and_swap has three + # buffer fields, which would over-match. Define one category per + # distinct MPI buffer-shape, then a catch-all for single-buffer calls. + # The cast_type accepts both `const void *` (input buffers) and plain + # `void *` (output buffers) since MPI prototypes use both. + # + # 3-buffer: MPI_Compare_and_swap + - :set_id: mpi_cas_entry + :domain: entries + :payload_field_class: + :type: structure + :members: + - :name: ^origin_addr$ + :field_class: + :cast_type: ^(const )?void \*$ + - :name: ^compare_addr$ + :field_class: + :cast_type: ^(const )?void \*$ + - :name: ^result_addr$ + :field_class: + :cast_type: ^(const )?void \*$ + # 2-buffer Send/Recv pair (Sendrecv, Allreduce, Reduce, Gather, ...). + - :set_id: mpi_sendrecv_entry + :domain: entries - mpi_cas_entry + :payload_field_class: + :type: structure + :members: + - :name: ^sendbuf$ + :field_class: + :cast_type: ^(const )?void \*$ + - :name: ^recvbuf$ + :field_class: + :cast_type: ^(const )?void \*$ + # 2-buffer Reduce_local pair (inbuf + inoutbuf). + - :set_id: mpi_reduce_local_entry + :domain: entries - mpi_cas_entry - mpi_sendrecv_entry + :payload_field_class: + :type: structure + :members: + - :name: ^inbuf$ + :field_class: + :cast_type: ^(const )?void \*$ + - :name: ^inoutbuf$ + :field_class: + :cast_type: ^(const )?void \*$ + # 2-buffer RMA fetch pair (Get_accumulate, Fetch_and_op). + - :set_id: mpi_rma_fetch_entry + :domain: entries - mpi_cas_entry - mpi_sendrecv_entry - mpi_reduce_local_entry + :payload_field_class: + :type: structure + :members: + - :name: ^origin_addr$ + :field_class: + :cast_type: ^(const )?void \*$ + - :name: ^result_addr$ + :field_class: + :cast_type: ^(const )?void \*$ + # 1-buffer everything else (Send/Recv/Bcast/Put/Get/Accumulate/...). + # After excluding the multi-buffer categories above, every remaining + # buffer-bearing MPI call carries exactly one of {buf, buffer, + # origin_addr}. Pack/Unpack (inbuf + outbuf) is intentionally NOT + # included: the buffers never traverse the network, so GPU-aware + # tagging is meaningless there. + - :set_id: mpi_1buf_entry + :domain: entries - mpi_cas_entry - mpi_sendrecv_entry - mpi_reduce_local_entry - mpi_rma_fetch_entry + :payload_field_class: + :type: structure + :members: + - :name: ^(buf|buffer|origin_addr)$ + :field_class: + :cast_type: ^(const )?void \*$ + # Level Zero USM allocation / free events sourced from the ze + # backend (see backends/ze/btx_zematching_model.yaml for matching + # parents). We bind size at entry and the allocated pointer at exit, + # mirroring btx_zeinterval_callbacks.cpp:entries_alloc / exits_alloc. + - :set_id: ze_alloc_entry + :name: "^lttng_ust_ze:zeMemAlloc.*_entry$" + :payload_field_class: + :type: structure + :members: + - :name: ^size$ + # We deliberately omit zeResult here so that the consumer callback + # signature does not need to depend on ze_api.h. A failed allocation + # has pptr_val == NULL, which the callback already guards against. + - :set_id: ze_alloc_exit + :name: "^lttng_ust_ze:zeMemAlloc.*_exit$" + :payload_field_class: + :type: structure + :members: + - :name: ^pptr_val$ + - :set_id: ze_free_entry + :name: "^lttng_ust_ze:zeMemFree_entry$" + :payload_field_class: + :type: structure + :members: + - :name: ^ptr$ diff --git a/backends/mpi/gen_btx_mpi_with_ze_model.rb b/backends/mpi/gen_btx_mpi_with_ze_model.rb new file mode 100644 index 00000000..e68530f0 --- /dev/null +++ b/backends/mpi/gen_btx_mpi_with_ze_model.rb @@ -0,0 +1,77 @@ +#!/usr/bin/env ruby +# Generate a merged upstream YAML model for the MPI interval filter. +# +# btx_filter_mpi needs to receive Level Zero allocation/free events in +# addition to MPI events so that btx_mpiinterval_callbacks.cpp can +# maintain an address-range map and tag GPU-aware MPI calls. Two +# constraints conspire to make this non-trivial: +# +# 1. metababel's bare `-u model_a,model_b` union fails because each +# backend model declares :environment: :entries:[{name: hostname}], +# causing the matching engine to see an ambiguous member. +# +# 2. The MPI bt2 plugin (libMPIInterval) is built with only mpi.h on +# the include path; pulling in the full ze event payloads would +# drag in types (ze_context_handle_t, ze_result_t, +# ze_device_mem_alloc_desc_t *, ...) that the MPI consumer cannot +# resolve. +# +# Side-step both by emitting a single merged model that: +# - starts from btx_mpi_model.yaml (single :environment:), +# - copies only the ze allocation/free event classes we need into +# MPI's stream class, +# - keeps only the payload fields the MPI consumer actually uses, +# so all C types referenced in the generated upstream typedefs +# remain primitive (size_t / void *) and need no ze headers. +# +# Usage: ruby gen_btx_mpi_with_ze_model.rb MPI_MODEL.yaml ZE_MODEL.yaml > OUT.yaml +require 'yaml' + +mpi_path = ARGV[0] +ze_path = ARGV[1] +raise 'usage: gen_btx_mpi_with_ze_model.rb MPI_MODEL ZE_MODEL' unless mpi_path && ze_path + +# (event name) -> (allow-list of payload field names the MPI consumer +# needs to see). Fields not in the allow-list are dropped from the +# copied event, keeping the generated upstream typedefs free of any +# ze-specific types. +ZE_EVENT_FIELDS = { + 'lttng_ust_ze:zeMemAllocHost_entry' => %w[size], + 'lttng_ust_ze:zeMemAllocHost_exit' => %w[pptr_val], + 'lttng_ust_ze:zeMemAllocDevice_entry' => %w[size], + 'lttng_ust_ze:zeMemAllocDevice_exit' => %w[pptr_val], + 'lttng_ust_ze:zeMemAllocShared_entry' => %w[size], + 'lttng_ust_ze:zeMemAllocShared_exit' => %w[pptr_val], + 'lttng_ust_ze:zeMemFree_entry' => %w[ptr], + 'lttng_ust_ze:zeMemFree_exit' => %w[], +}.freeze + +mpi_model = YAML.load_file(mpi_path) +ze_model = YAML.load_file(ze_path) + +ze_events = ze_model.fetch(:stream_classes, []).flat_map do |sc| + sc.fetch(:event_classes, []).map do |ec| + wanted = ZE_EVENT_FIELDS[ec[:name]] + next nil unless wanted + + # Deep-copy via Marshal so we can mutate freely without aliasing the + # original ze model. + ec = Marshal.load(Marshal.dump(ec)) + if (pfc = ec[:payload_field_class]) && (members = pfc[:members]) + pfc[:members] = members.select { |m| wanted.include?(m[:name].to_s) } + ec.delete(:payload_field_class) if pfc[:members].empty? + end + ec + end.compact +end + +if ze_events.empty? + warn "WARNING: no ze allocation events found in #{ze_path}; " \ + 'GPU-aware MPI tagging will be inactive.' +end + +# Inject the ze events into the MPI stream class. There is only one +# stream class per backend model (see utils/gen_babeltrace_model_helper.rb). +mpi_model[:stream_classes].first[:event_classes].concat(ze_events) + +puts mpi_model.to_yaml