From 9c648faf7aed0114f7b6f1a8880fdc2f617881bd Mon Sep 17 00:00:00 2001 From: vic Date: Mon, 9 Feb 2026 11:05:28 +0100 Subject: [PATCH 1/2] Fix MG CAGRA search OOB --- cpp/src/neighbors/detail/cagra/compute_distance.hpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index f9974fa3df..ef6ad48202 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -235,6 +235,7 @@ struct dataset_descriptor_host { std::mutex mutex; std::atomic ready; // Not sure if std::holds_alternative is thread-safe std::variant value; + cudaEvent_t init_event{nullptr}; template state(InitF init, size_t size) : ready{false}, value{std::make_tuple(init, size)} @@ -247,6 +248,7 @@ struct dataset_descriptor_host { auto& [ptr, stream] = std::get(value); RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(ptr, stream)); } + if (init_event != nullptr) { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(init_event)); } } void eval(rmm::cuda_stream_view stream) @@ -257,6 +259,10 @@ struct dataset_descriptor_host { dev_descriptor_t* ptr = nullptr; RAFT_CUDA_TRY(cudaMallocAsync(&ptr, size, stream)); fun(ptr, stream); + // Record an event after initialization so that other streams can establish + // a GPU-side dependency without expensive host synchronization. + RAFT_CUDA_TRY(cudaEventCreateWithFlags(&init_event, cudaEventDisableTiming)); + RAFT_CUDA_TRY(cudaEventRecord(init_event, stream)); value = std::make_tuple(ptr, stream); ready.store(true, std::memory_order_release); } @@ -265,6 +271,11 @@ struct dataset_descriptor_host { auto get(rmm::cuda_stream_view stream) -> dev_descriptor_t* { if (!ready.load(std::memory_order_acquire)) { eval(stream); } + // Make the caller's stream wait for the init to complete. This is a + // lightweight GPU-side dependency with no host blocking. On the same + // stream that performed the init (or after the event has already + // completed) this is essentially a no-op. + if (init_event != nullptr) { RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, init_event)); } return std::get<0>(std::get(value)); } }; From 49a24f4fb74057ccded3387d47d470f9a2d595ad Mon Sep 17 00:00:00 2001 From: vic Date: Tue, 10 Feb 2026 18:05:21 +0100 Subject: [PATCH 2/2] adressing review --- cpp/src/neighbors/detail/cagra/compute_distance.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index ef6ad48202..22b325fca5 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -257,11 +257,11 @@ struct dataset_descriptor_host { if (std::holds_alternative(value)) { auto& [fun, size] = std::get(value); dev_descriptor_t* ptr = nullptr; + RAFT_CUDA_TRY(cudaEventCreateWithFlags(&init_event, cudaEventDisableTiming)); RAFT_CUDA_TRY(cudaMallocAsync(&ptr, size, stream)); fun(ptr, stream); // Record an event after initialization so that other streams can establish // a GPU-side dependency without expensive host synchronization. - RAFT_CUDA_TRY(cudaEventCreateWithFlags(&init_event, cudaEventDisableTiming)); RAFT_CUDA_TRY(cudaEventRecord(init_event, stream)); value = std::make_tuple(ptr, stream); ready.store(true, std::memory_order_release);