diff --git a/src/framework/containers/particles_comm.cpp b/src/framework/containers/particles_comm.cpp index 8fe9d1ec..0fa140f9 100644 --- a/src/framework/containers/particles_comm.cpp +++ b/src/framework/containers/particles_comm.cpp @@ -28,6 +28,11 @@ namespace ntt { npart_t nsend, npart_t nrecv, npart_t offset) { +#if defined(DEVICE_ENABLED) + // guard for Intel GPUs. + // Should be a null-operation for other architectures. + Kokkos::fence(); +#endif #if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) MPI_Sendrecv(send_arr.data(), nsend, @@ -98,6 +103,11 @@ namespace ntt { template void send(array_t& send_arr, int send_rank, npart_t nsend) { +#if defined(DEVICE_ENABLED) + // guard for Intel GPUs. + // Should be a null-operation for other architectures. + Kokkos::fence(); +#endif #if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) MPI_Send(send_arr.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); #else @@ -109,6 +119,11 @@ namespace ntt { template void recv(array_t& recv_arr, int recv_rank, npart_t nrecv, npart_t offset) { +#if defined(DEVICE_ENABLED) + // guard for Intel GPUs. + // Should be a null-operation for other architectures. + Kokkos::fence(); +#endif #if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) MPI_Recv(recv_arr.data() + offset, nrecv, diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp index 8741c966..904fb3fc 100644 --- a/src/framework/containers/particles_sort.cpp +++ b/src/framework/containers/particles_sort.cpp @@ -25,27 +25,33 @@ namespace ntt { const auto num_tags = ntags(); array_t npptag { "nparts_per_tag", ntags() }; - // count # of particles per each tag - auto npptag_scat = Kokkos::Experimental::create_scatter_view(npptag); + // count # of particles per each tag, skipping the alive bin in-kernel. + constexpr short tag_alive_s = static_cast(ParticleTag::alive); Kokkos::parallel_for( "NpartPerTag", rangeActiveParticles(), Lambda(prtlidx_t p) { - auto npptag_acc = npptag_scat.access(); - if (this_tag(p) < 0 || this_tag(p) >= static_cast(num_tags)) { + const short t = this_tag(p); + if (t < 0 || t >= static_cast(num_tags)) { raise::KernelError(HERE, "Invalid tag value"); } - npptag_acc(this_tag(p)) += 1; + if (t != tag_alive_s) { + Kokkos::atomic_add(&npptag(t), static_cast(1)); + } }); - Kokkos::Experimental::contribute(npptag, npptag_scat); - // copy the count to a vector on the host + // copy the count to a vector on the host and reconstruct the alive bin auto npptag_h = Kokkos::create_mirror_view(npptag); Kokkos::deep_copy(npptag_h, npptag); std::vector npptag_vec(num_tags); + npart_t non_alive_total = 0; for (auto t { 0u }; t < num_tags; ++t) { npptag_vec[t] = npptag_h(t); + if (static_cast(t) != tag_alive_s) { + non_alive_total += npptag_h(t); + } } + npptag_vec[tag_alive_s] = npart() - non_alive_total; // count the offsets on the host and copy to device const array_t tag_offsets("tag_offsets", num_tags - 3); diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp index e3598f8b..52103c17 100644 --- a/src/framework/domain/comm_mpi.hpp +++ b/src/framework/domain/comm_mpi.hpp @@ -33,6 +33,11 @@ namespace comm { int recv_rank, ncells_t nsend, ncells_t nrecv) { +#if defined(DEVICE_ENABLED) + // guard for Intel GPUs. + // Should be a null-operation for other architectures. + Kokkos::fence(); +#endif #if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) MPI_Sendrecv(send_arr.data(), nsend, @@ -68,6 +73,11 @@ namespace comm { template void send(ndarray_t& send_arr, int send_rank, ncells_t nsend) { +#if defined(DEVICE_ENABLED) + // guard for Intel GPUs. + // Should be a null-operation for other architectures. + Kokkos::fence(); +#endif #if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) MPI_Send(send_arr.data(), nsend, mpi::get_type(), send_rank, 0, MPI_COMM_WORLD); #else @@ -84,6 +94,11 @@ namespace comm { template void recv(ndarray_t& recv_arr, int recv_rank, ncells_t nrecv) { +#if defined(DEVICE_ENABLED) + // guard for Intel GPUs. + // Should be a null-operation for other architectures. + Kokkos::fence(); +#endif #if !defined(DEVICE_ENABLED) || defined(GPU_AWARE_MPI) MPI_Recv(recv_arr.data(), nrecv, diff --git a/src/framework/domain/metadomain_comm.cpp b/src/framework/domain/metadomain_comm.cpp index e492752c..f8c0f60d 100644 --- a/src/framework/domain/metadomain_comm.cpp +++ b/src/framework/domain/metadomain_comm.cpp @@ -349,19 +349,6 @@ namespace ntt { false); } } else { - if (comm_em) { - comm::CommunicateField(domain.index(), - domain.fields.em, - domain.fields.em, - send_ind, - recv_ind, - send_rank, - recv_rank, - send_slice, - recv_slice, - comp_range_fld, - false); - } if (comm_j) { comm::CommunicateField(domain.index(), domain.fields.cur,