From 5ab2991dc51f300f26b6e916e3b3760926c30f15 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Fri, 23 Nov 2018 10:23:46 -0800 Subject: [PATCH] Add tests for all collectives. This changes test_correctness to test_allreduces for consistency. New argument passing added into test_utils. Note that the allgather tests currently fail (see #22). --- test/CMakeLists.txt | 99 ++----- test/test_allgather.cpp | 178 ++++++++++++ ...est_correctness.cpp => test_allreduce.cpp} | 34 +-- test/test_alltoall.cpp | 165 +++++++++++ test/test_bcast.cpp | 142 +++++++++ test/test_gather.cpp | 165 +++++++++++ test/test_reduce.cpp | 165 +++++++++++ test/test_reduce_scatter.cpp | 170 +++++++++++ test/test_scatter.cpp | 167 +++++++++++ test/test_utils.hpp | 270 +++++++++++++++++- test/test_utils_cuda.hpp | 61 +++- 11 files changed, 1495 insertions(+), 121 deletions(-) create mode 100644 test/test_allgather.cpp rename test/{test_correctness.cpp => test_allreduce.cpp} (87%) create mode 100644 test/test_alltoall.cpp create mode 100644 test/test_bcast.cpp create mode 100644 test/test_gather.cpp create mode 100644 test/test_reduce.cpp create mode 100644 test/test_reduce_scatter.cpp create mode 100644 test/test_scatter.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bd90805c..23b3e88d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,8 +1,3 @@ -# The name changed in CMake 3.10 -if (NOT MPIEXEC_EXECUTABLE AND MPIEXEC) - set(MPIEXEC_EXECUTABLE ${MPIEXEC}) -endif () - set_full_path(TEST_HEADERS test_utils.hpp) if (AL_HAS_CUDA) @@ -23,78 +18,30 @@ target_sources(aluminum_test_headers INTERFACE "${TEST_HEADERS}") target_include_directories( aluminum_test_headers INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}") -add_executable(TestCorrectness.exe test_correctness.cpp ${TEST_HEADERS}) -target_link_libraries(TestCorrectness.exe PRIVATE Al) -if (AL_HAS_CUDA) - target_link_libraries(TestCorrectness.exe PUBLIC cuda) -endif () - -# This is mostly a sanity check -set(TEST_ARGS MPI 8) -add_test(NAME TestCorrectness - COMMAND $ ${TEST_ARGS}) +set(TEST_SRCS + test_allreduce.cpp + test_reduce.cpp + test_reduce_scatter.cpp + test_allgather.cpp + test_alltoall.cpp + test_bcast.cpp + test_gather.cpp + test_scatter.cpp + test_multi_nballreduces.cpp + test_nccl_collectives.cpp) + +foreach(src ${TEST_SRCS}) + string(REPLACE ".cpp" ".exe" _test_exe_name "${src}") + add_executable(${_test_exe_name} ${src}) + target_link_libraries(${_test_exe_name} PRIVATE Al aluminum_test_headers) + if (AL_HAS_CUDA) + target_link_libraries(${_test_exe_name} PUBLIC cuda) + endif() +endforeach() -if (MPIEXEC_EXECUTABLE) - add_test(NAME TestCorrectness_np4 - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 4 - ${MPIEXEC_PREFLAGS} - $ - ${MPIEXEC_POSTFLAGS} ${TEST_ARGS}) -endif () - -add_executable(TestMultiNBAllReduces.exe - test_multi_nballreduces.cpp ${TEST_HEADERS}) -target_link_libraries(TestMultiNBAllReduces.exe PRIVATE Al) if (AL_HAS_CUDA) - target_link_libraries(TestMultiNBAllReduces.exe PUBLIC cuda) -endif () - -set(TEST_ARGS "8") -add_test(NAME TestMultiNBAllReduces - COMMAND $ ${TEST_ARGS}) - -if (MPIEXEC_EXECUTABLE) - add_test(NAME TestMultiNBAllReduces_np4 - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 4 - ${MPIEXEC_PREFLAGS} - $ - ${MPIEXEC_POSTFLAGS} ${TEST_ARGS}) -endif () - -add_executable(TestNCCLCollectives.exe - test_nccl_collectives.cpp ${TEST_HEADERS}) -target_link_libraries(TestNCCLCollectives.exe PRIVATE Al) -if (AL_HAS_CUDA) - target_link_libraries(TestNCCLCollectives.exe PUBLIC cuda) -endif () - 
-set(TEST_ARGS "8") -add_test(NAME TestNCCLCollectives - COMMAND $ ${TEST_ARGS}) - -if (MPIEXEC_EXECUTABLE) - add_test(NAME TestNCCLCollectives_np4 - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 4 - ${MPIEXEC_PREFLAGS} - $ - ${MPIEXEC_POSTFLAGS} ${TEST_ARGS}) -endif () - -if (AL_HAS_CUDA) - add_executable(TestStreamMemOps.exe + add_executable(test_stream_mem_ops.exe test_stream_mem_ops.cpp ${TEST_HEADERS}) - target_link_libraries(TestStreamMemOps.exe PRIVATE Al) - target_link_libraries(TestStreamMemOps.exe PUBLIC cuda) - - set(TEST_ARGS "8") - add_test(NAME TestStreamMemOps - COMMAND $ ${TEST_ARGS}) - - if (MPIEXEC_EXECUTABLE) - add_test(NAME TestStreamMemOps_np4 - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 4 - ${MPIEXEC_PREFLAGS} - $ - ${MPIEXEC_POSTFLAGS} ${TEST_ARGS}) - endif () + target_link_libraries(test_stream_mem_ops.exe PRIVATE Al) + target_link_libraries(test_stream_mem_ops.exe PUBLIC cuda) endif () diff --git a/test/test_allgather.cpp b/test/test_allgather.cpp new file mode 100644 index 00000000..3c95f232 --- /dev/null +++ b/test/test_allgather.cpp @@ -0,0 +1,178 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +// Size is the per-rank send size. +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test allgather algo on input, check with expected. + */ +template +void test_allgather_algo(const typename VectorType::type& expected, + const typename VectorType::type& expected_inplace, + typename VectorType::type input, + typename VectorType::type input_inplace, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + auto recv = get_vector(input.size() * comm.size()); + // Test regular allgather. + Al::Allgather(input.data(), recv.data(), input.size(), comm, algo); + if (!check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular allgather does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place allgather. 
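+  // The in-place buffer holds comm.size() blocks of
+  // input_inplace.size() / comm.size() elements each; assuming the usual
+  // MPI_IN_PLACE allgather semantics, block comm.rank() already contains
+  // this rank's contribution and the collective fills in the other blocks.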
+ std::stringstream ss; + ss << comm.rank() << ": input: "; + for (const auto& v : input_inplace.copyout()) ss << v << " "; + std::cout << ss.str() << std::endl; + Al::Allgather(input_inplace.data(), input_inplace.size() / comm.size(), + comm, algo); + MPI_Barrier(MPI_COMM_WORLD); + if (!check_vector(expected_inplace, input_inplace)) { + std::cout << comm.rank() << ": in-place allgather does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking allgather algo on input, check with expected. + */ +template +void test_nb_allgather_algo(const typename VectorType::type& expected, + const typename VectorType::type& expected_inplace, + typename VectorType::type input, + typename VectorType::type input_inplace, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + auto recv = get_vector(input.size() * comm.size()); + // Test regular allgather. + Al::NonblockingAllgather(input.data(), recv.data(), + input.size(), comm, req, algo); + Al::Wait(req); + if (!check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular allgather does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place allgather. + Al::NonblockingAllgather(input_inplace.data(), + input_inplace.size() / comm.size(), + comm, req, algo); + Al::Wait(req); + if (!check_vector(expected_inplace, input_inplace)) { + std::cout << comm.rank() << ": in-place allgather does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_allgather_algorithms(); + auto nb_algos = get_nb_allgather_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. + std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + // Compute true value. + size_t global_size = size * comm.size(); + typename VectorType::type &&data = gen_data(size); + auto expected = get_vector(global_size); + get_expected_allgather_result(data, expected); + typename VectorType::type &&data_inplace = gen_data(global_size); + auto expected_inplace(data_inplace); + get_expected_allgather_inplace_result(expected_inplace); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_allgather_algo(expected, expected_inplace, + data, data_inplace, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_allgather_algo(expected, expected_inplace, + data, data_inplace, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. +#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Allgather not supported on MPI backend." 
<< std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + test_correctness(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_correctness.cpp b/test/test_allreduce.cpp similarity index 87% rename from test/test_correctness.cpp rename to test/test_allreduce.cpp index eece6f97..f45fa165 100644 --- a/test/test_correctness.cpp +++ b/test/test_allreduce.cpp @@ -39,13 +39,9 @@ #include #include +size_t start_size = 1; size_t max_size = 1<<30; -void get_expected_result(std::vector& expected) { - MPI_Allreduce(MPI_IN_PLACE, expected.data(), expected.size(), - MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); -} - /** * Test allreduce algo on input, check with expected. */ @@ -113,14 +109,7 @@ void test_correctness() { auto nb_algos = get_nb_allreduce_algorithms(); typename Backend::comm_type comm; // Use COMM_WORLD. // Compute sizes to test. - std::vector sizes = {0}; - for (size_t size = 1; size <= max_size; size *= 2) { - sizes.push_back(size); - // Avoid duplicating 2. - if (size > 1) { - sizes.push_back(size + 1); - } - } + std::vector sizes = get_sizes(start_size, max_size, true); for (const auto& size : sizes) { if (comm.rank() == 0) { std::cout << "Testing size " << human_readable_size(size) << std::endl; @@ -128,7 +117,7 @@ void test_correctness() { // Compute true value. typename VectorType::type &&data = gen_data(size); auto expected(data); - get_expected_result(expected); + get_expected_allreduce_result(expected); // Test algorithms. for (auto&& algo : algos) { MPI_Barrier(MPI_COMM_WORLD); @@ -155,12 +144,7 @@ int main(int argc, char** argv) { Al::Initialize(argc, argv); std::string backend = "MPI"; - if (argc >= 2) { - backend = argv[1]; - } - if (argc == 3) { - max_size = std::stoul(argv[2]); - } + parse_args(argc, argv, backend, start_size, max_size); if (backend == "MPI") { test_correctness(); @@ -172,16 +156,6 @@ int main(int argc, char** argv) { } else if (backend == "MPI-CUDA") { test_correctness(); #endif - } else { - std::cerr << "usage: " << argv[0] << " [MPI"; -#ifdef AL_HAS_NCCL - std::cerr << " | NCCL"; -#endif -#ifdef AL_HAS_MPI_CUDA - std::cerr << " | MPI-CUDA"; -#endif - std::cerr << "]" << std::endl; - return -1; } Al::Finalize(); diff --git a/test/test_alltoall.cpp b/test/test_alltoall.cpp new file mode 100644 index 00000000..a5e577f6 --- /dev/null +++ b/test/test_alltoall.cpp @@ -0,0 +1,165 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +// Per-process send/recv size. +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test alltoall algo on input, check with expected. + */ +template +void test_alltoall_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + auto recv = get_vector(input.size()); + // Test regular alltoall. + Al::Alltoall(input.data(), recv.data(), input.size() / comm.size(), + comm, algo); + if (!check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular alltoall does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place alltoall. + Al::Alltoall(input.data(), input.size() / comm.size(), comm, algo); + if (!check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place alltoall does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking alltoall algo on input, check with expected. + */ +template +void test_nb_alltoall_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + auto recv = get_vector(input.size()); + // Test regular alltoall. + Al::NonblockingAlltoall(input.data(), recv.data(), + input.size() / comm.size(), + comm, req, algo); + Al::Wait(req); + if (!check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular alltoall does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place alltoall. + Al::NonblockingAlltoall(input.data(), input.size() / comm.size(), + comm, req, algo); + Al::Wait(req); + if (!check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place alltoall does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_alltoall_algorithms(); + auto nb_algos = get_nb_alltoall_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. + std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + size_t full_size = size * comm.size(); + // Compute true value. + typename VectorType::type &&data = gen_data(full_size); + auto expected(data); + get_expected_alltoall_result(expected); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_alltoall_algo(expected, data, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_alltoall_algo(expected, data, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. 
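+  // set_device() comes from the CUDA test utilities; a common scheme
+  // (illustrative only, not necessarily what it does here) is
+  //   cudaSetDevice(local_rank % device_count);
+  // so each rank owns a distinct GPU before Al::Initialize is called.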
+#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Alltoall not supported on MPI backend." << std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + std::cerr << "Alltoall not supported on NCCL backend." << std::endl; + std::abort(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_bcast.cpp b/test/test_bcast.cpp new file mode 100644 index 00000000..28bc1a83 --- /dev/null +++ b/test/test_bcast.cpp @@ -0,0 +1,142 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test bcast algo on input, check with expected. + */ +template +void test_bcast_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + // Test in-place bcast (bcast is always in-place). + Al::Bcast(input.data(), input.size(), + 0, comm, algo); + if (!check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place bcast does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking bcast algo on input, check with expected. + */ +template +void test_nb_bcast_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + // Test in-place bcast (bcast is always in-place). + Al::NonblockingBcast(input.data(), input.size(), + 0, comm, req, algo); + Al::Wait(req); + if (!check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place bcast does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_bcast_algorithms(); + auto nb_algos = get_nb_bcast_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. 
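+  // get_sizes() doubles from start_size to max_size and, with odds=true,
+  // also inserts size+1 to cover non-power-of-two counts; e.g. start=1,
+  // max=16 yields {1, 2, 3, 4, 5, 8, 9, 16, 17} (the odd companion of the
+  // last power of two may exceed max_size).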
+ std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + // Compute true value. + typename VectorType::type &&data = gen_data(size); + auto expected(data); + get_expected_bcast_result(expected); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + // TODO: Update when we have real algorithm sets for each op. + std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_bcast_algo(expected, data, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_bcast_algo(expected, data, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. +#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Bcast not supported on MPI backend." << std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + test_correctness(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_gather.cpp b/test/test_gather.cpp new file mode 100644 index 00000000..9aeccd29 --- /dev/null +++ b/test/test_gather.cpp @@ -0,0 +1,165 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +// Per-rank data size. +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test gather algo on input, check with expected. + */ +template +void test_gather_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + auto recv = get_vector(input.size()); + // Test regular gather. 
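+  // Gather is rooted at rank 0: each rank sends input.size() / comm.size()
+  // elements and only the root's recv buffer is defined afterwards, which
+  // is why the checks below are guarded by comm.rank() == 0.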
+ Al::Gather(input.data(), recv.data(), input.size() / comm.size(), + 0, comm, algo); + if (comm.rank() == 0 && !check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular gather does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place gather. + Al::Gather(input.data(), input.size() / comm.size(), 0, comm, algo); + if (comm.rank() == 0 && !check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place gather does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking gather algo on input, check with expected. + */ +template +void test_nb_gather_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + auto recv = get_vector(input.size()); + // Test regular gather. + Al::NonblockingGather(input.data(), recv.data(), + input.size() / comm.size(), + 0, comm, req, algo); + Al::Wait(req); + if (comm.rank() == 0 && !check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular gather does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place gather. + Al::NonblockingGather(input.data(), input.size() / comm.size(), + 0, comm, req, algo); + Al::Wait(req); + if (comm.rank() == 0 && !check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place gather does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_gather_algorithms(); + auto nb_algos = get_nb_gather_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. + std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + // Compute true value. + typename VectorType::type &&data = gen_data(size*comm.size()); + auto expected(data); + get_expected_gather_result(expected); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + // TODO: Update when we have real algorithm sets for each op. + std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_gather_algo(expected, data, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_gather_algo(expected, data, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. +#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Gather not supported on MPI backend." << std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + std::cerr << "Gather not supported on NCCL backend." 
<< std::endl; + std::abort(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_reduce.cpp b/test/test_reduce.cpp new file mode 100644 index 00000000..881286f9 --- /dev/null +++ b/test/test_reduce.cpp @@ -0,0 +1,165 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test reduce algo on input, check with expected. + */ +template +void test_reduce_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + auto recv = get_vector(input.size()); + // Test regular reduce. + Al::Reduce(input.data(), recv.data(), input.size(), + Al::ReductionOperator::sum, 0, comm, algo); + if (comm.rank() == 0 && !check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular reduce does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place reduce. + Al::Reduce(input.data(), input.size(), + Al::ReductionOperator::sum, 0, comm, algo); + if (comm.rank() == 0 && !check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place reduce does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking reduce algo on input, check with expected. + */ +template +void test_nb_reduce_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + auto recv = get_vector(input.size()); + // Test regular reduce. + Al::NonblockingReduce(input.data(), recv.data(), input.size(), + Al::ReductionOperator::sum, 0, comm, + req, algo); + Al::Wait(req); + if (comm.rank() == 0 && !check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular reduce does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place reduce. 
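+  // In-place reduce overwrites the root's buffer with the sum; only rank 0
+  // verifies the result, matching get_expected_reduce_result(), which uses
+  // MPI_IN_PLACE on the root only.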
+ Al::NonblockingReduce(input.data(), input.size(), + Al::ReductionOperator::sum, 0, comm, + req, algo); + Al::Wait(req); + if (comm.rank() == 0 && !check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place reduce does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_reduce_algorithms(); + auto nb_algos = get_nb_reduce_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. + std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + // Compute true value. + typename VectorType::type &&data = gen_data(size); + auto expected(data); + get_expected_reduce_result(expected); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + // TODO: Update when we have real algorithm sets for each op. + std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_reduce_algo(expected, data, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_reduce_algo(expected, data, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. +#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Reduce not supported on MPI backend." << std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + test_correctness(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_reduce_scatter.cpp b/test/test_reduce_scatter.cpp new file mode 100644 index 00000000..6a845e5f --- /dev/null +++ b/test/test_reduce_scatter.cpp @@ -0,0 +1,170 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +// Size is the per-rank recv size. +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test reduce-scatter algo on input, check with expected. + */ +template +void test_reduce_scatter_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + auto recv = get_vector(input.size() / comm.size()); + // Test regular reduce-scatter. + Al::Reduce_scatter(input.data(), recv.data(), + input.size() / comm.size(), + Al::ReductionOperator::sum, comm, algo); + if (!check_vector(expected, recv, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": regular reduce-scatter does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place reduce-scatter. + Al::Reduce_scatter(input.data(), input.size() / comm.size(), + Al::ReductionOperator::sum, comm, algo); + if (!check_vector(expected, input, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": in-place reduce-scatter does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking reduce-scatter algo on input, check with expected. + */ +template +void test_nb_reduce_scatter_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + auto recv = get_vector(input.size() / comm.size()); + // Test regular reduce-scatter. + Al::NonblockingReduce_scatter(input.data(), recv.data(), + input.size() / comm.size(), + Al::ReductionOperator::sum, comm, + req, algo); + Al::Wait(req); + if (!check_vector(expected, recv, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": regular reduce-scatter does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place reduce-scatter. + Al::NonblockingReduce_scatter(input.data(), + input.size() / comm.size(), + Al::ReductionOperator::sum, comm, + req, algo); + Al::Wait(req); + if (!check_vector(expected, input, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": in-place reduce-scatter does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_reduce_scatter_algorithms(); + auto nb_algos = get_nb_reduce_scatter_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. + std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + // Compute true value. + size_t global_size = size * comm.size(); + typename VectorType::type &&data = gen_data(global_size); + auto expected(data); + get_expected_reduce_scatter_result(expected); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + // TODO: Update when we have real algorithm sets for each op. 
+ std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_reduce_scatter_algo(expected, data, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_reduce_scatter_algo(expected, data, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. +#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Reduce-scatter not supported on MPI backend." << std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + test_correctness(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_scatter.cpp b/test/test_scatter.cpp new file mode 100644 index 00000000..4b40b635 --- /dev/null +++ b/test/test_scatter.cpp @@ -0,0 +1,167 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +// Per-rank size. +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test scatter algo on input, check with expected. + */ +template +void test_scatter_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + auto recv = get_vector(input.size() / comm.size()); + // Test regular scatter. + Al::Scatter(input.data(), recv.data(), + input.size() / comm.size(), + 0, comm, algo); + if (!check_vector(expected, recv, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": regular scatter does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place scatter. 
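+  // The in-place check assumes rank 0 keeps its own block (block 0) in
+  // place and every other rank receives its block at the front of the
+  // buffer; only the first input.size() / comm.size() elements are
+  // compared, hence the range arguments passed to check_vector().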
+ Al::Scatter(input.data(), input.size() / comm.size(), + 0, comm, algo); + if (!check_vector(expected, input, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": in-place scatter does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking scatter algo on input, check with expected. + */ +template +void test_nb_scatter_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + auto recv = get_vector(input.size() / comm.size()); + // Test regular scatter. + Al::NonblockingScatter(input.data(), recv.data(), + input.size() / comm.size(), + 0, comm, req, algo); + Al::Wait(req); + if (!check_vector(expected, recv, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": regular scatter does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place scatter. + Al::NonblockingScatter(input.data(), input.size() / comm.size(), + 0, comm, req, algo); + Al::Wait(req); + if (!check_vector(expected, input, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": in-place scatter does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_scatter_algorithms(); + auto nb_algos = get_nb_scatter_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. + std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + // Compute true value. + typename VectorType::type &&data = gen_data(size*comm.size()); + auto expected(data); + get_expected_scatter_result(expected); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + // TODO: Update when we have real algorithm sets for each op. + std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_scatter_algo(expected, data, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_scatter_algo(expected, data, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. +#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Scatter not supported on MPI backend." << std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + std::cerr << "Scatter not supported on NCCL backend." << std::endl; + std::abort(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_utils.hpp b/test/test_utils.hpp index afba369e..abbd0b56 100644 --- a/test/test_utils.hpp +++ b/test/test_utils.hpp @@ -49,6 +49,65 @@ typename VectorType::type get_vector(size_t count) { return typename VectorType::type(count); } +/** Parse input arguments. 
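+ * Expected usage: test_<collective>.exe [backend] [start size] [max size],
+ * where backend is MPI, NCCL, or MPI-CUDA (when built); giving a single
+ * size argument tests only that size (start == max).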
*/ +void parse_args(int argc, char** argv, + std::string& backend, size_t& start_size, size_t& max_size) { + if (argc == 1) { + backend = "MPI"; + return; + } else { + backend = argv[1]; + if (argc == 3) { + start_size = std::stoul(argv[2]); + max_size = start_size; + } else if (argc == 4) { + start_size = std::stoul(argv[2]); + max_size = std::stoul(argv[3]); + } else if (argc > 5) { + std::cerr << "Unexpected argument." << std::endl; + std::abort(); + } + } + if (backend != "MPI" +#ifdef AL_HAS_NCCL + && backend != "NCCL" +#endif +#ifdef AL_HAS_MPI_CUDA + && backend != "MPI-CUDA" +#endif + ) { + std::cerr << "Usage: " << argv[0] << " [MPI" +#ifdef AL_HAS_NCCL + << " | NCCL" +#endif +#ifdef AL_HAS_MPI_CUDA + << " | MPI-CUDA" +#endif + << "] [start size] [max size]" + << std::endl; + std::abort(); + } +} + +/** + * Return every size to test between start_size and max_size (inclusive). + * If odds is true, generate odd-numbered values too. + */ +std::vector get_sizes(size_t start_size, size_t max_size, + bool odds = false) { + std::vector sizes; + if (start_size == 0) { + sizes.push_back(0); + } + for (size_t size = start_size; size <= max_size; size *= 2) { + sizes.push_back(size); + if (odds && size > 1) { + sizes.push_back(size + 1); + } + } + return sizes; +} + /** Generate random data of length count. */ template typename VectorType::type gen_data(size_t count); @@ -63,6 +122,7 @@ gen_data(size_t count) { int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); rng_gen.seed(rank); + rng_seeded = true; } } std::uniform_real_distribution rng; @@ -117,7 +177,58 @@ std::vector get_allreduce_algorithms() { Backend::algo_type::automatic}; return algos; } - + +// TODO: Update these once we have real algorithm sets for each op. + +template +std::vector get_reduce_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_reduce_scatter_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_allgather_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_bcast_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_alltoall_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_gather_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_scatter_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + template <> std::vector get_allreduce_algorithms() { @@ -139,6 +250,55 @@ std::vector get_nb_allreduce_algorithms() { Backend::algo_type::automatic}; return algos; } + +template +std::vector get_nb_reduce_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_nb_reduce_scatter_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_nb_allgather_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_nb_bcast_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_nb_alltoall_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_nb_gather_algorithms() { + 
std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_nb_scatter_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} template <> std::vector @@ -157,27 +317,35 @@ get_nb_allreduce_algorithms() { #define eps (1e-4) bool check_vector(const std::vector& expected, - const std::vector& actual) { - bool match = true; + const std::vector& actual, + size_t start = 0, + size_t end = std::numeric_limits::max()) { int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); - for (size_t i = 0; i < expected.size(); ++i) { + if (end == std::numeric_limits::max()) { + end = expected.size(); + } + for (size_t i = start; i < end; ++i) { float e = expected[i]; - if (std::abs(e - actual[i]) > eps) { #ifdef AL_DEBUG std::stringstream ss; ss << "[" << rank << "] @" << i << " Expected: " << e << ", Actual: " << actual[i] << "\n"; + // Helpful for debugging to print out small vectors completely. + if (expected.size() < 128) { + ss << "[" << rank << "] expected: "; + for (const auto& v : expected) ss << v << " "; + ss << "actual: "; + for (const auto& v : actual) ss << v << " "; + ss << "\n"; + } std::cerr << ss.str(); - match = false; - return false; -#else - return false; #endif + return false; } } - return match; + return true; } void print_stats(std::vector& times) { @@ -223,3 +391,85 @@ inline typename Al::MPIBackend::req_type get_request() { return Al::MPIBackend::null_req; } + +void get_expected_allreduce_result(std::vector& expected) { + MPI_Allreduce(MPI_IN_PLACE, expected.data(), expected.size(), + MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); +} + +void get_expected_reduce_scatter_result(std::vector& expected) { + int nprocs; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Reduce_scatter_block(MPI_IN_PLACE, expected.data(), + expected.size() / nprocs, + MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); +} + +void get_expected_allgather_result(std::vector& input, + std::vector& expected) { + MPI_Allgather(input.data(), input.size(), MPI_FLOAT, + expected.data(), input.size(), MPI_FLOAT, + MPI_COMM_WORLD); +} + +void get_expected_allgather_inplace_result(std::vector& expected) { + int nprocs; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Allgather(MPI_IN_PLACE, expected.size() / nprocs, MPI_FLOAT, + expected.data(), expected.size() / nprocs, MPI_FLOAT, + MPI_COMM_WORLD); +} + +void get_expected_alltoall_result(std::vector& expected) { + int nprocs; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Alltoall(MPI_IN_PLACE, expected.size() / nprocs, MPI_FLOAT, + expected.data(), expected.size() / nprocs, MPI_FLOAT, + MPI_COMM_WORLD); +} + +void get_expected_reduce_result(std::vector& expected) { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (rank == 0) { + MPI_Reduce(MPI_IN_PLACE, expected.data(), expected.size(), + MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); + } else { + MPI_Reduce(expected.data(), expected.data(), expected.size(), + MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); + } +} + +void get_expected_bcast_result(std::vector& expected) { + MPI_Bcast(expected.data(), expected.size(), MPI_FLOAT, 0, MPI_COMM_WORLD); +} + +void get_expected_gather_result(std::vector& expected) { + int rank, nprocs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + if (rank == 0) { + MPI_Gather(MPI_IN_PLACE, expected.size() / nprocs, MPI_FLOAT, + expected.data(), expected.size() / nprocs, MPI_FLOAT, + 0, MPI_COMM_WORLD); + } else { + MPI_Gather(expected.data(), expected.size() / nprocs, MPI_FLOAT, + expected.data(), expected.size() 
/ nprocs, MPI_FLOAT, + 0, MPI_COMM_WORLD); + } +} + +void get_expected_scatter_result(std::vector& expected) { + int rank, nprocs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + if (rank == 0) { + MPI_Scatter(expected.data(), expected.size() / nprocs, MPI_FLOAT, + MPI_IN_PLACE, expected.size() / nprocs, MPI_FLOAT, + 0, MPI_COMM_WORLD); + } else { + MPI_Scatter(expected.data(), expected.size() / nprocs, MPI_FLOAT, + expected.data(), expected.size() / nprocs, MPI_FLOAT, + 0, MPI_COMM_WORLD); + } +} diff --git a/test/test_utils_cuda.hpp b/test/test_utils_cuda.hpp index af80c0e8..e48a9737 100644 --- a/test/test_utils_cuda.hpp +++ b/test/test_utils_cuda.hpp @@ -210,16 +210,67 @@ class CUDAVector { bool check_vector(const CUDAVector& expected, - const CUDAVector& actual) { + const CUDAVector& actual, + size_t start = 0, + size_t end = std::numeric_limits::max()) { std::vector &&expected_host = expected.copyout(); std::vector &&actual_host = actual.copyout(); - return check_vector(expected_host, actual_host); + return check_vector(expected_host, actual_host, start, end); } -void get_expected_result(CUDAVector& expected) { +void get_expected_allreduce_result(CUDAVector& expected) { std::vector &&host_data = expected.copyout(); - MPI_Allreduce(MPI_IN_PLACE, host_data.data(), expected.size(), - MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + get_expected_allreduce_result(host_data); + expected.copyin(host_data); +} + +void get_expected_reduce_scatter_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_reduce_scatter_result(host_data); + expected.copyin(host_data); +} + +void get_expected_allgather_result(CUDAVector& input, + CUDAVector& expected) { + std::vector &&host_input = input.copyout(); + std::vector &&host_data = expected.copyout(); + get_expected_allgather_result(host_input, host_data); + expected.copyin(host_data); +} + +void get_expected_allgather_inplace_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_allgather_inplace_result(host_data); + expected.copyin(host_data); +} + +void get_expected_alltoall_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_alltoall_result(host_data); + expected.copyin(host_data); +} + +void get_expected_reduce_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_reduce_result(host_data); + expected.copyin(host_data); +} + +void get_expected_bcast_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_bcast_result(host_data); + expected.copyin(host_data); +} + +void get_expected_gather_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_gather_result(host_data); + expected.copyin(host_data); +} + +void get_expected_scatter_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_scatter_result(host_data); expected.copyin(host_data); }
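
All of the new tests follow the same verification pattern: generate per-rank
pseudo-random input, compute a host-side reference with the corresponding MPI
call (the get_expected_*_result() helpers above), run the Aluminum collective,
and compare element-wise within eps. The standalone sketch below distills that
pattern using plain MPI only; make_data and matches are illustrative stand-ins
for gen_data() and check_vector(), not part of test_utils.hpp, and the
Aluminum call itself is deliberately omitted. Build with mpicxx and run with,
e.g., mpirun -np 4.

#include <mpi.h>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// Per-rank pseudo-random data, seeded by rank as gen_data() does.
std::vector<float> make_data(size_t count, int rank) {
  std::mt19937 gen(rank);
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  std::vector<float> v(count);
  for (auto& x : v) x = dist(gen);
  return v;
}

// Element-wise comparison with a small tolerance, as in check_vector().
bool matches(const std::vector<float>& expected,
             const std::vector<float>& actual, float eps = 1e-4f) {
  for (size_t i = 0; i < expected.size(); ++i) {
    if (std::abs(expected[i] - actual[i]) > eps) return false;
  }
  return true;
}

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  const size_t count = 1 << 20;
  std::vector<float> input = make_data(count, rank);

  // Host-side reference, computed the way get_expected_allreduce_result() does.
  std::vector<float> expected = input;
  MPI_Allreduce(MPI_IN_PLACE, expected.data(),
                static_cast<int>(expected.size()),
                MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

  // In the real tests `actual` would come from the Aluminum allreduce under a
  // chosen backend and algorithm; the MPI result stands in for it here.
  std::vector<float> actual = expected;

  if (!matches(expected, actual)) {
    std::printf("%d: allreduce mismatch\n", rank);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  MPI_Finalize();
  return 0;
}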