From 5ab2991dc51f300f26b6e916e3b3760926c30f15 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Fri, 23 Nov 2018 10:23:46 -0800 Subject: [PATCH] Add tests for all collectives. This changes test_correctness to test_allreduces for consistency. New argument passing added into test_utils. Note that the allgather tests currently fail (see #22). --- test/CMakeLists.txt | 99 ++----- test/test_allgather.cpp | 178 ++++++++++++ ...est_correctness.cpp => test_allreduce.cpp} | 34 +-- test/test_alltoall.cpp | 165 +++++++++++ test/test_bcast.cpp | 142 +++++++++ test/test_gather.cpp | 165 +++++++++++ test/test_reduce.cpp | 165 +++++++++++ test/test_reduce_scatter.cpp | 170 +++++++++++ test/test_scatter.cpp | 167 +++++++++++ test/test_utils.hpp | 270 +++++++++++++++++- test/test_utils_cuda.hpp | 61 +++- 11 files changed, 1495 insertions(+), 121 deletions(-) create mode 100644 test/test_allgather.cpp rename test/{test_correctness.cpp => test_allreduce.cpp} (87%) create mode 100644 test/test_alltoall.cpp create mode 100644 test/test_bcast.cpp create mode 100644 test/test_gather.cpp create mode 100644 test/test_reduce.cpp create mode 100644 test/test_reduce_scatter.cpp create mode 100644 test/test_scatter.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bd90805c..23b3e88d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,8 +1,3 @@ -# The name changed in CMake 3.10 -if (NOT MPIEXEC_EXECUTABLE AND MPIEXEC) - set(MPIEXEC_EXECUTABLE ${MPIEXEC}) -endif () - set_full_path(TEST_HEADERS test_utils.hpp) if (AL_HAS_CUDA) @@ -23,78 +18,30 @@ target_sources(aluminum_test_headers INTERFACE "${TEST_HEADERS}") target_include_directories( aluminum_test_headers INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}") -add_executable(TestCorrectness.exe test_correctness.cpp ${TEST_HEADERS}) -target_link_libraries(TestCorrectness.exe PRIVATE Al) -if (AL_HAS_CUDA) - target_link_libraries(TestCorrectness.exe PUBLIC cuda) -endif () - -# This is mostly a sanity check -set(TEST_ARGS MPI 8) -add_test(NAME TestCorrectness - COMMAND $ ${TEST_ARGS}) +set(TEST_SRCS + test_allreduce.cpp + test_reduce.cpp + test_reduce_scatter.cpp + test_allgather.cpp + test_alltoall.cpp + test_bcast.cpp + test_gather.cpp + test_scatter.cpp + test_multi_nballreduces.cpp + test_nccl_collectives.cpp) + +foreach(src ${TEST_SRCS}) + string(REPLACE ".cpp" ".exe" _test_exe_name "${src}") + add_executable(${_test_exe_name} ${src}) + target_link_libraries(${_test_exe_name} PRIVATE Al aluminum_test_headers) + if (AL_HAS_CUDA) + target_link_libraries(${_test_exe_name} PUBLIC cuda) + endif() +endforeach() -if (MPIEXEC_EXECUTABLE) - add_test(NAME TestCorrectness_np4 - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 4 - ${MPIEXEC_PREFLAGS} - $ - ${MPIEXEC_POSTFLAGS} ${TEST_ARGS}) -endif () - -add_executable(TestMultiNBAllReduces.exe - test_multi_nballreduces.cpp ${TEST_HEADERS}) -target_link_libraries(TestMultiNBAllReduces.exe PRIVATE Al) if (AL_HAS_CUDA) - target_link_libraries(TestMultiNBAllReduces.exe PUBLIC cuda) -endif () - -set(TEST_ARGS "8") -add_test(NAME TestMultiNBAllReduces - COMMAND $ ${TEST_ARGS}) - -if (MPIEXEC_EXECUTABLE) - add_test(NAME TestMultiNBAllReduces_np4 - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 4 - ${MPIEXEC_PREFLAGS} - $ - ${MPIEXEC_POSTFLAGS} ${TEST_ARGS}) -endif () - -add_executable(TestNCCLCollectives.exe - test_nccl_collectives.cpp ${TEST_HEADERS}) -target_link_libraries(TestNCCLCollectives.exe PRIVATE Al) -if (AL_HAS_CUDA) - target_link_libraries(TestNCCLCollectives.exe PUBLIC cuda) -endif () - 
-set(TEST_ARGS "8") -add_test(NAME TestNCCLCollectives - COMMAND $ ${TEST_ARGS}) - -if (MPIEXEC_EXECUTABLE) - add_test(NAME TestNCCLCollectives_np4 - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 4 - ${MPIEXEC_PREFLAGS} - $ - ${MPIEXEC_POSTFLAGS} ${TEST_ARGS}) -endif () - -if (AL_HAS_CUDA) - add_executable(TestStreamMemOps.exe + add_executable(test_stream_mem_ops.exe test_stream_mem_ops.cpp ${TEST_HEADERS}) - target_link_libraries(TestStreamMemOps.exe PRIVATE Al) - target_link_libraries(TestStreamMemOps.exe PUBLIC cuda) - - set(TEST_ARGS "8") - add_test(NAME TestStreamMemOps - COMMAND $ ${TEST_ARGS}) - - if (MPIEXEC_EXECUTABLE) - add_test(NAME TestStreamMemOps_np4 - COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 4 - ${MPIEXEC_PREFLAGS} - $ - ${MPIEXEC_POSTFLAGS} ${TEST_ARGS}) - endif () + target_link_libraries(test_stream_mem_ops.exe PRIVATE Al) + target_link_libraries(test_stream_mem_ops.exe PUBLIC cuda) endif () diff --git a/test/test_allgather.cpp b/test/test_allgather.cpp new file mode 100644 index 00000000..3c95f232 --- /dev/null +++ b/test/test_allgather.cpp @@ -0,0 +1,178 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +// Size is the per-rank send size. +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test allgather algo on input, check with expected. + */ +template +void test_allgather_algo(const typename VectorType::type& expected, + const typename VectorType::type& expected_inplace, + typename VectorType::type input, + typename VectorType::type input_inplace, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + auto recv = get_vector(input.size() * comm.size()); + // Test regular allgather. + Al::Allgather(input.data(), recv.data(), input.size(), comm, algo); + if (!check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular allgather does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place allgather. 
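+  // The in-place buffer holds comm.size() blocks of
+  // input_inplace.size() / comm.size() elements each; assuming the usual
+  // MPI_IN_PLACE allgather semantics, block comm.rank() already contains
+  // this rank's contribution and the collective fills in the other blocks.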
+ std::stringstream ss; + ss << comm.rank() << ": input: "; + for (const auto& v : input_inplace.copyout()) ss << v << " "; + std::cout << ss.str() << std::endl; + Al::Allgather(input_inplace.data(), input_inplace.size() / comm.size(), + comm, algo); + MPI_Barrier(MPI_COMM_WORLD); + if (!check_vector(expected_inplace, input_inplace)) { + std::cout << comm.rank() << ": in-place allgather does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking allgather algo on input, check with expected. + */ +template +void test_nb_allgather_algo(const typename VectorType::type& expected, + const typename VectorType::type& expected_inplace, + typename VectorType::type input, + typename VectorType::type input_inplace, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + auto recv = get_vector(input.size() * comm.size()); + // Test regular allgather. + Al::NonblockingAllgather(input.data(), recv.data(), + input.size(), comm, req, algo); + Al::Wait(req); + if (!check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular allgather does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place allgather. + Al::NonblockingAllgather(input_inplace.data(), + input_inplace.size() / comm.size(), + comm, req, algo); + Al::Wait(req); + if (!check_vector(expected_inplace, input_inplace)) { + std::cout << comm.rank() << ": in-place allgather does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_allgather_algorithms(); + auto nb_algos = get_nb_allgather_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. + std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + // Compute true value. + size_t global_size = size * comm.size(); + typename VectorType::type &&data = gen_data(size); + auto expected = get_vector(global_size); + get_expected_allgather_result(data, expected); + typename VectorType::type &&data_inplace = gen_data(global_size); + auto expected_inplace(data_inplace); + get_expected_allgather_inplace_result(expected_inplace); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_allgather_algo(expected, expected_inplace, + data, data_inplace, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_allgather_algo(expected, expected_inplace, + data, data_inplace, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. +#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Allgather not supported on MPI backend." 
<< std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + test_correctness(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_correctness.cpp b/test/test_allreduce.cpp similarity index 87% rename from test/test_correctness.cpp rename to test/test_allreduce.cpp index eece6f97..f45fa165 100644 --- a/test/test_correctness.cpp +++ b/test/test_allreduce.cpp @@ -39,13 +39,9 @@ #include #include +size_t start_size = 1; size_t max_size = 1<<30; -void get_expected_result(std::vector& expected) { - MPI_Allreduce(MPI_IN_PLACE, expected.data(), expected.size(), - MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); -} - /** * Test allreduce algo on input, check with expected. */ @@ -113,14 +109,7 @@ void test_correctness() { auto nb_algos = get_nb_allreduce_algorithms(); typename Backend::comm_type comm; // Use COMM_WORLD. // Compute sizes to test. - std::vector sizes = {0}; - for (size_t size = 1; size <= max_size; size *= 2) { - sizes.push_back(size); - // Avoid duplicating 2. - if (size > 1) { - sizes.push_back(size + 1); - } - } + std::vector sizes = get_sizes(start_size, max_size, true); for (const auto& size : sizes) { if (comm.rank() == 0) { std::cout << "Testing size " << human_readable_size(size) << std::endl; @@ -128,7 +117,7 @@ void test_correctness() { // Compute true value. typename VectorType::type &&data = gen_data(size); auto expected(data); - get_expected_result(expected); + get_expected_allreduce_result(expected); // Test algorithms. for (auto&& algo : algos) { MPI_Barrier(MPI_COMM_WORLD); @@ -155,12 +144,7 @@ int main(int argc, char** argv) { Al::Initialize(argc, argv); std::string backend = "MPI"; - if (argc >= 2) { - backend = argv[1]; - } - if (argc == 3) { - max_size = std::stoul(argv[2]); - } + parse_args(argc, argv, backend, start_size, max_size); if (backend == "MPI") { test_correctness(); @@ -172,16 +156,6 @@ int main(int argc, char** argv) { } else if (backend == "MPI-CUDA") { test_correctness(); #endif - } else { - std::cerr << "usage: " << argv[0] << " [MPI"; -#ifdef AL_HAS_NCCL - std::cerr << " | NCCL"; -#endif -#ifdef AL_HAS_MPI_CUDA - std::cerr << " | MPI-CUDA"; -#endif - std::cerr << "]" << std::endl; - return -1; } Al::Finalize(); diff --git a/test/test_alltoall.cpp b/test/test_alltoall.cpp new file mode 100644 index 00000000..a5e577f6 --- /dev/null +++ b/test/test_alltoall.cpp @@ -0,0 +1,165 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +// Per-process send/recv size. +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test alltoall algo on input, check with expected. + */ +template +void test_alltoall_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + auto recv = get_vector(input.size()); + // Test regular alltoall. + Al::Alltoall(input.data(), recv.data(), input.size() / comm.size(), + comm, algo); + if (!check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular alltoall does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place alltoall. + Al::Alltoall(input.data(), input.size() / comm.size(), comm, algo); + if (!check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place alltoall does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking alltoall algo on input, check with expected. + */ +template +void test_nb_alltoall_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + auto recv = get_vector(input.size()); + // Test regular alltoall. + Al::NonblockingAlltoall(input.data(), recv.data(), + input.size() / comm.size(), + comm, req, algo); + Al::Wait(req); + if (!check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular alltoall does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place alltoall. + Al::NonblockingAlltoall(input.data(), input.size() / comm.size(), + comm, req, algo); + Al::Wait(req); + if (!check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place alltoall does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_alltoall_algorithms(); + auto nb_algos = get_nb_alltoall_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. + std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + size_t full_size = size * comm.size(); + // Compute true value. + typename VectorType::type &&data = gen_data(full_size); + auto expected(data); + get_expected_alltoall_result(expected); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_alltoall_algo(expected, data, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_alltoall_algo(expected, data, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. 
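+  // set_device() comes from the CUDA test utilities; a common scheme
+  // (illustrative only, not necessarily what it does here) is
+  //   cudaSetDevice(local_rank % device_count);
+  // so each rank owns a distinct GPU before Al::Initialize is called.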
+#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Alltoall not supported on MPI backend." << std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + std::cerr << "Alltoall not supported on NCCL backend." << std::endl; + std::abort(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_bcast.cpp b/test/test_bcast.cpp new file mode 100644 index 00000000..28bc1a83 --- /dev/null +++ b/test/test_bcast.cpp @@ -0,0 +1,142 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test bcast algo on input, check with expected. + */ +template +void test_bcast_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + // Test in-place bcast (bcast is always in-place). + Al::Bcast(input.data(), input.size(), + 0, comm, algo); + if (!check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place bcast does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking bcast algo on input, check with expected. + */ +template +void test_nb_bcast_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + // Test in-place bcast (bcast is always in-place). + Al::NonblockingBcast(input.data(), input.size(), + 0, comm, req, algo); + Al::Wait(req); + if (!check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place bcast does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_bcast_algorithms(); + auto nb_algos = get_nb_bcast_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. 
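+  // get_sizes() doubles from start_size to max_size and, with odds=true,
+  // also inserts size+1 to cover non-power-of-two counts; e.g. start=1,
+  // max=16 yields {1, 2, 3, 4, 5, 8, 9, 16, 17} (the odd companion of the
+  // last power of two may exceed max_size).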
+ std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + // Compute true value. + typename VectorType::type &&data = gen_data(size); + auto expected(data); + get_expected_bcast_result(expected); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + // TODO: Update when we have real algorithm sets for each op. + std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_bcast_algo(expected, data, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_bcast_algo(expected, data, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. +#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Bcast not supported on MPI backend." << std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + test_correctness(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_gather.cpp b/test/test_gather.cpp new file mode 100644 index 00000000..9aeccd29 --- /dev/null +++ b/test/test_gather.cpp @@ -0,0 +1,165 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +// Per-rank data size. +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test gather algo on input, check with expected. + */ +template +void test_gather_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + auto recv = get_vector(input.size()); + // Test regular gather. 
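+  // Gather is rooted at rank 0: each rank sends input.size() / comm.size()
+  // elements and only the root's recv buffer is defined afterwards, which
+  // is why the checks below are guarded by comm.rank() == 0.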
+ Al::Gather(input.data(), recv.data(), input.size() / comm.size(), + 0, comm, algo); + if (comm.rank() == 0 && !check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular gather does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place gather. + Al::Gather(input.data(), input.size() / comm.size(), 0, comm, algo); + if (comm.rank() == 0 && !check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place gather does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking gather algo on input, check with expected. + */ +template +void test_nb_gather_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + auto recv = get_vector(input.size()); + // Test regular gather. + Al::NonblockingGather(input.data(), recv.data(), + input.size() / comm.size(), + 0, comm, req, algo); + Al::Wait(req); + if (comm.rank() == 0 && !check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular gather does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place gather. + Al::NonblockingGather(input.data(), input.size() / comm.size(), + 0, comm, req, algo); + Al::Wait(req); + if (comm.rank() == 0 && !check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place gather does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_gather_algorithms(); + auto nb_algos = get_nb_gather_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. + std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + // Compute true value. + typename VectorType::type &&data = gen_data(size*comm.size()); + auto expected(data); + get_expected_gather_result(expected); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + // TODO: Update when we have real algorithm sets for each op. + std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_gather_algo(expected, data, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_gather_algo(expected, data, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. +#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Gather not supported on MPI backend." << std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + std::cerr << "Gather not supported on NCCL backend." 
<< std::endl; + std::abort(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_reduce.cpp b/test/test_reduce.cpp new file mode 100644 index 00000000..881286f9 --- /dev/null +++ b/test/test_reduce.cpp @@ -0,0 +1,165 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test reduce algo on input, check with expected. + */ +template +void test_reduce_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + auto recv = get_vector(input.size()); + // Test regular reduce. + Al::Reduce(input.data(), recv.data(), input.size(), + Al::ReductionOperator::sum, 0, comm, algo); + if (comm.rank() == 0 && !check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular reduce does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place reduce. + Al::Reduce(input.data(), input.size(), + Al::ReductionOperator::sum, 0, comm, algo); + if (comm.rank() == 0 && !check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place reduce does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking reduce algo on input, check with expected. + */ +template +void test_nb_reduce_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + auto recv = get_vector(input.size()); + // Test regular reduce. + Al::NonblockingReduce(input.data(), recv.data(), input.size(), + Al::ReductionOperator::sum, 0, comm, + req, algo); + Al::Wait(req); + if (comm.rank() == 0 && !check_vector(expected, recv)) { + std::cout << comm.rank() << ": regular reduce does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place reduce. 
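+  // In-place reduce overwrites the root's buffer with the sum; only rank 0
+  // verifies the result, matching get_expected_reduce_result(), which uses
+  // MPI_IN_PLACE on the root only.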
+ Al::NonblockingReduce(input.data(), input.size(), + Al::ReductionOperator::sum, 0, comm, + req, algo); + Al::Wait(req); + if (comm.rank() == 0 && !check_vector(expected, input)) { + std::cout << comm.rank() << ": in-place reduce does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_reduce_algorithms(); + auto nb_algos = get_nb_reduce_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. + std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + // Compute true value. + typename VectorType::type &&data = gen_data(size); + auto expected(data); + get_expected_reduce_result(expected); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + // TODO: Update when we have real algorithm sets for each op. + std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_reduce_algo(expected, data, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_reduce_algo(expected, data, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. +#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Reduce not supported on MPI backend." << std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + test_correctness(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_reduce_scatter.cpp b/test/test_reduce_scatter.cpp new file mode 100644 index 00000000..6a845e5f --- /dev/null +++ b/test/test_reduce_scatter.cpp @@ -0,0 +1,170 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +// Size is the per-rank recv size. +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test reduce-scatter algo on input, check with expected. + */ +template +void test_reduce_scatter_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + auto recv = get_vector(input.size() / comm.size()); + // Test regular reduce-scatter. + Al::Reduce_scatter(input.data(), recv.data(), + input.size() / comm.size(), + Al::ReductionOperator::sum, comm, algo); + if (!check_vector(expected, recv, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": regular reduce-scatter does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place reduce-scatter. + Al::Reduce_scatter(input.data(), input.size() / comm.size(), + Al::ReductionOperator::sum, comm, algo); + if (!check_vector(expected, input, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": in-place reduce-scatter does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking reduce-scatter algo on input, check with expected. + */ +template +void test_nb_reduce_scatter_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + auto recv = get_vector(input.size() / comm.size()); + // Test regular reduce-scatter. + Al::NonblockingReduce_scatter(input.data(), recv.data(), + input.size() / comm.size(), + Al::ReductionOperator::sum, comm, + req, algo); + Al::Wait(req); + if (!check_vector(expected, recv, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": regular reduce-scatter does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place reduce-scatter. + Al::NonblockingReduce_scatter(input.data(), + input.size() / comm.size(), + Al::ReductionOperator::sum, comm, + req, algo); + Al::Wait(req); + if (!check_vector(expected, input, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": in-place reduce-scatter does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_reduce_scatter_algorithms(); + auto nb_algos = get_nb_reduce_scatter_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. + std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + // Compute true value. + size_t global_size = size * comm.size(); + typename VectorType::type &&data = gen_data(global_size); + auto expected(data); + get_expected_reduce_scatter_result(expected); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + // TODO: Update when we have real algorithm sets for each op. 
+ std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_reduce_scatter_algo(expected, data, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_reduce_scatter_algo(expected, data, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. +#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Reduce-scatter not supported on MPI backend." << std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + test_correctness(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_scatter.cpp b/test/test_scatter.cpp new file mode 100644 index 00000000..4b40b635 --- /dev/null +++ b/test/test_scatter.cpp @@ -0,0 +1,167 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. Produced at the +// Lawrence Livermore National Laboratory in collaboration with University of +// Illinois Urbana-Champaign. +// +// Written by the LBANN Research Team (N. Dryden, N. Maruyama, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-756777. +// All rights reserved. +// +// This file is part of Aluminum GPU-aware Communication Library. For details, see +// http://software.llnl.gov/Aluminum or https://github.com/LLNL/Aluminum. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "Al.hpp" +#include "test_utils.hpp" +#ifdef AL_HAS_NCCL +#include "test_utils_nccl_cuda.hpp" +#endif +#ifdef AL_HAS_MPI_CUDA +#include "test_utils_mpi_cuda.hpp" +#endif + +#include +#include +#include + +// Per-rank size. +size_t start_size = 1; +size_t max_size = 1<<30; + +/** + * Test scatter algo on input, check with expected. + */ +template +void test_scatter_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + auto recv = get_vector(input.size() / comm.size()); + // Test regular scatter. + Al::Scatter(input.data(), recv.data(), + input.size() / comm.size(), + 0, comm, algo); + if (!check_vector(expected, recv, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": regular scatter does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place scatter. 
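+  // The in-place check assumes rank 0 keeps its own block (block 0) in
+  // place and every other rank receives its block at the front of the
+  // buffer; only the first input.size() / comm.size() elements are
+  // compared, hence the range arguments passed to check_vector().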
+ Al::Scatter(input.data(), input.size() / comm.size(), + 0, comm, algo); + if (!check_vector(expected, input, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": in-place scatter does not match" << + std::endl; + std::abort(); + } +} + +/** + * Test non-blocking scatter algo on input, check with expected. + */ +template +void test_nb_scatter_algo(const typename VectorType::type& expected, + typename VectorType::type input, + typename Backend::comm_type& comm, + typename Backend::algo_type algo) { + typename Backend::req_type req = get_request(); + auto recv = get_vector(input.size() / comm.size()); + // Test regular scatter. + Al::NonblockingScatter(input.data(), recv.data(), + input.size() / comm.size(), + 0, comm, req, algo); + Al::Wait(req); + if (!check_vector(expected, recv, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": regular scatter does not match" << + std::endl; + std::abort(); + } + MPI_Barrier(MPI_COMM_WORLD); + // Test in-place scatter. + Al::NonblockingScatter(input.data(), input.size() / comm.size(), + 0, comm, req, algo); + Al::Wait(req); + if (!check_vector(expected, input, 0, input.size() / comm.size())) { + std::cout << comm.rank() << ": in-place scatter does not match" << + std::endl; + std::abort(); + } +} + +template +void test_correctness() { + auto algos = get_scatter_algorithms(); + auto nb_algos = get_nb_scatter_algorithms(); + typename Backend::comm_type comm; // Use COMM_WORLD. + // Compute sizes to test. + std::vector sizes = get_sizes(start_size, max_size, true); + for (const auto& size : sizes) { + if (comm.rank() == 0) { + std::cout << "Testing size " << human_readable_size(size) << std::endl; + } + // Compute true value. + typename VectorType::type &&data = gen_data(size*comm.size()); + auto expected(data); + get_expected_scatter_result(expected); + // Test algorithms. + for (auto&& algo : algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + // TODO: Update when we have real algorithm sets for each op. + std::cout << " Algo: " << Al::allreduce_name(algo) << std::endl; + } + test_scatter_algo(expected, data, comm, algo); + } + for (auto&& algo : nb_algos) { + MPI_Barrier(MPI_COMM_WORLD); + if (comm.rank() == 0) { + std::cout << " Algo: NB " << Al::allreduce_name(algo) << std::endl; + } + test_nb_scatter_algo(expected, data, comm, algo); + } + } +} + +int main(int argc, char** argv) { + // Need to set the CUDA device before initializing Aluminum. +#ifdef AL_HAS_CUDA + set_device(); +#endif + Al::Initialize(argc, argv); + + std::string backend = "MPI"; + parse_args(argc, argv, backend, start_size, max_size); + + if (backend == "MPI") { + std::cerr << "Scatter not supported on MPI backend." << std::endl; + std::abort(); +#ifdef AL_HAS_NCCL + } else if (backend == "NCCL") { + std::cerr << "Scatter not supported on NCCL backend." << std::endl; + std::abort(); +#endif +#ifdef AL_HAS_MPI_CUDA + } else if (backend == "MPI-CUDA") { + test_correctness(); +#endif + } + + Al::Finalize(); + return 0; +} diff --git a/test/test_utils.hpp b/test/test_utils.hpp index afba369e..abbd0b56 100644 --- a/test/test_utils.hpp +++ b/test/test_utils.hpp @@ -49,6 +49,65 @@ typename VectorType::type get_vector(size_t count) { return typename VectorType::type(count); } +/** Parse input arguments. 
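+ * Expected usage: test_<collective>.exe [backend] [start size] [max size],
+ * where backend is MPI, NCCL, or MPI-CUDA (when built); giving a single
+ * size argument tests only that size (start == max).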
*/ +void parse_args(int argc, char** argv, + std::string& backend, size_t& start_size, size_t& max_size) { + if (argc == 1) { + backend = "MPI"; + return; + } else { + backend = argv[1]; + if (argc == 3) { + start_size = std::stoul(argv[2]); + max_size = start_size; + } else if (argc == 4) { + start_size = std::stoul(argv[2]); + max_size = std::stoul(argv[3]); + } else if (argc > 5) { + std::cerr << "Unexpected argument." << std::endl; + std::abort(); + } + } + if (backend != "MPI" +#ifdef AL_HAS_NCCL + && backend != "NCCL" +#endif +#ifdef AL_HAS_MPI_CUDA + && backend != "MPI-CUDA" +#endif + ) { + std::cerr << "Usage: " << argv[0] << " [MPI" +#ifdef AL_HAS_NCCL + << " | NCCL" +#endif +#ifdef AL_HAS_MPI_CUDA + << " | MPI-CUDA" +#endif + << "] [start size] [max size]" + << std::endl; + std::abort(); + } +} + +/** + * Return every size to test between start_size and max_size (inclusive). + * If odds is true, generate odd-numbered values too. + */ +std::vector get_sizes(size_t start_size, size_t max_size, + bool odds = false) { + std::vector sizes; + if (start_size == 0) { + sizes.push_back(0); + } + for (size_t size = start_size; size <= max_size; size *= 2) { + sizes.push_back(size); + if (odds && size > 1) { + sizes.push_back(size + 1); + } + } + return sizes; +} + /** Generate random data of length count. */ template typename VectorType::type gen_data(size_t count); @@ -63,6 +122,7 @@ gen_data(size_t count) { int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); rng_gen.seed(rank); + rng_seeded = true; } } std::uniform_real_distribution rng; @@ -117,7 +177,58 @@ std::vector get_allreduce_algorithms() { Backend::algo_type::automatic}; return algos; } - + +// TODO: Update these once we have real algorithm sets for each op. + +template +std::vector get_reduce_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_reduce_scatter_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_allgather_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_bcast_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_alltoall_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_gather_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_scatter_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + template <> std::vector get_allreduce_algorithms() { @@ -139,6 +250,55 @@ std::vector get_nb_allreduce_algorithms() { Backend::algo_type::automatic}; return algos; } + +template +std::vector get_nb_reduce_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_nb_reduce_scatter_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_nb_allgather_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_nb_bcast_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_nb_alltoall_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_nb_gather_algorithms() { + 
std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} + +template +std::vector get_nb_scatter_algorithms() { + std::vector algos = { + Backend::algo_type::automatic}; + return algos; +} template <> std::vector @@ -157,27 +317,35 @@ get_nb_allreduce_algorithms() { #define eps (1e-4) bool check_vector(const std::vector& expected, - const std::vector& actual) { - bool match = true; + const std::vector& actual, + size_t start = 0, + size_t end = std::numeric_limits::max()) { int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); - for (size_t i = 0; i < expected.size(); ++i) { + if (end == std::numeric_limits::max()) { + end = expected.size(); + } + for (size_t i = start; i < end; ++i) { float e = expected[i]; - if (std::abs(e - actual[i]) > eps) { #ifdef AL_DEBUG std::stringstream ss; ss << "[" << rank << "] @" << i << " Expected: " << e << ", Actual: " << actual[i] << "\n"; + // Helpful for debugging to print out small vectors completely. + if (expected.size() < 128) { + ss << "[" << rank << "] expected: "; + for (const auto& v : expected) ss << v << " "; + ss << "actual: "; + for (const auto& v : actual) ss << v << " "; + ss << "\n"; + } std::cerr << ss.str(); - match = false; - return false; -#else - return false; #endif + return false; } } - return match; + return true; } void print_stats(std::vector& times) { @@ -223,3 +391,85 @@ inline typename Al::MPIBackend::req_type get_request() { return Al::MPIBackend::null_req; } + +void get_expected_allreduce_result(std::vector& expected) { + MPI_Allreduce(MPI_IN_PLACE, expected.data(), expected.size(), + MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); +} + +void get_expected_reduce_scatter_result(std::vector& expected) { + int nprocs; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Reduce_scatter_block(MPI_IN_PLACE, expected.data(), + expected.size() / nprocs, + MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); +} + +void get_expected_allgather_result(std::vector& input, + std::vector& expected) { + MPI_Allgather(input.data(), input.size(), MPI_FLOAT, + expected.data(), input.size(), MPI_FLOAT, + MPI_COMM_WORLD); +} + +void get_expected_allgather_inplace_result(std::vector& expected) { + int nprocs; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Allgather(MPI_IN_PLACE, expected.size() / nprocs, MPI_FLOAT, + expected.data(), expected.size() / nprocs, MPI_FLOAT, + MPI_COMM_WORLD); +} + +void get_expected_alltoall_result(std::vector& expected) { + int nprocs; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Alltoall(MPI_IN_PLACE, expected.size() / nprocs, MPI_FLOAT, + expected.data(), expected.size() / nprocs, MPI_FLOAT, + MPI_COMM_WORLD); +} + +void get_expected_reduce_result(std::vector& expected) { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (rank == 0) { + MPI_Reduce(MPI_IN_PLACE, expected.data(), expected.size(), + MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); + } else { + MPI_Reduce(expected.data(), expected.data(), expected.size(), + MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); + } +} + +void get_expected_bcast_result(std::vector& expected) { + MPI_Bcast(expected.data(), expected.size(), MPI_FLOAT, 0, MPI_COMM_WORLD); +} + +void get_expected_gather_result(std::vector& expected) { + int rank, nprocs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + if (rank == 0) { + MPI_Gather(MPI_IN_PLACE, expected.size() / nprocs, MPI_FLOAT, + expected.data(), expected.size() / nprocs, MPI_FLOAT, + 0, MPI_COMM_WORLD); + } else { + MPI_Gather(expected.data(), expected.size() / nprocs, MPI_FLOAT, + expected.data(), expected.size() 
/ nprocs, MPI_FLOAT, + 0, MPI_COMM_WORLD); + } +} + +void get_expected_scatter_result(std::vector& expected) { + int rank, nprocs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + if (rank == 0) { + MPI_Scatter(expected.data(), expected.size() / nprocs, MPI_FLOAT, + MPI_IN_PLACE, expected.size() / nprocs, MPI_FLOAT, + 0, MPI_COMM_WORLD); + } else { + MPI_Scatter(expected.data(), expected.size() / nprocs, MPI_FLOAT, + expected.data(), expected.size() / nprocs, MPI_FLOAT, + 0, MPI_COMM_WORLD); + } +} diff --git a/test/test_utils_cuda.hpp b/test/test_utils_cuda.hpp index af80c0e8..e48a9737 100644 --- a/test/test_utils_cuda.hpp +++ b/test/test_utils_cuda.hpp @@ -210,16 +210,67 @@ class CUDAVector { bool check_vector(const CUDAVector& expected, - const CUDAVector& actual) { + const CUDAVector& actual, + size_t start = 0, + size_t end = std::numeric_limits::max()) { std::vector &&expected_host = expected.copyout(); std::vector &&actual_host = actual.copyout(); - return check_vector(expected_host, actual_host); + return check_vector(expected_host, actual_host, start, end); } -void get_expected_result(CUDAVector& expected) { +void get_expected_allreduce_result(CUDAVector& expected) { std::vector &&host_data = expected.copyout(); - MPI_Allreduce(MPI_IN_PLACE, host_data.data(), expected.size(), - MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + get_expected_allreduce_result(host_data); + expected.copyin(host_data); +} + +void get_expected_reduce_scatter_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_reduce_scatter_result(host_data); + expected.copyin(host_data); +} + +void get_expected_allgather_result(CUDAVector& input, + CUDAVector& expected) { + std::vector &&host_input = input.copyout(); + std::vector &&host_data = expected.copyout(); + get_expected_allgather_result(host_input, host_data); + expected.copyin(host_data); +} + +void get_expected_allgather_inplace_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_allgather_inplace_result(host_data); + expected.copyin(host_data); +} + +void get_expected_alltoall_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_alltoall_result(host_data); + expected.copyin(host_data); +} + +void get_expected_reduce_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_reduce_result(host_data); + expected.copyin(host_data); +} + +void get_expected_bcast_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_bcast_result(host_data); + expected.copyin(host_data); +} + +void get_expected_gather_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_gather_result(host_data); + expected.copyin(host_data); +} + +void get_expected_scatter_result(CUDAVector& expected) { + std::vector &&host_data = expected.copyout(); + get_expected_scatter_result(host_data); expected.copyin(host_data); }
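
All of the new tests follow the same verification pattern: generate per-rank
pseudo-random input, compute a host-side reference with the corresponding MPI
call (the get_expected_*_result() helpers above), run the Aluminum collective,
and compare element-wise within eps. The standalone sketch below distills that
pattern using plain MPI only; make_data and matches are illustrative stand-ins
for gen_data() and check_vector(), not part of test_utils.hpp, and the
Aluminum call itself is deliberately omitted. Build with mpicxx and run with,
e.g., mpirun -np 4.

#include <mpi.h>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// Per-rank pseudo-random data, seeded by rank as gen_data() does.
std::vector<float> make_data(size_t count, int rank) {
  std::mt19937 gen(rank);
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  std::vector<float> v(count);
  for (auto& x : v) x = dist(gen);
  return v;
}

// Element-wise comparison with a small tolerance, as in check_vector().
bool matches(const std::vector<float>& expected,
             const std::vector<float>& actual, float eps = 1e-4f) {
  for (size_t i = 0; i < expected.size(); ++i) {
    if (std::abs(expected[i] - actual[i]) > eps) return false;
  }
  return true;
}

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  const size_t count = 1 << 20;
  std::vector<float> input = make_data(count, rank);

  // Host-side reference, computed the way get_expected_allreduce_result() does.
  std::vector<float> expected = input;
  MPI_Allreduce(MPI_IN_PLACE, expected.data(),
                static_cast<int>(expected.size()),
                MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

  // In the real tests `actual` would come from the Aluminum allreduce under a
  // chosen backend and algorithm; the MPI result stands in for it here.
  std::vector<float> actual = expected;

  if (!matches(expected, actual)) {
    std::printf("%d: allreduce mismatch\n", rank);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  MPI_Finalize();
  return 0;
}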