diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 2f81a79..1b052ff 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -21,12 +21,12 @@ jobs: # Steps represent a sequence of tasks that will be executed as part of the job steps: - # Fetch CUDA toolkit using Jimver/cuda-toolkit@v0.2.5 + # Fetch CUDA toolkit using Jimver/cuda-toolkit@v0.2.7 - name: Fetch CUDA toolkit - uses: Jimver/cuda-toolkit@v0.2.5 + uses: Jimver/cuda-toolkit@v0.2.7 id: cuda-toolkit with: - cuda: '11.5.1' + cuda: '11.7.0' linux-local-args: '["--toolkit"]' # Runs a single command using the runners shell diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index dc4f060..068acc4 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -21,12 +21,12 @@ jobs: # Steps represent a sequence of tasks that will be executed as part of the job steps: - # Fetch CUDA toolkit using Jimver/cuda-toolkit@v0.2.5 + # Fetch CUDA toolkit using Jimver/cuda-toolkit@v0.2.7 - name: Fetch CUDA toolkit - uses: Jimver/cuda-toolkit@v0.2.5 + uses: Jimver/cuda-toolkit@v0.2.7 id: cuda-toolkit with: - cuda: '11.5.1' + cuda: '11.7.0' linux-local-args: '["--toolkit"]' # Runs a single command using the runners shell diff --git a/CMakeLists.txt b/CMakeLists.txt index cf5b37a..65a93e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,8 +111,8 @@ set(LOOPS_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) target_include_directories(loops INTERFACE ${LOOPS_INCLUDE_DIR} INTERFACE ${CXXOPTS_INCLUDE_DIR} - INTERFACE ${THRUST_INCLUDE_DIR} INTERFACE ${CUB_INCLUDE_DIR} + INTERFACE ${THRUST_INCLUDE_DIR} INTERFACE ${MODERNGPU_INCLUDE_DIR} # INTERFACE ${RAPIDJSON_INCLUDE_DIR} INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} diff --git a/README.md b/README.md index 95b8f9f..98c0b34 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,125 @@ -# 🐧 `loops` GPU load-balancing library for irregular (and kinda- regular) computations -We propose to build an open-source GPU load-balancing framework for applications that exhibit irregular parallelism. The set of applications and algorithms we consider are fundamental to computing tasks ranging from sparse machine learning, large numerical simulations, and on through to graph analytics. The underlying data and data structures that drive these applications exhibit access patterns that naturally don't map well to the GPU's architecture that is designed with dense and regular access patterns in mind. Prior to the work we present and propose here, the only way to unleash the GPU's full power on these problems has been to workload balance through tightly coupled load-balancing techniques. Our proposed load-balancing abstraction decouples load-balancing from work processing and aims to support both static and dynamic schedules with a programmable interface to implement new load-balancing schedules in the future. With our open-source framework, we hope to not only improve programmers' productivity when developing irregular-parallel algorithms on the GPU but also improve the overall performance characteristics for such applications by allowing a quick path to experimentation with a variety of existing load-balancing techniques. Consequently, we also hope that by separating the concerns of load-balancing from work processing within our abstraction, managing and extending existing code to future architectures becomes easier. 
+# 🐧 `loops`: Expressing Parallel Irregular Computations +We propose an open-source GPU load-balancing framework for applications that exhibit irregular parallelism. The applications and algorithms we consider are fundamental to computing tasks ranging from sparse machine learning and large numerical simulations to graph analytics. The underlying data and data structures that drive these applications present access patterns that naturally don't map well to the GPU's architecture, which is designed with dense and regular access patterns in mind. Prior to the work we present and propose here, the only way to unleash the GPU's full power on these problems has been to balance the workload through tightly coupled load-balancing techniques. Our proposed load-balancing abstraction decouples load balancing from work processing and aims to support both static and dynamic schedules with a programmable interface to implement new load-balancing schedules in the future. With our open-source framework, we hope to not only improve programmers' productivity when developing irregular-parallel algorithms on the GPU but also improve the overall performance characteristics of such applications by allowing a quick path to experimentation with a variety of existing load-balancing techniques. Consequently, we also hope that by separating the concerns of load balancing from work processing within our abstraction, managing and extending existing code to future architectures becomes easier. | System | Version | CUDA | Status | |---------|------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|----------------------------------------------------------------------------------------------------------------------------------------------------------| -| Ubuntu | [Ubuntu 20.04](https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#supported-runners-and-hardware-resources) | 11.5.1 | [![Ubuntu](https://github.com/neoblizz/loops/actions/workflows/ubuntu.yml/badge.svg)](https://github.com/neoblizz/loops/actions/workflows/ubuntu.yml) | -| Windows | [Windows Server 2019](https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#supported-runners-and-hardware-resources) | 11.5.1 | [![Windows](https://github.com/neoblizz/loops/actions/workflows/windows.yml/badge.svg)](https://github.com/neoblizz/loops/actions/workflows/windows.yml) | +| Ubuntu | [Ubuntu 20.04](https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#supported-runners-and-hardware-resources) | 11.7.0 | [![Ubuntu](https://github.com/neoblizz/loops/actions/workflows/ubuntu.yml/badge.svg)](https://github.com/neoblizz/loops/actions/workflows/ubuntu.yml) | +| Windows | [Windows Server 2019](https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#supported-runners-and-hardware-resources) | 11.7.0 | [![Windows](https://github.com/neoblizz/loops/actions/workflows/windows.yml/badge.svg)](https://github.com/neoblizz/loops/actions/workflows/windows.yml) | ## :musical_note: A little background. 
-**DARPA** announced [**Software Defined Hardware (SDH)**](https://www.darpa.mil/program/software-defined-hardware), a program that aims "*to build runtime-reconfigurable hardware and software that enables near ASIC performance without sacrificing programmability for data-intensive algorithms.*" **NVIDIA** leading the charge on the program, internally called, [**Symphony**](https://blogs.nvidia.com/blog/2018/07/24/darpa-research-post-moores-law/). My Ph.D. work is a small but important piece of this larger puzzle. The "data-intensive algorithms" part of the program includes domains like Machine Learning, Graph Processing, Sparse-Matrix-Vector algorithms, etc. where there is large amount of data available to be processed. And the problems being addressed are either already based on irregular datastructures and workloads, or are trending towards it (such as sparse machine learning problems). For these irregular workload-computations to be successful, we require efficient load-balancing schemes targetting the specialized hardwares such as the GPUs or Symphony. +**DARPA** announced [**Software Defined Hardware (SDH)**](https://www.darpa.mil/program/software-defined-hardware), a program that aims "*to build runtime-reconfigurable hardware and software that enables near ASIC performance without sacrificing programmability for data-intensive algorithms.*" **NVIDIA** is leading the charge on the program, internally called [**Symphony**](https://blogs.nvidia.com/blog/2018/07/24/darpa-research-post-moores-law/). Our work is a small but important piece of this larger puzzle. The "data-intensive algorithms" part of the program includes domains like Machine Learning, Graph Processing, and Sparse-Matrix-Vector algorithms, where there is a large amount of data available to be processed. The problems being addressed are either already based on irregular data structures and workloads, or are trending towards them (such as sparse machine learning problems). For these irregular workload computations to be successful, we require efficient load-balancing schemes targeting specialized hardware such as GPUs or Symphony. - [DARPA Selects Teams to Unleash Power of Specialized, Reconfigurable Computing Hardware](https://www.darpa.mil/news-events/2018-07-24a) ## 🧩 A small (and important) piece of a larger puzzle. -The predominant approach today to addressing irregularity is to build application-dependent solutions. These are not portable between applications. This is a shame because I believe the underlying techniques that are currently used to address irregularity have the potential to be expressed in a generic, portable, powerful way. I intend to build a generic open-source library for load balancing that will expose high-performance, intuitive load-balancing strategies to any irregular-parallel application. +The predominant approach today to addressing irregularity is to build application-dependent solutions. These are not portable between applications. This is a shame, because we believe the underlying techniques that are currently used to address irregularity have the potential to be expressed in a generic, portable, and powerful way. We are building a generic open-source library for load balancing that exposes high-performance, intuitive load-balancing strategies to any irregular-parallel application. ## ⚖️ Load-balancing problem, and a silver lining. -Today's GPUs follow a Single Insruction Multiple Data (SIMD) model, where different work-components (for example a node in a graph) are mapped to a single thread. 
Each thread runs a copy of program and threads run in parallel (this is a simple explanation, there are other work units in NVIDIA's GPUs such as warps, cooperative thread arrays, streaming multiprocessors etc.). +Today's GPUs follow a Single Instruction Multiple Data (SIMD) model, where different work components (for example, a node in a graph) are mapped to a single thread. Each thread runs a copy of the program, and threads run in parallel (this is a simplified explanation; there are other units of work in NVIDIA's GPUs, such as warps, cooperative thread arrays, and streaming multiprocessors). Let's take a graph problem as an example to understand load imbalance. One key operation in graph problems is traversal: given a set of vertices, a traversal operation visits all the neighboring vertices of the input. If we naïvely map each input vertex to a GPU thread, it can result in a massive imbalance of work, as some threads within the GPU will get a lot more work than others, causing inefficient utilization of hardware resources. In our example, this could happen for a social-network graph where one input vertex may have millions of connections while other input vertices in the same traversal pass may only have tens of neighbors. -> Take graph as an example to discuss the load-balancing problem. Let's say we would like to process neighbors of every single node in a graph, we naïvely map each node to each thread in the GPU. This can result in an extremely inefficient mapping as there could be one node with millions of neighbors while others with only a few neighbors, so the threads that were mapped to nodes with millions of neighbors now end up doing a lot of work while all the other threads idle. The silver lining here is that there are a lot of more intelligent workload mappings that address this problem for different types of graphs or other irregular workloads. +The silver lining here is that there are more intelligent workload mappings that address this load-imbalance problem for various types of graphs and other irregular workloads. We extend these previously tightly coupled scheduling algorithms into an abstraction. +# GPU Load-balancing Abstraction + +![image](https://user-images.githubusercontent.com/9790745/168728352-27758e82-5f37-46cd-8052-99ca571edbfa.png) +![illustration](https://user-images.githubusercontent.com/9790745/168728299-6b125b44-894a-49bb-92fd-ee85aaa80ae4.png) + +We provide two APIs for our library: one that focuses on a beginner-friendly approach to load-balancing irregular sparse computations, and another that allows advanced programmers to retain control of the GPU kernels and express load-balanced execution as ranged loops. Both approaches are highlighted below. + +## Beginner API: Load-balanced Transformations and Primitives + +Our load-balanced execution API builds on the approach defined in `gunrock/essentials`, where we identify key primitives used in computing sparse linear algebra, graph analytics, and other irregular computations alike. Load-balanced versions of these primitives are then implemented, such that the user gets access to the atom, tile, and processor id they are working on through the [C++ lambda](https://en.cppreference.com/w/cpp/language/lambda) signature. + +Users define their computation within the C++ lambda, which gets called by the load-balanced primitive for every instance of the work atom. 
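+
+For orientation, the user-defined lambda has the shape sketched below (a minimal sketch; the exact primitive and parameter names appear in the subsections that follow). The load-balanced primitive calls it once per work atom, handing the user the atom, the tile (segment) that owns it, and the processor id executing it:
+
+```cpp
+// Illustrative shape of a user-defined compute lambda (not a specific loops API):
+auto op = [=] __device__(std::size_t atom_id,   // which work item, e.g., a nonzero
+                         std::size_t tile_id,   // which segment owns it, e.g., a row
+                         std::size_t proc_id) { // which processor executes it
+  // ... user-defined computation over the (atom, tile, processor) tuple ...
+  return 0.0f;  // value consumed by the primitive (e.g., reduced per tile)
+};
+```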
+ +### (1) Sparse Layout +In this simple example we use the Compressed Sparse Row (CSR) format and simply return the number of `atoms` (nonzeros) in each row as our layout. +```cpp +auto layout = [=] __device__ (std::size_t tile_id) { + return offsets[tile_id + 1] - offsets[tile_id]; +}; +``` + +### (2) User-defined Compute (C++ Lambda) + +```cpp +// user-defined compute: y = Ax +auto spmv = [=] __device__ (std::size_t atom_id, std::size_t tile_id, std::size_t proc_id) { + return values[atom_id] * x[column_indices[atom_id]]; +}; +``` + +### (3) Load-balanced Primitive (Transform Segmented Reduce) +This primitive takes the load-balancing schedule (`work_oriented` in this example) as a template parameter. +It accepts the transformation (the C++ lambda expression `spmv`) and the `layout` as inputs to perform a segmented reduction. +The output of the C++ lambda expression gets reduced over the segments defined by `A.offsets`. +```cpp +lb::transform_segreduce<lb::work_oriented> + (spmv, layout, A.nonzeros, A.offsets, + A.rows, y, lb::plus_t(), + 0.0f, stream); +``` + +#### Advantages: +- Requires no knowledge of how to implement segmented reduction; +- Very simple API if the computation can be defined using C++ lambda expressions. + +#### Disadvantages: +- No control over kernel execution and dispatch configuration; +- No composability; cannot implement more complicated computations that may have cooperative properties among processors. + +## Advanced API: Load-balanced Loops + +The kernel signature below carries the SpMV problem-specific parameters. + +```cpp +template <std::size_t threads_per_block, typename offset_t, typename index_t, typename type_t> +__global__ void __launch_bounds__(threads_per_block, 2) + spmv(std::size_t rows, + std::size_t cols, + std::size_t nnz, + offset_t* offsets, + index_t* indices, + const type_t* values, + const type_t* x, + type_t* y) { +``` + +### (1) Define and Configure Load-balancing Schedule +This step allocates any temporary memory required for load balancing and constructs a schedule per processor partition (defined using cooperative groups). +```cpp + using setup_t = schedule::setup; + + /// Allocate temporary storage for the schedule. + using storage_t = typename setup_t::storage_t; + __shared__ storage_t temporary_storage; + + /// Construct the schedule. + setup_t config(temporary_storage, offsets, rows, nnz); + auto p = config.partition(); +``` + +### (2) Load-balanced Ranged Loops (see [C++ ranges](https://en.cppreference.com/w/cpp/header/ranges)) +In this example, we define two iteration spaces: virtual and real. Virtual spaces allow us to balance atoms and tiles onto the processor ids and link directly to the real iteration space, which returns the exact atom or tile being processed. The code below loops over the balanced range of atoms, fetches the tile corresponding to each atom, and lets the user define their computation. +```cpp + for (auto virtual_atom : config.atom_accessor(p)) { + auto virtual_tile = config.tile_accessor(virtual_atom, p); + + if (!(config.is_valid_accessor(virtual_tile, p))) + continue; + + auto row = config.tile_id(virtual_tile, p); + + auto nz_idx = config.atom_id(virtual_atom, row, virtual_tile, p); +``` + +### (3) User-defined Computation +Once the user has access to the atom, tile, and processor id, they implement the desired computation on the given tuple. In this example, we use a simple `atomicAdd` to perform SpMV (can be improved). 
+```cpp + atomicAdd(&(y[row]), values[nz_idx] * x[indices[nz_idx]]); + } +} +``` \ No newline at end of file diff --git a/examples/spmv/CMakeLists.txt b/examples/spmv/CMakeLists.txt index 60256ea..aaf36a4 100644 --- a/examples/spmv/CMakeLists.txt +++ b/examples/spmv/CMakeLists.txt @@ -1,24 +1,23 @@ -# begin /* Set the application name. */ -set(APPLICATION_NAME spmv) -# end /* Set the application name. */ - -# begin /* Add CUDA executables */ -add_executable(${APPLICATION_NAME}) - -set(SOURCE_LIST - ${APPLICATION_NAME}.cu +# begin /* Add application */ +set(SOURCES + original.cu + thread_mapped.cu + group_mapped.cu + work_oriented.cu ) -target_sources(${APPLICATION_NAME} PRIVATE ${SOURCE_LIST}) -target_link_libraries(${APPLICATION_NAME} - PRIVATE loops - # PRIVATE nvToolsExt +get_target_property(LOOPS_ARCHITECTURES + loops CUDA_ARCHITECTURES ) -get_target_property(LOOPS_ARCHITECTURES loops CUDA_ARCHITECTURES) -set_target_properties(${APPLICATION_NAME} - PROPERTIES - CUDA_ARCHITECTURES ${LOOPS_ARCHITECTURES} -) # XXX: Find a better way to inherit loops properties. -message(STATUS "Example Added: ${APPLICATION_NAME}") -# end /* Add CUDA executables */ \ No newline at end of file +foreach(SOURCE IN LISTS SOURCES) + get_filename_component(TEST_NAME ${SOURCE} NAME_WLE) + add_executable(${TEST_NAME} ${SOURCE}) + target_link_libraries(${TEST_NAME} PRIVATE loops) + set_target_properties(${TEST_NAME} + PROPERTIES + CUDA_ARCHITECTURES ${LOOPS_ARCHITECTURES} + ) + message(STATUS "Example Added: spmv/${TEST_NAME}") +endforeach() +# end /* Add application */ \ No newline at end of file diff --git a/examples/spmv/spmv.cu b/examples/spmv/group_mapped.cu similarity index 66% rename from examples/spmv/spmv.cu rename to examples/spmv/group_mapped.cu index 1e8acdd..3ccb224 100644 --- a/examples/spmv/spmv.cu +++ b/examples/spmv/group_mapped.cu @@ -10,67 +10,8 @@ */ #include "spmv.hxx" - -// template -// __global__ void spmv(const std::size_t rows, -// const std::size_t cols, -// const std::size_t nnz, -// const offset_t* offsets, -// const index_t* indices, -// const type_t* values, -// const type_t* x, -// type_t* y) { -// /// Equivalent to: -// /// row = blockIdx.x * blockDim.x + threadIdx.x; (init) -// /// row < rows; (boundary condition) -// /// row += gridDim.x * blockDim.x. (step) -// for (auto row : grid_stride_range(std::size_t(0), rows)) { -// type_t sum = 0; - -// /// Equivalent to: -// /// for (offset_t nz = offset; nz < end; ++nz) -// for (auto nz : range(offsets[row], offsets[row + 1])) { -// sum += values[nz] * x[indices[nz]]; -// } - -// // Output -// y[row] = sum; -// } -// } - using namespace loops; -template -__global__ void spmv(setup_t config, - const std::size_t rows, - const std::size_t cols, - const std::size_t nnz, - const offset_t* offsets, - const index_t* indices, - const type_t* values, - const type_t* x, - type_t* y) { - /// Equivalent to: - /// row = blockIdx.x * blockDim.x + threadIdx.x; (init) - /// row < rows; (boundary condition) - /// row += gridDim.x * blockDim.x. (step) - for (auto row : config.tiles()) { - type_t sum = 0; - - /// Equivalent to: - /// for (offset_t nz = offset; nz < end; ++nz) - for (auto nz : config.atoms(row)) { - sum += values[nz] * x[indices[nz]]; - } - - // Output - y[row] = sum; - } -} - template ; /// Allocate temporary storage for the schedule. @@ -103,10 +44,6 @@ __global__ void __launch_bounds__(threads_per_block, 2) auto row = config.tile_id(virtual_tile, p); - // XXX: Not sure if this check is needed. 
- // if (row == -1) - // continue; - auto nz_idx = config.atom_id(virtual_atom, row, virtual_tile, p); atomicAdd(&(y[row]), values[nz_idx] * x[indices[nz_idx]]); } @@ -135,13 +72,13 @@ int main(int argc, char** argv) { constexpr std::size_t block_size = 128; /// Set-up kernel launch parameters and run the kernel. - std::size_t grid_size = (csr.rows + block_size - 1) / block_size; cudaStream_t stream; cudaStreamCreate(&stream); /// Traditional kernel launch, this is nice for tile mapped scheduling, which /// will allow blocks to be scheduled in and out as needed. And will rely on /// NVIDIA's hardware schedule to schedule the blocks efficiently. + std::size_t grid_size = (csr.rows + block_size - 1) / block_size; launch::non_cooperative( stream, tiled_spmv, grid_size, block_size, csr.rows, csr.cols, csr.nnzs, csr.offsets.data().get(), diff --git a/examples/spmv/original.cu b/examples/spmv/original.cu new file mode 100644 index 0000000..bca529a --- /dev/null +++ b/examples/spmv/original.cu @@ -0,0 +1,86 @@ +/** + * @file spmv.cu + * @author Muhammad Osama (mosama@ucdavis.edu) + * @brief Sparse Matrix-Vector Multiplication example. + * @version 0.1 + * @date 2022-02-03 + * + * @copyright Copyright (c) 2022 + * + */ + +#include "spmv.hxx" +using namespace loops; + +template +__global__ void spmv(const std::size_t rows, + const std::size_t cols, + const std::size_t nnz, + const offset_t* offsets, + const index_t* indices, + const type_t* values, + const type_t* x, + type_t* y) { + for (auto row = blockIdx.x * blockDim.x + threadIdx.x; + row < rows; // boundary condition + row += gridDim.x * blockDim.x // step + ) { + type_t sum = 0; + for (offset_t nz = offsets[row]; nz < offsets[row + 1]; ++nz) + sum += values[nz] * x[indices[nz]]; + + // Output + y[row] = sum; + } +} + +int main(int argc, char** argv) { + using index_t = int; + using offset_t = int; + using type_t = float; + + // ... I/O parameters, mtx, etc. + parameters_t parameters(argc, argv); + + csr_t csr; + matrix_market_t mtx; + csr.from_coo(mtx.load(parameters.filename)); + + // Input and output vectors. + vector_t x(csr.rows); + vector_t y(csr.rows); + + // Generate random numbers between [0, 1]. + generate::random::uniform_distribution(x.begin(), x.end(), 1, 10); + + // Create a schedule. + constexpr std::size_t block_size = 128; + + /// Set-up kernel launch parameters and run the kernel. + cudaStream_t stream; + cudaStreamCreate(&stream); + + std::size_t grid_size = (csr.rows + block_size - 1) / block_size; + launch::non_cooperative( + stream, spmv, grid_size, block_size, csr.rows, + csr.cols, csr.nnzs, csr.offsets.data().get(), csr.indices.data().get(), + csr.values.data().get(), x.data().get(), y.data().get()); + + cudaStreamSynchronize(stream); + + /// Validation code, can be safely ignored. 
+ if (parameters.validate) { + auto h_y = cpu::spmv(csr, x); + + std::size_t errors = util::equal( + y.data().get(), h_y.data(), csr.rows, + [](const type_t a, const type_t b) { return std::abs(a - b) > 1e-2; }, + parameters.verbose); + + std::cout << "Matrix:\t\t" << extract_filename(parameters.filename) + << std::endl; + std::cout << "Dimensions:\t" << csr.rows << " x " << csr.cols << " (" + << csr.nnzs << ")" << std::endl; + std::cout << "Errors:\t\t" << errors << std::endl; + } +} \ No newline at end of file diff --git a/examples/spmv/thread_mapped.cu b/examples/spmv/thread_mapped.cu new file mode 100644 index 0000000..eb82cb0 --- /dev/null +++ b/examples/spmv/thread_mapped.cu @@ -0,0 +1,101 @@ +/** + * @file spmv.cu + * @author Muhammad Osama (mosama@ucdavis.edu) + * @brief Sparse Matrix-Vector Multiplication example. + * @version 0.1 + * @date 2022-02-03 + * + * @copyright Copyright (c) 2022 + * + */ + +#include "spmv.hxx" +using namespace loops; + +template +__global__ void thread_mapped_spmv(setup_t config, + const std::size_t rows, + const std::size_t cols, + const std::size_t nnz, + const offset_t* offsets, + const index_t* indices, + const type_t* values, + const type_t* x, + type_t* y) { + /// Equivalent to: + /// row = blockIdx.x * blockDim.x + threadIdx.x; (init) + /// row < rows; (boundary condition) + /// row += gridDim.x * blockDim.x. (step) + for (auto row : config.tiles()) { + type_t sum = 0; + + /// Equivalent to: + /// for (offset_t nz = offset; nz < end; ++nz) + for (auto nz : config.atoms(row)) { + sum += values[nz] * x[indices[nz]]; + } + + // Output + y[row] = sum; + } +} + +int main(int argc, char** argv) { + using index_t = int; + using offset_t = int; + using type_t = float; + + // ... I/O parameters, mtx, etc. + parameters_t parameters(argc, argv); + + csr_t csr; + matrix_market_t mtx; + csr.from_coo(mtx.load(parameters.filename)); + + // Input and output vectors. + vector_t x(csr.rows); + vector_t y(csr.rows); + + // Generate random numbers between [0, 1]. + generate::random::uniform_distribution(x.begin(), x.end(), 1, 10); + + // Create a schedule. + constexpr std::size_t block_size = 128; + + /// Set-up kernel launch parameters and run the kernel. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // Create a schedule. + using setup_t = schedule::setup; + setup_t config(csr.offsets.data().get(), csr.rows, csr.nnzs); + + std::size_t grid_size = (csr.rows + block_size - 1) / block_size; + launch::non_cooperative( + stream, thread_mapped_spmv, grid_size, + block_size, config, csr.rows, csr.cols, csr.nnzs, + csr.offsets.data().get(), csr.indices.data().get(), + csr.values.data().get(), x.data().get(), y.data().get()); + + cudaStreamSynchronize(stream); + + /// Validation code, can be safely ignored. 
+ if (parameters.validate) { + auto h_y = cpu::spmv(csr, x); + + std::size_t errors = util::equal( + y.data().get(), h_y.data(), csr.rows, + [](const type_t a, const type_t b) { return std::abs(a - b) > 1e-2; }, + parameters.verbose); + + std::cout << "Matrix:\t\t" << extract_filename(parameters.filename) + << std::endl; + std::cout << "Dimensions:\t" << csr.rows << " x " << csr.cols << " (" + << csr.nnzs << ")" << std::endl; + std::cout << "Errors:\t\t" << errors << std::endl; + } +} \ No newline at end of file diff --git a/examples/spmv/work_oriented.cu b/examples/spmv/work_oriented.cu new file mode 100644 index 0000000..618aa2f --- /dev/null +++ b/examples/spmv/work_oriented.cu @@ -0,0 +1,115 @@ +/** + * @file spmv.cu + * @author Muhammad Osama (mosama@ucdavis.edu) + * @brief Sparse Matrix-Vector Multiplication example. + * @version 0.1 + * @date 2022-02-03 + * + * @copyright Copyright (c) 2022 + * + */ + +#include "spmv.hxx" + +using namespace loops; + +template +__global__ void __launch_bounds__(threads_per_block, 2) + merge_spmv(std::size_t rows, + std::size_t cols, + std::size_t nnz, + offset_t* offsets, + index_t* indices, + const type_t* values, + const type_t* x, + type_t* y) { + using setup_t = schedule::setup; + + setup_t config(offsets, rows, nnz); + auto map = config.init(); + + type_t sum = 0; + for (auto row : config.tiles(map)) { + // printf("row = %d\n", row); + for (auto nz : config.atoms(row, map)) { + // printf("nz = %d\n", nz); + sum += values[nz] * x[indices[nz]]; + } + y[row] = sum; + // if (row == 0) + // printf("sum = %f\n", sum); + sum = 0; + } + + int remainder_row = map.second.first; + // int remainder_nz = map.second.second - map.first.second; + // printf("remainder_row = %d\n", remainder_row); + // printf("remainder_nz = %d\n", remainder_nz); + // printf("(nz start) map.first.second = %d\n", map.first.second); + // printf("(nz end) map.second.second = %d\n", map.second.second); + for (auto nz : config.remainder_atoms(map)) { + // printf("(END) nz = %d\n", nz); + sum += values[nz] * x[indices[nz]]; + } + // if (remainder_row == 0) + // printf("sum = %f\n", sum); + atomicAdd(&(y[remainder_row]), sum); +} + +int main(int argc, char** argv) { + using index_t = int; + using offset_t = int; + using type_t = float; + + // ... I/O parameters, mtx, etc. + parameters_t parameters(argc, argv); + + csr_t csr; + matrix_market_t mtx; + csr.from_coo(mtx.load(parameters.filename)); + + // Input and output vectors. + vector_t x(csr.rows); + vector_t y(csr.rows); + + // Generate random numbers between [0, 1]. + generate::random::uniform_distribution(x.begin(), x.end(), 1, 10); + + // Create a schedule. + constexpr std::size_t block_size = 128; + + /// Set-up kernel launch parameters and run the kernel. + cudaStream_t stream; + cudaStreamCreate(&stream); + + std::size_t grid_size = // 1; + // csr.rows + csr.nnzs; + (((csr.rows + csr.nnzs) + block_size) - 1) / block_size; + launch::non_cooperative( + stream, merge_spmv, grid_size, + block_size, csr.rows, csr.cols, csr.nnzs, csr.offsets.data().get(), + csr.indices.data().get(), csr.values.data().get(), x.data().get(), + y.data().get()); + + cudaStreamSynchronize(stream); + + /// Validation code, can be safely ignored. 
+ if (parameters.validate) { + auto h_y = cpu::spmv(csr, x); + + std::size_t errors = util::equal( + y.data().get(), h_y.data(), csr.rows, + [](const type_t a, const type_t b) { return std::abs(a - b) > 1e-2; }, + parameters.verbose); + + std::cout << "Matrix:\t\t" << extract_filename(parameters.filename) + << std::endl; + std::cout << "Dimensions:\t" << csr.rows << " x " << csr.cols << " (" + << csr.nnzs << ")" << std::endl; + std::cout << "Errors:\t\t" << errors << std::endl; + } +} \ No newline at end of file diff --git a/include/loops/schedule.hxx b/include/loops/schedule.hxx index 59a2b84..d39e4a8 100644 --- a/include/loops/schedule.hxx +++ b/include/loops/schedule.hxx @@ -20,30 +20,31 @@ namespace schedule { * @brief Load balancing algorithms. * */ -enum algroithms_t { +enum algorithms_t { work_oriented, /// < Work oriented scheduling algorithm. thread_mapped, /// < Thread mapped scheduling algorithm. tile_mapped, /// < Tile mapped scheduling algorithm. bucketing, /// < Bucketing scheduling algorithm. }; -template +template class atom_traits; -template +template class tile_traits; /** - * @brief + * @brief Schedule's setup interface. * - * @tparam scheme - * @tparam threads_per_block - * @tparam tiles_t - * @tparam atoms_t - * @tparam tile_size_t - * @tparam atom_size_t + * @tparam scheme The scheduling algorithm. + * @tparam threads_per_block Number of threads per block. + * @tparam threads_per_tile Number of threads per tile. + * @tparam tiles_t Type of the tiles. + * @tparam atoms_t Type of the atoms. + * @tparam tile_size_t Type of the tile size (default: std::size_t). + * @tparam atom_size_t Type of the atom size (default: std::size_t). */ -template -#include \ No newline at end of file +#include +#include \ No newline at end of file diff --git a/include/loops/schedule/thread_mapped.hxx b/include/loops/schedule/thread_mapped.hxx index bd35dd7..f69f693 100644 --- a/include/loops/schedule/thread_mapped.hxx +++ b/include/loops/schedule/thread_mapped.hxx @@ -18,7 +18,7 @@ namespace loops { namespace schedule { template -class atom_traits { +class atom_traits { public: using atoms_t = atoms_type; using atoms_iterator_t = atoms_t*; @@ -40,7 +40,7 @@ class atom_traits { }; template -class tile_traits { +class tile_traits { public: using tiles_t = tiles_type; using tiles_iterator_t = tiles_t*; @@ -63,16 +63,16 @@ template -class setup : public tile_traits : public tile_traits, - public atom_traits { public: @@ -84,9 +84,9 @@ class setup; + tile_traits; using atom_traits_t = - atom_traits; + atom_traits; /** * @brief Default constructor. diff --git a/include/loops/schedule/tile_mapped.hxx b/include/loops/schedule/tile_mapped.hxx index 33e3a0b..d9e7e3b 100644 --- a/include/loops/schedule/tile_mapped.hxx +++ b/include/loops/schedule/tile_mapped.hxx @@ -1,7 +1,8 @@ /** * @file tile_mapped.hxx * @author Muhammad Osama (mosama@ucdavis.edu) - * @brief + * @brief Tile-mapped schedule (map work to tiles, process using individual + * threads within the tile.) 
* @version 0.1 * @date 2022-02-04 * @@ -12,13 +13,14 @@ #pragma once #include -#include #include #include #include +#ifndef _CG_ABI_EXPERIMENTAL #define _CG_ABI_EXPERIMENTAL +#endif #include #include @@ -30,7 +32,7 @@ namespace cg = cooperative_groups; namespace cg_x = cooperative_groups::experimental; template -class atom_traits { +class atom_traits { public: using atoms_t = atoms_type; using atoms_iterator_t = atoms_t*; @@ -52,7 +54,7 @@ class atom_traits { }; template -class tile_traits { +class tile_traits { public: using tiles_t = tiles_type; using tiles_iterator_t = tiles_t*; @@ -77,15 +79,15 @@ template -class setup - : public tile_traits, - public atom_traits, + public atom_traits { public: @@ -97,9 +99,9 @@ class setup; + tile_traits; using atom_traits_t = - atom_traits; + atom_traits; enum : unsigned int { threads_per_block = THREADS_PER_BLOCK, diff --git a/include/loops/schedule/work_oriented.hxx b/include/loops/schedule/work_oriented.hxx new file mode 100644 index 0000000..5469e4e --- /dev/null +++ b/include/loops/schedule/work_oriented.hxx @@ -0,0 +1,210 @@ +/** + * @file work_oriented.hxx + * @author Muhammad Osama (mosama@ucdavis.edu) + * @brief Work-oriented scheduling algorithm (map even-share of work to + * threads.) + * @version 0.1 + * @date 2022-03-07 + * + * @copyright Copyright (c) 2022 + * + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace loops { +namespace schedule { + +template +__host__ __device__ constexpr auto div(T const& t, U const& u) { + return (t + u - 1) / u; +} + +template +class atom_traits { + public: + using atoms_t = atoms_type; + using atoms_iterator_t = atoms_t*; + using atom_size_t = atom_size_type; + + __host__ __device__ atom_traits() : size_(0), atoms_(nullptr) {} + __host__ __device__ atom_traits(atom_size_t size) + : size_(size), atoms_(nullptr) {} + __host__ __device__ atom_traits(atom_size_t size, atoms_iterator_t atoms) + : size_(size), atoms_(atoms) {} + + __host__ __device__ atom_size_t size() const { return size_; } + __host__ __device__ atoms_iterator_t begin() { return atoms_; }; + __host__ __device__ atoms_iterator_t end() { return atoms_ + size_; }; + + private: + atom_size_t size_; + atoms_iterator_t atoms_; +}; + +template +class tile_traits { + public: + using tiles_t = tiles_type; + using tiles_iterator_t = tiles_t*; + using tile_size_t = tile_size_type; + + __host__ __device__ tile_traits() : size_(0), tiles_(nullptr) {} + __host__ __device__ tile_traits(tile_size_t size, tiles_iterator_t tiles) + : size_(size), tiles_(tiles) {} + + __host__ __device__ tile_size_t size() const { return size_; } + __host__ __device__ tiles_iterator_t begin() { return tiles_; }; + __host__ __device__ tiles_iterator_t end() { return tiles_ + size_; }; + + private: + tile_size_t size_; + tiles_iterator_t tiles_; +}; + +template +class setup : public tile_traits, + public atom_traits { + public: + using tiles_t = tiles_type; + using atoms_t = atoms_type; + using tiles_iterator_t = tiles_t*; + using atoms_iterator_t = atoms_t*; + using tile_size_t = tile_size_type; + using atom_size_t = atom_size_type; + + using tile_traits_t = + tile_traits; + using atom_traits_t = + atom_traits; + + enum : unsigned int { + threads_per_block = THREADS_PER_BLOCK, + items_per_thread = ITEMS_PER_THREAD, + items_per_tile = threads_per_block * items_per_thread, + }; + + /** + * @brief Construct a setup object for load balance schedule. + * + * @param tiles Tiles iterator. + * @param num_tiles Number of tiles. 
+ * @param num_atoms Number of atoms. + */ + __device__ __forceinline__ setup(tiles_iterator_t _tiles, + tile_size_t _num_tiles, + atom_size_t _num_atoms) + : tile_traits_t(_num_tiles, _tiles), + atom_traits_t(_num_atoms), + total_work(_num_tiles + _num_atoms), + num_threads(threads_per_block), // items_per_tile * (div(total_work, + // items_per_tile))), + work_per_thread(div(total_work, num_threads)) {} + + __device__ __forceinline__ auto init() { + thrust::counting_iterator atoms_indices; + + std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + // int hid = (blockIdx.x * gridDim.y) + blockIdx.y; + // if (threadIdx.x < 2) { + std::size_t upper = min(work_per_thread * tid, total_work); + std::size_t lower = min(upper + work_per_thread, total_work); + auto st = search(upper, tile_traits_t::begin() + 1, atoms_indices, + tile_traits_t::size(), atom_traits_t::size()); + auto en = search(lower, tile_traits_t::begin() + 1, atoms_indices, + tile_traits_t::size(), atom_traits_t::size()); + // } + return thrust::make_pair(st, en); + } + + template + __device__ __forceinline__ step_range_t tiles(map_t& m) const { + return custom_stride_range(tiles_t(m.first.first), tiles_t(m.second.first), + tiles_t(1)); + } + + template + __device__ __forceinline__ step_range_t atoms(tiles_t t, map_t& m) { + auto num_atoms = tile_traits_t::begin()[(t + 1)]; + auto nz_start = m.first.second; + m.first.second += (num_atoms - nz_start); + return custom_stride_range(atoms_t(nz_start), num_atoms, atoms_t(1)); + } + + template + __device__ __forceinline__ step_range_t remainder_tiles( + map_t& m) const { + return custom_stride_range(tiles_t(m.second.first), tiles_t(m.second.first), + tiles_t(1)); + } + + template + __device__ __forceinline__ step_range_t remainder_atoms( + map_t& m) const { + return custom_stride_range(atoms_t(m.first.second), + (atoms_t)(m.second.second), atoms_t(1)); + } + + private: + /** + * @brief Thrust based 2D binary-search for merge-path algorithm. + * + * @param diagonal + * @param a + * @param b + * @param a_len + * @param b_len + * @return A coordinate. + */ + template + __device__ __forceinline__ auto search(const std::size_t& diagonal, + const xit_t a, + const yit_t b, + const tile_size_t& a_len, + const atom_size_t& b_len) { + // Diagonal search range (in x-coordinate space) + std::size_t x_min = max((std::size_t)(diagonal - b_len), std::size_t(0)); + std::size_t x_max = min(diagonal, (std::size_t)a_len); + + auto it = thrust::lower_bound( + thrust::seq, // Sequential impl + thrust::counting_iterator(x_min), // Start iterator @x_min + thrust::counting_iterator(x_max), // End iterator @x_max + diagonal, // ... 
+ [=] __device__(const std::size_t& idx, const std::size_t& diagonal) { + return a[idx] <= b[diagonal - idx - 1]; + }); + + return thrust::make_pair(min(*it, a_len), (diagonal - *it)); + } + + std::size_t total_work; + std::size_t num_threads; + std::size_t work_per_thread; +}; + +} // namespace schedule +} // namespace loops \ No newline at end of file diff --git a/include/loops/util/generate.hxx b/include/loops/util/generate.hxx index 12cb881..8b2bc1b 100644 --- a/include/loops/util/generate.hxx +++ b/include/loops/util/generate.hxx @@ -13,8 +13,10 @@ #include #include -#include +#include +#include #include +#include namespace loops { namespace generate { @@ -37,7 +39,7 @@ void uniform_distribution(iterator_t begin_it, type_t end = 1.0f) { int size = thrust::distance(begin_it, end_it); auto generate_random = [=] __host__ __device__(std::size_t i) -> type_t { - thrust::default_random_engine rng; + thrust::minstd_rand rng; thrust::uniform_real_distribution uniform(begin, end); rng.discard(i); return uniform(rng);
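In the hunk above, `uniform_distribution` constructs a lightweight `thrust::minstd_rand` engine for each element and skips ahead by the element index (`rng.discard(i)`), so every entry receives an independent, reproducible draw from the same distribution. A minimal standalone sketch of that pattern in plain Thrust is shown below (independent of `loops`; the functor name and vector size are illustrative only):

```cpp
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/transform.h>

// Produces a uniform random value for index i: a fresh engine is advanced by
// i states so each element gets its own deterministic draw from a single seed.
struct uniform_gen {
  float lo, hi;
  __host__ __device__ float operator()(std::size_t i) const {
    thrust::minstd_rand rng;                                // lightweight LCG engine
    thrust::uniform_real_distribution<float> dist(lo, hi);  // range [lo, hi)
    rng.discard(i);                                         // skip ahead to element i
    return dist(rng);
  }
};

int main() {
  thrust::device_vector<float> x(1024);
  thrust::transform(thrust::counting_iterator<std::size_t>(0),
                    thrust::counting_iterator<std::size_t>(x.size()),
                    x.begin(), uniform_gen{1.0f, 10.0f});
  return 0;
}
```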