Reduction #6

Merged: 6 commits, May 20, 2024
9 changes: 6 additions & 3 deletions .github/workflows/cmake-single-platform.yml
@@ -12,20 +12,23 @@ env:
jobs:
build:
runs-on: ubuntu-latest
container:
image: nvidia/cuda:12.3.2-devel-ubuntu22.04

steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: 'recursive'

- name: Set up Git
uses: actions/setup-git@v1
with:
git-version: '2.30.0' # Specify the version of Git you need

- name: Install CMake and git
run: |
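# the CUDA devel container does not ship git or cmake, so install them here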
apt-get update
apt-get install -y git
apt-get install -y cmake

- name: Initialize and Update Git Submodules
13 changes: 10 additions & 3 deletions CMakeLists.txt
@@ -27,8 +27,15 @@ target_include_directories(SimpleCudaLib PUBLIC src/include)
# Add fmtlib
add_subdirectory(third_party/fmt)

# CUDA Flags
set(EXTRA_CUDA_FLAGS "")
# Check if building in Debug mode
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# Add debug-specific flags
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -G -g")
else()
# Add line info flag only if not building in Debug mode
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -lineinfo")
endif()
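# Note: -G disables most device-code optimization and embeds full debug
# info (accurate stepping, unrepresentative timings), while -lineinfo keeps
# optimizations and only records source-line mappings for profilers such as
# Nsight Compute; hence the either/or above.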

option (SHOW_PTXAS_INFO "Show ptxas info" OFF)
if(SHOW_PTXAS_INFO)
set(EXTRA_CUDA_FLAGS "${EXTRA_CUDA_FLAGS} -Xptxas -v")
@@ -44,7 +51,7 @@ foreach(EXAMPLE_SOURCE ${EXAMPLE_SOURCES})

# CUDA properties provided by CMAKE
set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_ARCHITECTURES 90)
+set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_ARCHITECTURES 90a)

# Convert the flags string into a list of flags
separate_arguments(EXTRA_CUDA_FLAGS_LIST UNIX_COMMAND "${EXTRA_CUDA_FLAGS}")
16 changes: 16 additions & 0 deletions Makefile
@@ -0,0 +1,16 @@
.DEFAULT_GOAL := all

build_dir := build

all: release

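# RelWithDebInfo is not "Debug", so release builds take the -lineinfo
# branch in CMakeLists.txt; `make debug` selects -G -g instead.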
release:
cmake -B$(build_dir) -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo
ninja -C $(build_dir)

debug:
cmake -B$(build_dir) -GNinja -DCMAKE_BUILD_TYPE=Debug
ninja -C $(build_dir)

clean:
rm -rf $(build_dir)
204 changes: 204 additions & 0 deletions examples/chapter10/reduce1d.cu
@@ -0,0 +1,204 @@
#include "src/include/tensors.h"
#include "src/include/utils.h"

#include <cstddef>
#include <fmt/core.h>
#include <fmt/ranges.h>

#include <algorithm>
#include <cmath>
#include <iostream>
#include <numeric>
#include <optional>
#include <string>
#include <vector>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>

using namespace simple_cuda;
using KernelFunc = void (*)(float *, float *, const int);

using one_d = Extent<1>;

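// Baseline: tree reduction within one block, accumulating in place in
// global memory. The active set (threadIdx.x % stride == 0) gets sparser
// each step but stays spread across all warps, so warps run with mostly
// idle lanes (control divergence).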
__global__ void Reduce1dInplace(float *input, float *output, const int numel) {
const int i = 2 * threadIdx.x;
for (unsigned stride{1}; stride <= blockDim.x; stride *= 2) {
if (threadIdx.x % stride == 0) {
input[i] += input[i + stride];
}
__syncthreads();
}
if (threadIdx.x == 0) {
*output = input[0];
}
}

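// Same tree shape, better ordering: threads [0, stride) do the work, so
// the active set is contiguous and whole warps retire early instead of
// diverging lane-by-lane within a warp.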
__global__ void Reduce1dInplaceBetterOrdering(float *input, float *output,
const int numel) {
const int i = threadIdx.x;
for (unsigned stride{blockDim.x}; stride >= 1; stride /= 2) {
if (i < stride) {
input[i] += input[i + stride];
}
__syncthreads();
}

if (threadIdx.x == 0) {
*output = input[0];
}
}

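// Stages partials in dynamic shared memory (one float per thread, sized at
// launch). The first add happens on the load from global memory; the rest
// of the tree runs entirely on-chip.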
__global__ void Reduce1dShared(float *input, float *output, const int numel) {
const int i = threadIdx.x;
extern __shared__ float shmem[];
// First iter pulled out of loop
shmem[i] = input[i] + input[i + blockDim.x];
__syncthreads();
for (unsigned stride{blockDim.x / 2}; stride >= 1; stride /= 2) {
if (i < stride) {
shmem[i] += shmem[i + stride];
}
__syncthreads();
}

if (threadIdx.x == 0) {
*output = shmem[0];
}
}

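// Multi-block variant: each block reduces its own disjoint 2 * blockDim.x
// slice into shared memory, then one atomicAdd per block folds the block
// partial into *output.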
__global__ void Reduce1dSharedGlobal(float *input, float *output,
const int numel) {
const int local_id = threadIdx.x;
const int global_id = local_id + 2 * blockDim.x * blockIdx.x;
extern __shared__ float shmem[]; // Size blockDim.x
// First iter pulled out of loop
shmem[local_id] = input[global_id] + input[global_id + blockDim.x];
__syncthreads();
for (unsigned stride{blockDim.x / 2}; stride >= 1; stride /= 2) {
if (local_id < stride) {
shmem[local_id] += shmem[local_id + stride];
}
__syncthreads();
}

if (local_id == 0) {
atomicAdd(output, shmem[0]);
}
}

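// Thread coarsening: each thread first serially accumulates
// COARSE_FACTOR * 2 elements strided by blockDim.x (coalesced), so one
// block covers COARSE_FACTOR * 2 * blockDim.x inputs and the grid (and the
// atomicAdd count) shrinks accordingly.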
template <int COARSE_FACTOR>
__global__ void Reduce1dSharedGlobalCoarse(float *input, float *output,
const int numel) {
const int local_id = threadIdx.x;
const int global_offset = COARSE_FACTOR * 2 * blockDim.x * blockIdx.x;
const int global_id = local_id + global_offset;
extern __shared__ float shmem[]; // Size blockDim.x
// First iter pulled out of loop
float sum = input[global_id];
#pragma unroll
for (int tile = 1; tile < COARSE_FACTOR * 2; tile++) {
sum += input[global_id + tile * blockDim.x];
}
shmem[local_id] = sum;
__syncthreads();
for (unsigned stride{blockDim.x / 2}; stride >= 1; stride /= 2) {
if (local_id < stride) {
shmem[local_id] += shmem[local_id + stride];
}
__syncthreads();
}

if (local_id == 0) {
atomicAdd(output, shmem[0]);
}
}

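// CPU reference. std::reduce may reassociate, so its float sum can differ
// from the GPU tree in the low bits; the check below therefore uses a
// small tolerance rather than exact equality.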
float cpp_kernel(std::vector<float> &input) {
const auto n_elements = input.size();
std::vector<float> input_copy(input.size());
std::copy(input.begin(), input.end(), input_copy.begin());
auto out = std::reduce(input_copy.begin(), input_copy.end());
return out;
}

void Test(KernelFunc func, const size_t numel, dim3 grid, dim3 block,
std::optional<size_t> shmem) {
one_d tensor_extents({numel});

HostTensor<float, one_d> input_vec(tensor_extents);
HostTensor<float, one_d> out_sum(one_d({1}));

fill_random(input_vec.data_);
// std::fill(input_vec.data_.begin(), input_vec.data_.end(), 1);
std::fill(out_sum.data_.begin(), out_sum.data_.end(), 0);

auto input_vec_d = input_vec.to_device();
auto out_sum_d = out_sum.to_device();

if (shmem.has_value()) {
func<<<grid, block, shmem.value()>>>(
input_vec_d.data_ptr(), out_sum_d.data_ptr(), tensor_extents.numel());

} else {
func<<<grid, block>>>(input_vec_d.data_ptr(), out_sum_d.data_ptr(),
tensor_extents.numel());
}
cudaCheckErrors("kernel launch failure");
cudaDeviceSynchronize();

auto host_output = out_sum_d.to_host();
auto host_output_ptr = host_output.data_ptr();

std::vector<float> input_vector(input_vec.data_.begin(),
input_vec.data_.end());
const auto cpp_answer = cpp_kernel(input_vector);

float diff = std::fabs(cpp_answer - host_output_ptr[0]);
if (diff > 5e-3) {
std::string error_string = "Houston we have a problem!\n";
error_string += fmt::format("Found a deviation of {}\n", diff);
error_string += fmt::format("Cpp anwser: {}, GPU anwser: {}\n", cpp_anwser,
host_output_ptr[0]);
std::cout << error_string;
exit(1);
}
std::cout << "All good brother!\n";
}

int main() {
constexpr int max_length = 2048;
constexpr int block_size = max_length / 2;

dim3 grid(1);
dim3 block(block_size);

// Base case bad ordering inplace writes
fmt::print("• Reduced1dInplace Test: ");
Test(Reduce1dInplace, max_length, grid, block, std::nullopt);

// Inplace writes bad ordering
fmt::print("• Reduced1dInplaceBetterOrdering Test: ");
Test(Reduce1dInplaceBetterOrdering, max_length, grid, block, std::nullopt);

// Dynamic shmem version
fmt::print("• Reduce1dShared Test: ");
size_t shmem{block.x * sizeof(float)};
Test(Reduce1dShared, max_length, grid, block, shmem);

// Test larger than thread reductions
constexpr int max_length_global = 2048 * 2;

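// Each block consumes 2 * block.x inputs (two loads per thread in the
// first step), so ceil_div(numel, 2 * block.x) blocks cover the array.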
block.x = 1024;
grid.x = ceil_div(max_length_global, block.x * 2);
shmem = block.x * sizeof(float);
fmt::print("• Reduce1dSharedGlobal Test: ");
Test(Reduce1dSharedGlobal, max_length_global, grid, block, shmem);

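// With coarsening, each block covers 2 * coarse_factor * block.x inputs,
// shrinking the grid (and the atomicAdd count) by coarse_factor.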
constexpr int coarse_factor = 2;
grid.x = ceil_div(max_length_global, block.x * 2 * coarse_factor);
shmem = block.x * sizeof(float);
fmt::print("• Reduce1dSharedGlobalCoarse Test: ");
Test(Reduce1dSharedGlobalCoarse<coarse_factor>, max_length_global, grid,
block, shmem);

// profile the relevant kernels:
// ncu -k "regex:Reduce" ./bin/reduce1d
return 0;
}
2 changes: 1 addition & 1 deletion examples/chapter7/conv2d.cu
@@ -156,7 +156,7 @@ int main() {
constexpr int block_size = 32;

// dimx is the inner dim, dimy is the outer dim
-dim3 grid(ceil_div(num_rows, block_size), ceil_div(num_cols, block_size));
+dim3 grid(ceil_div(num_cols, block_size), ceil_div(num_rows, block_size));
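// grid.x must span the inner (column) dimension and grid.y the outer (row)
// dimension; the previous order transposed the launch for non-square inputs.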
dim3 block(block_size, block_size);

Test(Conv2D<block_size, filter_radius>, num_rows, num_cols, filter_radius,
6 changes: 3 additions & 3 deletions src/include/utils.h
@@ -24,14 +24,14 @@ template <typename T, typename Y> T __host__ __device__ ceil_div(T a, Y b) {

float kernel_time(std::function<void()> kernelLauncher);

-template <typename T> void fill_random(T input) {
+template <typename T> void fill_random(T& input) {
// First create an instance of an engine.
std::random_device rnd_device;
// Specify the engine and distribution.
std::mt19937 mersenne_engine{rnd_device()}; // Generates random integers
-std::normal_distribution<float> dist{2, 1};
+std::normal_distribution<float> dist{0, 1};
auto gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); };

std::generate(input.begin(), input.end(), gen);
}
} // namespace simple_cuda