From 6aed776aa54c810bc6fdf5fe6ceba10c39665a00 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Sat, 20 Apr 2024 19:17:58 -0400 Subject: [PATCH 01/53] test function for rasterszaton tests --- rasterization_tests.py | 167 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 rasterization_tests.py diff --git a/rasterization_tests.py b/rasterization_tests.py new file mode 100644 index 0000000..055727c --- /dev/null +++ b/rasterization_tests.py @@ -0,0 +1,167 @@ +import math +import time + +import torch + +from diff_gaussian_rasterization import ( + GaussianRasterizationSettings, + GaussianRasterizer, +) + + +def get_cuda_args(strategy, mode="train"): + cuda_args = { + "mode": mode, + "world_size": "1", + "global_rank": "0", + "local_rank": "0", + "mp_world_size": "1", + "mp_rank": "0", + "log_folder": "./logs", + "log_interval": "10", + "iteration": "0", + "zhx_debug": "False", + "zhx_time": "False", + "dist_global_strategy": "default", + "avoid_pixel_all2all": False, + "stats_collector": {}, + } + return cuda_args + +def test_gaussian_rasterizer_time(): + # Set up the input data + num_gaussians = 10000 + means3D = torch.randn(num_gaussians, 3).cuda() + scales = torch.randn(num_gaussians, 3).cuda() + rotations = torch.randn(num_gaussians, 3, 3).cuda() + shs = torch.randn(num_gaussians, 9).cuda() + opacities = torch.randn(num_gaussians, 1).cuda() + + # Set up the rasterization settings + image_height = 512 + image_width = 512 + tanfovx = 1.0 + tanfovy = 1.0 + bg = torch.ones(3).cuda() + scale_modifier = 1.0 + viewmatrix = torch.eye(4).cuda() + projmatrix = torch.eye(4).cuda() + sh_degree = 2 + campos = torch.zeros(3).cuda() + prefiltered = False + debug = False + + # mode="train" + # strategy=None + # cuda_args = get_cuda_args(strategy, mode) + + raster_settings = GaussianRasterizationSettings( + image_height, image_width, tanfovx, tanfovy, bg, + scale_modifier, viewmatrix, projmatrix, sh_degree, + campos, prefiltered, debug + ) + + # Create the GaussianRasterizer + rasterizer = GaussianRasterizer(raster_settings) + + # Measure the time for preprocess_gaussians + start_time = time.time() + means2D, rgb, conic_opacity, radii, depths = rasterizer.preprocess_gaussians( + means3D, scales, rotations, shs, opacities + ) + end_time = time.time() + + preprocess_time = end_time - start_time + print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds") + + +def test_batched_gaussian_rasterizer(): + # Set up the input data + num_gaussians = 10000 + num_batches = 4 + means3D = torch.randn(num_gaussians, 3).cuda() + scales = torch.randn(num_gaussians, 3).cuda() + rotations = torch.randn(num_gaussians, 3, 3).cuda() + shs = torch.randn(num_gaussians, 9).cuda() + opacity = torch.randn(num_gaussians, 1).cuda() + + # Set up the viewpoint cameras + batched_viewpoint_cameras = [] + for _ in range(num_batches): + viewpoint_camera = type('ViewpointCamera', (), {}) + viewpoint_camera.FoVx = math.radians(60) + viewpoint_camera.FoVy = math.radians(60) + viewpoint_camera.image_height = 512 + viewpoint_camera.image_width = 512 + viewpoint_camera.world_view_transform = torch.eye(4).cuda() + viewpoint_camera.full_proj_transform = torch.eye(4).cuda() + viewpoint_camera.camera_center = torch.zeros(3).cuda() + batched_viewpoint_cameras.append(viewpoint_camera) + + # Set up the strategies + batched_strategies = [None] * num_batches + + # Set up other parameters + bg_color = torch.ones(3).cuda() + scaling_modifier = 1.0 + pc = type('PC', (), {}) + pc.active_sh_degree = 2 + pipe = type('Pipe', (), {}) + pipe.debug = False + mode = "train" + + batched_rasterizers = [] + batched_cuda_args = [] + batched_screenspace_params = [] + batched_means2D = [] + batched_radii = [] + + for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)): + ########## [START] Prepare CUDA Rasterization Settings ########## + cuda_args = get_cuda_args(strategy, mode) + batched_cuda_args.append(cuda_args) + + # Set up rasterization configuration + tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) + tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) + raster_settings = GaussianRasterizationSettings( + image_height=int(viewpoint_camera.image_height), + image_width=int(viewpoint_camera.image_width), + tanfovx=tanfovx, + tanfovy=tanfovy, + bg=bg_color, + scale_modifier=scaling_modifier, + viewmatrix=viewpoint_camera.world_view_transform, + projmatrix=viewpoint_camera.full_proj_transform, + sh_degree=pc.active_sh_degree, + campos=viewpoint_camera.camera_center, + prefiltered=False, + debug=pipe.debug + ) + rasterizer = GaussianRasterizer(raster_settings=raster_settings) + ########## [END] Prepare CUDA Rasterization Settings ########## + + #[3DGS-wise preprocess] + means2D, rgb, conic_opacity, radii, depths = rasterizer.preprocess_gaussians( + means3D=means3D, + scales=scales, + rotations=rotations, + shs=shs, + opacities=opacity, + cuda_args=cuda_args + ) + + if mode == "train": + means2D.retain_grad() + + batched_means2D.append(means2D) + screenspace_params = [means2D, rgb, conic_opacity, radii, depths] + batched_rasterizers.append(rasterizer) + batched_screenspace_params.append(screenspace_params) + batched_radii.append(radii) + + # Perform further operations with the batched results + # ... + +if __name__ == "__main__": + test_gaussian_rasterizer_time() \ No newline at end of file From b7b08baa5c6bcaf9c2bd38fb4b274fcc5198237a Mon Sep 17 00:00:00 2001 From: Prapti Devansh Trivedi Date: Sat, 20 Apr 2024 19:51:45 -0400 Subject: [PATCH 02/53] add mock of improved preproc --- diff_gaussian_rasterization/__init__.py | 25 ++++++++ rasterization_tests.py | 84 +++++++++++++++++++++++++ 2 files changed, 109 insertions(+) diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py index 7e3ad04..73ab7e9 100644 --- a/diff_gaussian_rasterization/__init__.py +++ b/diff_gaussian_rasterization/__init__.py @@ -304,6 +304,31 @@ class GaussianRasterizationSettings(NamedTuple): prefiltered : bool debug : bool +class GaussianRasterizerBatches(nn.Module): + def __init__(self, raster_settings): + super().__init__() + self.raster_settings_list = raster_settings + + def markVisible(self, positions): + # Mark visible points (based on frustum culling for camera) with a boolean + with torch.no_grad(): + visible = [] + for viewmatrix, projmatrix in zip(self.raster_settings.viewmatrix, self.raster_settings.projmatrix): + visible.append(_C.mark_visible(positions, viewmatrix, projmatrix)) + return visible + + def preprocess_gaussians(self, means3D, scales, rotations, shs, opacities, batched_cuda_args=None): + # Invoke C++/CUDA rasterization routine + + return preprocess_gaussians_batches( + means3D, + scales, + rotations, + shs, + opacities, + self.raster_settings_list, + batched_cuda_args) + class GaussianRasterizer(nn.Module): def __init__(self, raster_settings): super().__init__() diff --git a/rasterization_tests.py b/rasterization_tests.py index 055727c..1dcc91a 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -74,6 +74,88 @@ def test_gaussian_rasterizer_time(): preprocess_time = end_time - start_time print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds") +def test_improved_gaussian_rasterizer(): + + # Set up the input data + num_gaussians = 10000 + num_batches = 4 + means3D = torch.randn(num_gaussians, 3).cuda() + scales = torch.randn(num_gaussians, 3).cuda() + rotations = torch.randn(num_gaussians, 3, 3).cuda() + shs = torch.randn(num_gaussians, 9).cuda() + opacity = torch.randn(num_gaussians, 1).cuda() + + # Set up the viewpoint cameras + batched_viewpoint_cameras = [] + for _ in range(num_batches): + viewpoint_camera = type('ViewpointCamera', (), {}) + viewpoint_camera.FoVx = math.radians(60) + viewpoint_camera.FoVy = math.radians(60) + viewpoint_camera.image_height = 512 + viewpoint_camera.image_width = 512 + viewpoint_camera.world_view_transform = torch.eye(4).cuda() + viewpoint_camera.full_proj_transform = torch.eye(4).cuda() + viewpoint_camera.camera_center = torch.zeros(3).cuda() + batched_viewpoint_cameras.append(viewpoint_camera) + + # Set up the strategies + batched_strategies = [None] * num_batches + + # Set up other parameters + bg_color = torch.ones(3).cuda() + scaling_modifier = 1.0 + pc = type('PC', (), {}) + pc.active_sh_degree = 2 + pipe = type('Pipe', (), {}) + pipe.debug = False + mode = "train" + + batched_rasterizers = [] + batched_cuda_args = [] + batched_screenspace_params = [] + batched_means2D = [] + batched_radii = [] + raster_settings_list=[] + for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)): + ########## [START] Prepare CUDA Rasterization Settings ########## + cuda_args = get_cuda_args(strategy, mode) + batched_cuda_args.append(cuda_args) + + # Set up rasterization configuration + tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) + tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) + + raster_settings_list.append(GaussianRasterizationSettings( + image_height=int(viewpoint_camera.image_height), + image_width=int(viewpoint_camera.image_width), + tanfovx=tanfovx, + tanfovy=tanfovy, + bg=bg_color, + scale_modifier=scaling_modifier, + viewmatrix=viewpoint_camera.world_view_transform, + projmatrix=viewpoint_camera.full_proj_transform, + sh_degree=pc.active_sh_degree, + campos=viewpoint_camera.camera_center, + prefiltered=False, + debug=pipe.debug + )) + + + rasterizer=GaussianRasterizerBatches(raster_settings=raster_settings_list) + start_time = time.time() + batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians_batches( + means3D=means3D, + scales=scales, + rotations=rotations, + shs=shs, + opacities=opacity, + cuda_args=batched_cuda_args + ) + end_time = time.time() + + preprocess_time = end_time - start_time + print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds") + def test_batched_gaussian_rasterizer(): # Set up the input data @@ -163,5 +245,7 @@ def test_batched_gaussian_rasterizer(): # Perform further operations with the batched results # ... + + if __name__ == "__main__": test_gaussian_rasterizer_time() \ No newline at end of file From 54302e4381903500d89ee8e8914d4a3dd638b812 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Sun, 21 Apr 2024 10:20:52 -0400 Subject: [PATCH 03/53] batched rasterization --- rasterization_tests.py | 92 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 055727c..1aca6de 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -161,7 +161,95 @@ def test_batched_gaussian_rasterizer(): batched_radii.append(radii) # Perform further operations with the batched results - # ... + # Test results and performance + +def test_batched_gaussian_rasterizer_batch_processing(): + # Set up the input data + num_gaussians = 10000 + num_batches = 4 + means3D = torch.randn(num_gaussians, 3).cuda() + scales = torch.randn(num_gaussians, 3).cuda() + rotations = torch.randn(num_gaussians, 3, 3).cuda() + shs = torch.randn(num_gaussians, 9).cuda() + opacity = torch.randn(num_gaussians, 1).cuda() + + # Set up the viewpoint cameras + batched_viewpoint_cameras = [] + for _ in range(num_batches): + viewpoint_camera = type('ViewpointCamera', (), {}) + viewpoint_camera.FoVx = math.radians(60) + viewpoint_camera.FoVy = math.radians(60) + viewpoint_camera.image_height = 512 + viewpoint_camera.image_width = 512 + viewpoint_camera.world_view_transform = torch.eye(4).cuda() + viewpoint_camera.full_proj_transform = torch.eye(4).cuda() + viewpoint_camera.camera_center = torch.zeros(3).cuda() + batched_viewpoint_cameras.append(viewpoint_camera) + + # Set up the strategies + batched_strategies = [None] * num_batches + + # Set up other parameters + bg_color = torch.ones(3).cuda() + scaling_modifier = 1.0 + pc = type('PC', (), {}) + pc.active_sh_degree = 2 + pipe = type('Pipe', (), {}) + pipe.debug = False + mode = "train" + + # Set up rasterization configuration for the batch + batched_tanfovx = [math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras] + batched_tanfovy = [math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras] + batched_viewmatrix = [camera.world_view_transform for camera in batched_viewpoint_cameras] + batched_projmatrix = [camera.full_proj_transform for camera in batched_viewpoint_cameras] + batched_campos = [camera.camera_center for camera in batched_viewpoint_cameras] + + batched_raster_settings = GaussianRasterizationSettings( + image_height=int(batched_viewpoint_cameras[0].image_height), + image_width=int(batched_viewpoint_cameras[0].image_width), + tanfovx=batched_tanfovx, + tanfovy=batched_tanfovy, + bg=bg_color, + scale_modifier=scaling_modifier, + viewmatrix=batched_viewmatrix, + projmatrix=batched_projmatrix, + sh_degree=pc.active_sh_degree, + campos=batched_campos, + prefiltered=False, + debug=pipe.debug + ) + + # Create the GaussianRasterizer for the batch + rasterizer = GaussianRasterizer(raster_settings=batched_raster_settings) + + # Set up CUDA arguments for the batch + cuda_args = get_cuda_args(batched_strategies[0], mode) # TODO: Check if this is correct for the batch + + # Preprocess the Gaussians for the entire batch + batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians( + means3D=means3D, + scales=scales, + rotations=rotations, + shs=shs, + opacities=opacity, + cuda_args=cuda_args + ) + + if mode == "train": + batched_means2D.retain_grad() + + batched_screenspace_params = [batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths] + + # Perform assertions on the preprocessed data + assert batched_means2D.shape == (num_gaussians, num_batches, 2) + assert batched_rgb.shape == (num_gaussians, num_batches, 3) + assert batched_conic_opacity.shape == (num_gaussians, num_batches, 1) + assert batched_radii.shape == (num_gaussians, num_batches) + assert batched_depths.shape == (num_gaussians, num_batches) + if __name__ == "__main__": - test_gaussian_rasterizer_time() \ No newline at end of file + test_gaussian_rasterizer_time() + test_batched_gaussian_rasterizer_batch_processing() + \ No newline at end of file From 53e12d2067a0bf3fe7eb00f6e85e53ec35ac07e3 Mon Sep 17 00:00:00 2001 From: Prapti Devansh Trivedi Date: Sun, 21 Apr 2024 19:32:02 -0400 Subject: [PATCH 04/53] add rough idea for kernel --- cuda_rasterizer/rasterizer_impl.cu | 95 +++++++++++++++++++ diff_gaussian_rasterization/__init__.py | 119 ++++++++++++++++-------- ext.cpp | 1 + rasterization_tests.py | 2 +- rasterize_points.cu | 82 ++++++++++++++++ 5 files changed, 260 insertions(+), 39 deletions(-) diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu index dec20fa..1215c25 100644 --- a/cuda_rasterizer/rasterizer_impl.cu +++ b/cuda_rasterizer/rasterizer_impl.cu @@ -424,6 +424,101 @@ int CudaRasterizer::Rasterizer::preprocessForward( return num_rendered; } + +int CudaRasterizer::Rasterizer::preprocessForwardBatches( + float2* means2D, + float* depths, + int* radii, + float* cov3D, + float4* conic_opacity, + float* rgb, + bool* clamped,//the above are all per-Gaussian intemediate results. + const int P, int D, int M, + const std::vector& width, std::vector& height, + const float* means3D, + const float* scales, + const float* rotations, + const float* shs, + const float* opacities,//3dgs parameters + const std::vector& scale_modifier, + const std::vector& viewmatrix, + const std::vector& projmatrix, + const std::vector& cam_pos, + const std::vector& tan_fovx, std::vector& tan_fovy, + const std::vector& prefiltered, + std::vector& debug,//raster_settings + const std::vector &args) +{ + auto [global_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args); + char* log_tmp = new char[500]; + + // print out the environment variables + if (mode == "train" && zhx_debug && iteration % log_interval == 1) { + sprintf(log_tmp, "world_size: %d, global_rank: %d, iteration: %d, log_folder: %s, zhx_debug: %d, zhx_time: %d, device: %d, log_interval: %d, dist_division_mode: %s", + world_size, global_rank, iteration, log_folder.c_str(), zhx_debug, zhx_time, device, log_interval, dist_division_mode.c_str()); + save_log_in_file(iteration, global_rank, world_size, log_folder, "cuda", log_tmp); + } + + MyTimerOnGPU timer; + // const float focal_y = height / (2.0f * tan_fovy); + // const float focal_x = width / (2.0f * tan_fovx); + const int num_viewpoints=viewmatrix.size(); + + //CONVERT ALL VECTORS TO FLOATSSSSSS PRAPTIIIIIII + + dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, num_viewpoints); + dim3 block(BLOCK_X, BLOCK_Y, num_viewpoints); + int tile_num = tile_grid.x * tile_grid.y*tile_grid.z; + + // allocate temporary buffer for tiles_touched. + // In sep_rendering==True case, we will compute tiles_touched in the renderForward. + // TODO: remove it later by modifying FORWARD::preprocess when we deprecate sep_rendering==False case + uint32_t* tiles_touched_temp_buffer; + CHECK_CUDA(cudaMalloc(&tiles_touched_temp_buffer, P * sizeof(uint32_t)), debug); + CHECK_CUDA(cudaMemset(tiles_touched_temp_buffer, 0, P * sizeof(uint32_t)), debug); + + timer.start("10 preprocess"); + // Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB) + CHECK_CUDA(FORWARD::preprocess( + P, D, M, + means3D, + (glm::vec3*)scales, + scale_modifier, + (glm::vec4*)rotations, + opacities, + shs, + clamped, + nullptr,//cov3D_precomp, + nullptr,//colors_precomp,TODO: this is correct? + viewmatrix, projmatrix, + (glm::vec3*)cam_pos, + width, height, + focal_x, focal_y, + tan_fovx, tan_fovy, + radii, + means2D, + depths, + cov3D, + rgb, + conic_opacity, + tile_grid, + tiles_touched_temp_buffer, + prefiltered + ), debug) + timer.stop("10 preprocess"); + + int num_rendered = 0;//TODO: should I calculate this here? + + // Print out timing information + if (zhx_time && iteration % log_interval == 1) { + timer.printAllTimes(iteration, world_size, global_rank, log_folder, true); + } + delete log_tmp; + // free temporary buffer for tiles_touched. TODO: remove it. + CHECK_CUDA(cudaFree(tiles_touched_temp_buffer), debug); + return num_rendered; +} + void CudaRasterizer::Rasterizer::preprocessBackward( const int* radii, const float* cov3D, diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py index 73ab7e9..957d9cd 100644 --- a/diff_gaussian_rasterization/__init__.py +++ b/diff_gaussian_rasterization/__init__.py @@ -31,7 +31,7 @@ def preprocess_gaussians( sh, opacities, raster_settings, - cuda_args, + cuda_args,flag_batched=False ): return _PreprocessGaussians.apply( means3D, @@ -40,7 +40,7 @@ def preprocess_gaussians( sh, opacities, raster_settings, - cuda_args, + cuda_args,flag_batched ) class _PreprocessGaussians(torch.autograd.Function): @@ -52,45 +52,88 @@ def forward( rotations, sh, opacities, - raster_settings, - cuda_args, + raster_settings_list, + batched_cuda_args,flag_batched ): # Restructure arguments the way that the C++ lib expects them - args = ( - means3D, - scales, - rotations, - sh, - opacities,# 3dgs' parametes. - raster_settings.scale_modifier, - raster_settings.viewmatrix, - raster_settings.projmatrix, - raster_settings.tanfovx, - raster_settings.tanfovy, - raster_settings.image_height, - raster_settings.image_width, - raster_settings.sh_degree, - raster_settings.campos, - raster_settings.prefiltered, - raster_settings.debug,#raster_settings - cuda_args - ) - - # TODO: update this. - num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args) + if flag_batched==False: + args = ( + means3D, + scales, + rotations, + sh, + opacities,# 3dgs' parametes. + raster_settings.scale_modifier, + raster_settings.viewmatrix, + raster_settings.projmatrix, + raster_settings.tanfovx, + raster_settings.tanfovy, + raster_settings.image_height, + raster_settings.image_width, + raster_settings.sh_degree, + raster_settings.campos, + raster_settings.prefiltered, + raster_settings.debug,#raster_settings + cuda_args + ) + + # TODO: update this. + num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args) + + # Keep relevant tensors for backward + ctx.raster_settings = raster_settings + ctx.cuda_args = cuda_args + ctx.num_rendered = num_rendered + ctx.save_for_backward(means3D, scales, rotations, sh, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped) + ctx.mark_non_differentiable(radii, depths) + + # # TODO: double check. means2D is padded to (P, 3) in python. It is (P, 2) in cuda code. + # means2D_pad = torch.zeros((means2D.shape[0], 1), dtype = means2D.dtype, device = means2D.device) + # means2D = torch.cat((means2D, means2D_pad), dim = 1).contiguous() + return means2D, rgb, conic_opacity, radii, depths + + else: + args_list=[] + for raster_settings,cuda_args in zip(raster_settings_list,batched_cuda_args): + + args = ( + means3D, + scales, + rotations, + sh, + opacities,# 3dgs' parametes. + raster_settings.scale_modifier, + raster_settings.viewmatrix, + raster_settings.projmatrix, + raster_settings.tanfovx, + raster_settings.tanfovy, + raster_settings.image_height, + raster_settings.image_width, + raster_settings.sh_degree, + raster_settings.campos, + raster_settings.prefiltered, + raster_settings.debug,#raster_settings + cuda_args + ) + args_list.append(args) + + # TODO: update this. + num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians_batches(*args_list) + + # Keep relevant tensors for backward + ctx.raster_settings = raster_settings_list + ctx.cuda_args = batched_cuda_args + ctx.num_rendered = num_rendered + ctx.save_for_backward(means3D, scales, rotations, sh, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped) + ctx.mark_non_differentiable(radii, depths) + + # # TODO: double check. means2D is padded to (P, 3) in python. It is (P, 2) in cuda code. + # means2D_pad = torch.zeros((means2D.shape[0], 1), dtype = means2D.dtype, device = means2D.device) + # means2D = torch.cat((means2D, means2D_pad), dim = 1).contiguous() + return means2D, rgb, conic_opacity, radii, depths - # Keep relevant tensors for backward - ctx.raster_settings = raster_settings - ctx.cuda_args = cuda_args - ctx.num_rendered = num_rendered - ctx.save_for_backward(means3D, scales, rotations, sh, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped) - ctx.mark_non_differentiable(radii, depths) - # # TODO: double check. means2D is padded to (P, 3) in python. It is (P, 2) in cuda code. - # means2D_pad = torch.zeros((means2D.shape[0], 1), dtype = means2D.dtype, device = means2D.device) - # means2D = torch.cat((means2D, means2D_pad), dim = 1).contiguous() - return means2D, rgb, conic_opacity, radii, depths @staticmethod # TODO: gradient for conic_opacity is tricky. because cuda render backward generate dL_dconic and dL_dopacity sperately. def backward(ctx, grad_means2D, grad_rgb, grad_conic_opacity, grad_radii, grad_depths): @@ -320,14 +363,14 @@ def markVisible(self, positions): def preprocess_gaussians(self, means3D, scales, rotations, shs, opacities, batched_cuda_args=None): # Invoke C++/CUDA rasterization routine - return preprocess_gaussians_batches( + return preprocess_gaussians( means3D, scales, rotations, shs, opacities, self.raster_settings_list, - batched_cuda_args) + batched_cuda_args,True) class GaussianRasterizer(nn.Module): def __init__(self, raster_settings): diff --git a/ext.cpp b/ext.cpp index a957cd2..e4249bb 100644 --- a/ext.cpp +++ b/ext.cpp @@ -16,6 +16,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("mark_visible", &markVisible); m.def("preprocess_gaussians", &PreprocessGaussiansCUDA); + m.def("preprocess_gaussians_batched", &PreprocessGaussiansCUDABatches); m.def("preprocess_gaussians_backward", &PreprocessGaussiansBackwardCUDA); m.def("get_distribution_strategy", &GetDistributionStrategyCUDA); m.def("render_gaussians", &RenderGaussiansCUDA); diff --git a/rasterization_tests.py b/rasterization_tests.py index 1dcc91a..b26f3cd 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -143,7 +143,7 @@ def test_improved_gaussian_rasterizer(): rasterizer=GaussianRasterizerBatches(raster_settings=raster_settings_list) start_time = time.time() - batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians_batches( + batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians( means3D=means3D, scales=scales, rotations=rotations, diff --git a/rasterize_points.cu b/rasterize_points.cu index e4400a6..9a3fb7b 100644 --- a/rasterize_points.cu +++ b/rasterize_points.cu @@ -142,6 +142,88 @@ PreprocessGaussiansCUDA( return std::make_tuple(rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped); } +std::tuple +PreprocessGaussiansCUDABatches( + const torch::Tensor& means3D, + const torch::Tensor& scales, + const torch::Tensor& rotations, + const torch::Tensor& sh, + const torch::Tensor& opacity,//3dgs' parametes. + const std::vector& scale_modifier, + const std::vector& viewmatrix, + const std::vector& projmatrix, + const std::vector& tan_fovx, + const std::vector& tan_fovy, + const std::vector& image_height, + const std::vector& image_width, + const std::vector& degree, + const std::vector& campos, + const std::vector& prefiltered,//raster_settings + const std::vector& debug, + const std::vector &args) { + + if (means3D.ndimension() != 2 || means3D.size(1) != 3) { + AT_ERROR("means3D must have dimensions (num_points, 3)"); + } + + const int P = means3D.size(0); + // const int H = image_height; + // const int W = image_width; + + // of shape (P, 2). means2D is (P, 2) in cuda. It will be converted to (P, 3) when is sent back to python to meet torch graph's requirement. + torch::Tensor means2D = torch::full({P, 2}, 0.0, means3D.options());//TODO: what about require_grads? + // of shape (P) + torch::Tensor depths = torch::full({P}, 0.0, means3D.options()); + // of shape (P) + torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32)); + // of shape (P, 6) + torch::Tensor cov3D = torch::full({P, 6}, 0.0, means3D.options()); + // of shape (P, 4) + torch::Tensor conic_opacity = torch::full({P, 4}, 0.0, means3D.options()); + // of shape (P, 3) + torch::Tensor rgb = torch::full({P, 3}, 0.0, means3D.options()); + // of shape (P) + torch::Tensor clamped = torch::full({P, 3}, false, means3D.options().dtype(at::kBool)); + //TODO: compare to original GeometryState implementation, this one does not explicitly do gpu memory alignment. + //That may lead to problems. However, pytorch does implicit memory alignment. + + int rendered = 0;//TODO: I could compute rendered here by summing up geomState.tiles_touched. + if(P != 0) + { + int M = 0; + if(sh.size(0) != 0) + { + M = sh.size(1); + } + + rendered = CudaRasterizer::Rasterizer::preprocessForwardBatches( + reinterpret_cast(means2D.contiguous().data()),//TODO: check whether it supports float2? + depths.contiguous().data(), + radii.contiguous().data(), + cov3D.contiguous().data(), + reinterpret_cast(conic_opacity.contiguous().data()), + rgb.contiguous().data(), + clamped.contiguous().data(), + P, degree, M, + image_width, image_height, + means3D, + scales, + rotations, + sh, + opacity, + scale_modifier, + viewmatrix, + projmatrix, + campos, + tan_fovx, + tan_fovy, + prefiltered, + debug, + args); + } + return std::make_tuple(rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped); +} + std::tuple PreprocessGaussiansBackwardCUDA( From f0e0469a7141a53cdb6cde62eba59f173b468beb Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Sun, 21 Apr 2024 22:23:59 -0400 Subject: [PATCH 05/53] Refactor rasterizer import in rasterization_tests.py --- rasterization_tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index f5100f7..ccbde17 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -6,6 +6,7 @@ from diff_gaussian_rasterization import ( GaussianRasterizationSettings, GaussianRasterizer, + GaussianRasterizerBatches, ) @@ -303,7 +304,7 @@ def test_batched_gaussian_rasterizer_batch_processing(): ) # Create the GaussianRasterizer for the batch - rasterizer = GaussianRasterizer(raster_settings=batched_raster_settings) + rasterizer = GaussianRasterizerBatches(raster_settings=batched_raster_settings) # Set up CUDA arguments for the batch cuda_args = get_cuda_args(batched_strategies[0], mode) # TODO: Check if this is correct for the batch From a16acd034afe77f91f29528d708cc94078e68dfa Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Sun, 21 Apr 2024 22:46:43 -0400 Subject: [PATCH 06/53] Refactor GaussianRasterizerBatches class to support batched preprocess_gaussians function. --- diff_gaussian_rasterization/__init__.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py index 73ab7e9..64a826f 100644 --- a/diff_gaussian_rasterization/__init__.py +++ b/diff_gaussian_rasterization/__init__.py @@ -9,11 +9,14 @@ # For inquiries contact george.drettakis@inria.fr # +import time from typing import NamedTuple -import torch.nn as nn + import torch +import torch.nn as nn + from . import _C -import time + def cpu_deep_copy_tuple(input_tuple): copied_tensors = [item.cpu().clone() if isinstance(item, torch.Tensor) else item for item in input_tuple] @@ -78,7 +81,11 @@ def forward( ) # TODO: update this. - num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args) + batch_size = len(raster_settings.tanfovx) if isinstance(raster_settings.tanfovx, list) else 1 + if batch_size == 1: + num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args) + else: + num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians_batched(*args) # Keep relevant tensors for backward ctx.raster_settings = raster_settings @@ -320,7 +327,7 @@ def markVisible(self, positions): def preprocess_gaussians(self, means3D, scales, rotations, shs, opacities, batched_cuda_args=None): # Invoke C++/CUDA rasterization routine - return preprocess_gaussians_batches( + return preprocess_gaussians( means3D, scales, rotations, From 268f46a5cd05f0608de1b7aef6755da3bc8c0765 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Sun, 21 Apr 2024 22:54:13 -0400 Subject: [PATCH 07/53] Refactor preprocess_gaussians function to remove flag_batched parameter in __init__.py --- diff_gaussian_rasterization/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py index 8152d7f..ca95f73 100644 --- a/diff_gaussian_rasterization/__init__.py +++ b/diff_gaussian_rasterization/__init__.py @@ -34,7 +34,7 @@ def preprocess_gaussians( sh, opacities, raster_settings, - cuda_args,flag_batched=False + cuda_args ): return _PreprocessGaussians.apply( means3D, @@ -43,7 +43,7 @@ def preprocess_gaussians( sh, opacities, raster_settings, - cuda_args,flag_batched + cuda_args ) class _PreprocessGaussians(torch.autograd.Function): @@ -336,7 +336,7 @@ def preprocess_gaussians(self, means3D, scales, rotations, shs, opacities, batch shs, opacities, self.raster_settings_list, - batched_cuda_args,True) + batched_cuda_args) class GaussianRasterizer(nn.Module): def __init__(self, raster_settings): From 7361323a6c84edc4b17673e763d8731e3e7bb066 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Mon, 22 Apr 2024 17:12:36 -0400 Subject: [PATCH 08/53] batched forward pass kernel --- cuda_rasterizer/forward.cu | 149 +++++++++++++++++++++++++++++ cuda_rasterizer/rasterizer_impl.cu | 24 ++--- rasterization_tests.py | 10 +- rasterize_points.cu | 68 +++++++------ 4 files changed, 196 insertions(+), 55 deletions(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 9362275..102c6d9 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -498,4 +498,153 @@ void FORWARD::preprocess(int P, int D, int M, tiles_touched, prefiltered ); +} + + +template +__global__ void preprocessCUDABatched( + int P, int D, int M, + const float* orig_points, const glm::vec3* scales, const float scale_modifier, + const glm::vec4* rotations, const float* opacities, const float* shs, + bool* clamped, const float* cov3D_precomp, const float* colors_precomp, + const float* viewmatrix_arr, const float* projmatrix_arr, const glm::vec3* cam_pos, + const int W, int H, const float* focal_x, const float* focal_y, + const float* tan_fovx, const float* tan_fovy, + int* radii, float2* points_xy_image, float* depths, float* cov3Ds, + float* rgb, float4* conic_opacity, const dim3 grid, uint32_t* tiles_touched, + bool prefiltered, int num_viewpoints) +{ + auto point_idx = cg::this_grid().thread_rank(); + auto viewpoint_idx = blockIdx.z; + + if (viewpoint_idx >= num_viewpoints || point_idx >= P) return; + + auto idx = viewpoint_idx * P + point_idx; + const float* viewmatrix = viewmatrix_arr + viewpoint_idx * 16; + const float* projmatrix = projmatrix_arr + viewpoint_idx * 16; + + // Initialize radius and touched tiles to 0. If this isn't changed, + // this Gaussian will not be processed further. + radii[idx] = 0; + tiles_touched[idx] = 0; + + // Perform near culling, quit if outside. + float3 p_view; + if (!in_frustum(point_idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) return; + + // Transform point by projecting + float3 p_orig = { orig_points[3 * point_idx], orig_points[3 * point_idx + 1], orig_points[3 * point_idx + 2] }; + float4 p_hom = transformPoint4x4(p_orig, projmatrix); + float p_w = 1.0f / (p_hom.w + 0.0000001f); + float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; + + // If 3D covariance matrix is precomputed, use it, otherwise compute + // from scaling and rotation parameters. + const float* cov3D; + if (cov3D_precomp != nullptr) { + cov3D = cov3D_precomp + point_idx * 6; + } else { + computeCov3D(scales[point_idx], scale_modifier, rotations[point_idx], cov3Ds + idx * 6); + cov3D = cov3Ds + idx * 6; + } + + // Compute 2D screen-space covariance matrix + const float focal_x = W / (2.0f * tan_fovx[viewpoint_idx]); + const float focal_y = H / (2.0f * tan_fovy[viewpoint_idx]); + float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx[viewpoint_idx], tan_fovy[viewpoint_idx], cov3D, viewmatrix); + + // Invert covariance (EWA algorithm) + float det = (cov.x * cov.z - cov.y * cov.y); + if (det == 0.0f) return; + float det_inv = 1.f / det; + float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv }; + + // Compute extent in screen space (by finding eigenvalues of + // 2D covariance matrix). Use extent to compute a bounding rectangle + // of screen-space tiles that this Gaussian overlaps with. Quit if + // rectangle covers 0 tiles. + float mid = 0.5f * (cov.x + cov.z); + float lambda1 = mid + sqrt(max(0.1f, mid * mid - det)); + float lambda2 = mid - sqrt(max(0.1f, mid * mid - det)); + float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2))); + float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) }; + uint2 rect_min, rect_max; + getRect(point_image, my_radius, rect_min, rect_max, grid); + if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0) return; + + // If colors have been precomputed, use them, otherwise convert + // spherical harmonics coefficients to RGB color. + if (colors_precomp == nullptr) { + glm::vec3 result = computeColorFromSH(point_idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3); + rgb[idx * C + 0] = result.x; + rgb[idx * C + 1] = result.y; + rgb[idx * C + 2] = result.z; + } + + // Store some useful helper data for the next steps. + depths[idx] = p_view.z; + radii[idx] = my_radius; + points_xy_image[idx] = point_image; + + // Inverse 2D covariance and opacity neatly pack into one float4 + conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[point_idx] }; + tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x); +} + +void FORWARD::preprocess(int P, int D, int M, + const float* means3D, + const glm::vec3* scales, + const float scale_modifier, + const glm::vec4* rotations, + const float* opacities, + const float* shs, + bool* clamped, + const float* cov3D_precomp, + const float* colors_precomp, + const float* viewmatrix, + const float* projmatrix, + const glm::vec3* cam_pos, + const int W, int H, + const float focal_x, float focal_y, + const float tan_fovx, float tan_fovy, + int* radii, + float2* means2D, + float* depths, + float* cov3Ds, + float* rgb, + float4* conic_opacity, + const dim3 grid, + uint32_t* tiles_touched, + bool prefiltered, + int num_viewpoints) +{ + dim3 block(BLOCK_X, BLOCK_Y, 1); + dim3 grid((P + BLOCK_X - 1) / BLOCK_X, 1, num_viewpoints); + preprocessCUDABatched<<>>( + P, D, M, + means3D, + scales, + scale_modifier, + rotations, + opacities, + shs, + clamped, + cov3D_precomp, + colors_precomp, + viewmatrix, + projmatrix, + cam_pos, + W, H, + tan_fovx, tan_fovy, + focal_x, focal_y, + radii, + means2D, + depths, + cov3Ds, + rgb, + conic_opacity, + grid, + tiles_touched, + prefiltered, + ); } \ No newline at end of file diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu index 1215c25..09a82b6 100644 --- a/cuda_rasterizer/rasterizer_impl.cu +++ b/cuda_rasterizer/rasterizer_impl.cu @@ -434,20 +434,21 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches( float* rgb, bool* clamped,//the above are all per-Gaussian intemediate results. const int P, int D, int M, - const std::vector& width, std::vector& height, + const int width, int height, const float* means3D, const float* scales, const float* rotations, const float* shs, const float* opacities,//3dgs parameters - const std::vector& scale_modifier, - const std::vector& viewmatrix, - const std::vector& projmatrix, - const std::vector& cam_pos, - const std::vector& tan_fovx, std::vector& tan_fovy, - const std::vector& prefiltered, - std::vector& debug,//raster_settings - const std::vector &args) + const float scale_modifier, + const float* viewmatrix, + const float* projmatrix, + const float* cam_pos, + const float* tan_fovx, float* tan_fovy, + const bool prefiltered, + const int num_viewpoints, + bool debug,//raster_settings + const pybind11::dict &args) { auto [global_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args); char* log_tmp = new char[500]; @@ -462,14 +463,9 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches( MyTimerOnGPU timer; // const float focal_y = height / (2.0f * tan_fovy); // const float focal_x = width / (2.0f * tan_fovx); - const int num_viewpoints=viewmatrix.size(); //CONVERT ALL VECTORS TO FLOATSSSSSS PRAPTIIIIIII - dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, num_viewpoints); - dim3 block(BLOCK_X, BLOCK_Y, num_viewpoints); - int tile_num = tile_grid.x * tile_grid.y*tile_grid.z; - // allocate temporary buffer for tiles_touched. // In sep_rendering==True case, we will compute tiles_touched in the renderForward. // TODO: remove it later by modifying FORWARD::preprocess when we deprecate sep_rendering==False case diff --git a/rasterization_tests.py b/rasterization_tests.py index 7c8a93b..10da643 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -282,11 +282,11 @@ def test_batched_gaussian_rasterizer_batch_processing(): mode = "train" # Set up rasterization configuration for the batch - batched_tanfovx = [math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras] - batched_tanfovy = [math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras] - batched_viewmatrix = [camera.world_view_transform for camera in batched_viewpoint_cameras] - batched_projmatrix = [camera.full_proj_transform for camera in batched_viewpoint_cameras] - batched_campos = [camera.camera_center for camera in batched_viewpoint_cameras] + batched_tanfovx = torch.stack([math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras]) + batched_tanfovy = torch.stack([math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras]) + batched_viewmatrix = torch.stack([camera.world_view_transform for camera in batched_viewpoint_cameras]) + batched_projmatrix = torch.stack([camera.full_proj_transform for camera in batched_viewpoint_cameras]) + batched_campos = torch.stack([camera.camera_center for camera in batched_viewpoint_cameras]) batched_raster_settings = GaussianRasterizationSettings( image_height=int(batched_viewpoint_cameras[0].image_height), diff --git a/rasterize_points.cu b/rasterize_points.cu index 9a3fb7b..e8eb8a7 100644 --- a/rasterize_points.cu +++ b/rasterize_points.cu @@ -149,52 +149,47 @@ PreprocessGaussiansCUDABatches( const torch::Tensor& rotations, const torch::Tensor& sh, const torch::Tensor& opacity,//3dgs' parametes. - const std::vector& scale_modifier, - const std::vector& viewmatrix, - const std::vector& projmatrix, - const std::vector& tan_fovx, - const std::vector& tan_fovy, - const std::vector& image_height, - const std::vector& image_width, - const std::vector& degree, - const std::vector& campos, - const std::vector& prefiltered,//raster_settings - const std::vector& debug, - const std::vector &args) { + const float scale_modifier, + const torch::Tensor& viewmatrix, + const torch::Tensor& projmatrix, + const torch::Tensor& tan_fovx, + const torch::Tensor& tan_fovy, + const int image_height, + const int image_width, + const int degree, + const torch::Tensor& campos, + const bool prefiltered,//raster_settings + const bool debug, + const pybind11::dict &args) { if (means3D.ndimension() != 2 || means3D.size(1) != 3) { AT_ERROR("means3D must have dimensions (num_points, 3)"); } const int P = means3D.size(0); - // const int H = image_height; - // const int W = image_width; + const int num_viewpoints = viewmatrix.size(0); // of shape (P, 2). means2D is (P, 2) in cuda. It will be converted to (P, 3) when is sent back to python to meet torch graph's requirement. - torch::Tensor means2D = torch::full({P, 2}, 0.0, means3D.options());//TODO: what about require_grads? + torch::Tensor means2D = torch::full({num_viewpoints, P, 2}, 0.0, means3D.options());//TODO: what about require_grads? // of shape (P) - torch::Tensor depths = torch::full({P}, 0.0, means3D.options()); + torch::Tensor depths = torch::full({num_viewpoints, P}, 0.0, means3D.options()); // of shape (P) - torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32)); + torch::Tensor radii = torch::full({num_viewpoints, P}, 0, means3D.options().dtype(torch::kInt32)); // of shape (P, 6) - torch::Tensor cov3D = torch::full({P, 6}, 0.0, means3D.options()); + torch::Tensor cov3D = torch::full({num_viewpoints, P, 6}, 0.0, means3D.options()); // of shape (P, 4) - torch::Tensor conic_opacity = torch::full({P, 4}, 0.0, means3D.options()); + torch::Tensor conic_opacity = torch::full({num_viewpoints, P, 4}, 0.0, means3D.options()); // of shape (P, 3) - torch::Tensor rgb = torch::full({P, 3}, 0.0, means3D.options()); + torch::Tensor rgb = torch::full({num_viewpoints, P, 3}, 0.0, means3D.options()); // of shape (P) - torch::Tensor clamped = torch::full({P, 3}, false, means3D.options().dtype(at::kBool)); + torch::Tensor clamped = torch::full({num_viewpoints, P, 3}, false, means3D.options().dtype(at::kBool)); //TODO: compare to original GeometryState implementation, this one does not explicitly do gpu memory alignment. //That may lead to problems. However, pytorch does implicit memory alignment. int rendered = 0;//TODO: I could compute rendered here by summing up geomState.tiles_touched. if(P != 0) { - int M = 0; - if(sh.size(0) != 0) - { - M = sh.size(1); - } + int M = sh.size(0) != 0 ? sh.size(1) : 0; rendered = CudaRasterizer::Rasterizer::preprocessForwardBatches( reinterpret_cast(means2D.contiguous().data()),//TODO: check whether it supports float2? @@ -206,18 +201,19 @@ PreprocessGaussiansCUDABatches( clamped.contiguous().data(), P, degree, M, image_width, image_height, - means3D, - scales, - rotations, - sh, - opacity, + means3D.contiguous().data(), + scales.contiguous().data_ptr(), + rotations.contiguous().data_ptr(), + sh.contiguous().data_ptr(), + opacity.contiguous().data(), scale_modifier, - viewmatrix, - projmatrix, - campos, - tan_fovx, - tan_fovy, + viewmatrix.contiguous().data(), + projmatrix.contiguous().data(), + campos.contiguous().data(), + tan_fovx.contiguous().data(), + tan_fovy.contiguous().data(), prefiltered, + num_viewpoints, debug, args); } From 7f4935d86ca48772e57e1356b9ef57a5a5c5e0b0 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Tue, 23 Apr 2024 00:21:58 -0400 Subject: [PATCH 09/53] added headers and changed kernel structure to 1d block --- cuda_rasterizer/forward.cu | 17 ++++++--------- cuda_rasterizer/forward.h | 33 ++++++++++++++++++++++++++++-- cuda_rasterizer/rasterizer_impl.cu | 10 ++++----- rasterize_points.h | 20 ++++++++++++++++++ 4 files changed, 62 insertions(+), 18 deletions(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 102c6d9..156985b 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -471,7 +471,7 @@ void FORWARD::preprocess(int P, int D, int M, uint32_t* tiles_touched, bool prefiltered) { - preprocessCUDA << <(P + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >> > ( + preprocessCUDA << > > ( P, D, M, means3D, scales, @@ -508,14 +508,13 @@ __global__ void preprocessCUDABatched( const glm::vec4* rotations, const float* opacities, const float* shs, bool* clamped, const float* cov3D_precomp, const float* colors_precomp, const float* viewmatrix_arr, const float* projmatrix_arr, const glm::vec3* cam_pos, - const int W, int H, const float* focal_x, const float* focal_y, - const float* tan_fovx, const float* tan_fovy, + const int W, int H, const float* tan_fovx, const float* tan_fovy, int* radii, float2* points_xy_image, float* depths, float* cov3Ds, float* rgb, float4* conic_opacity, const dim3 grid, uint32_t* tiles_touched, bool prefiltered, int num_viewpoints) { auto point_idx = cg::this_grid().thread_rank(); - auto viewpoint_idx = blockIdx.z; + auto viewpoint_idx = blockIdx.y; if (viewpoint_idx >= num_viewpoints || point_idx >= P) return; @@ -591,7 +590,7 @@ __global__ void preprocessCUDABatched( tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x); } -void FORWARD::preprocess(int P, int D, int M, +void FORWARD::preprocess_batch(int P, int D, int M, const float* means3D, const glm::vec3* scales, const float scale_modifier, @@ -605,8 +604,7 @@ void FORWARD::preprocess(int P, int D, int M, const float* projmatrix, const glm::vec3* cam_pos, const int W, int H, - const float focal_x, float focal_y, - const float tan_fovx, float tan_fovy, + const float* tan_fovx, float* tan_fovy, int* radii, float2* means2D, float* depths, @@ -618,9 +616,7 @@ void FORWARD::preprocess(int P, int D, int M, bool prefiltered, int num_viewpoints) { - dim3 block(BLOCK_X, BLOCK_Y, 1); - dim3 grid((P + BLOCK_X - 1) / BLOCK_X, 1, num_viewpoints); - preprocessCUDABatched<<>>( + preprocessCUDABatched<<>>( P, D, M, means3D, scales, @@ -636,7 +632,6 @@ void FORWARD::preprocess(int P, int D, int M, cam_pos, W, H, tan_fovx, tan_fovy, - focal_x, focal_y, radii, means2D, depths, diff --git a/cuda_rasterizer/forward.h b/cuda_rasterizer/forward.h index 86e5cb9..0f036d9 100644 --- a/cuda_rasterizer/forward.h +++ b/cuda_rasterizer/forward.h @@ -45,7 +45,35 @@ namespace FORWARD float4* conic_opacity, const dim3 grid, uint32_t* tiles_touched, - bool prefiltered); + bool prefiltered + ); + + void preprocess_batch(int P, int D, int M, + const float* means3D, + const glm::vec3* scales, + const float scale_modifier, + const glm::vec4* rotations, + const float* opacities, + const float* shs, + bool* clamped, + const float* cov3D_precomp, + const float* colors_precomp, + const float* viewmatrix, + const float* projmatrix, + const glm::vec3* cam_pos, + const int W, int H, + const float* tan_fovx, float* tan_fovy, + int* radii, + float2* means2D, + float* depths, + float* cov3Ds, + float* rgb, + float4* conic_opacity, + const dim3 grid, + uint32_t* tiles_touched, + bool prefiltered, + int num_viewpoints + ); // Main rasterization method. void render( @@ -61,7 +89,8 @@ namespace FORWARD uint32_t* n_contrib2loss, const int* compute_locally_1D_2D_map, const float* bg_color, - float* out_color); + float* out_color + ); } diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu index 09a82b6..4aca70e 100644 --- a/cuda_rasterizer/rasterizer_impl.cu +++ b/cuda_rasterizer/rasterizer_impl.cu @@ -461,11 +461,11 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches( } MyTimerOnGPU timer; - // const float focal_y = height / (2.0f * tan_fovy); - // const float focal_x = width / (2.0f * tan_fovx); //CONVERT ALL VECTORS TO FLOATSSSSSS PRAPTIIIIIII + dim3 tile_grid(cdiv(P, ONE_DIM_BLOCK_SIZE), num_viewpoints); + // allocate temporary buffer for tiles_touched. // In sep_rendering==True case, we will compute tiles_touched in the renderForward. // TODO: remove it later by modifying FORWARD::preprocess when we deprecate sep_rendering==False case @@ -475,7 +475,7 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches( timer.start("10 preprocess"); // Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB) - CHECK_CUDA(FORWARD::preprocess( + CHECK_CUDA(FORWARD::preprocess_batch( P, D, M, means3D, (glm::vec3*)scales, @@ -486,10 +486,10 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches( clamped, nullptr,//cov3D_precomp, nullptr,//colors_precomp,TODO: this is correct? - viewmatrix, projmatrix, + viewmatrix, + projmatrix, (glm::vec3*)cam_pos, width, height, - focal_x, focal_y, tan_fovx, tan_fovy, radii, means2D, diff --git a/rasterize_points.h b/rasterize_points.h index 86798ec..3700126 100644 --- a/rasterize_points.h +++ b/rasterize_points.h @@ -49,6 +49,26 @@ PreprocessGaussiansCUDA( const bool debug, const pybind11::dict &args); +std::tuple +PreprocessGaussiansCUDABatches( + const torch::Tensor& means3D, + const torch::Tensor& scales, + const torch::Tensor& rotations, + const torch::Tensor& sh, + const torch::Tensor& opacity,//3dgs' parametes. + const float scale_modifier, + const torch::Tensor& viewmatrix, + const torch::Tensor& projmatrix, + const torch::Tensor& tan_fovx, + const torch::Tensor& tan_fovy, + const int image_height, + const int image_width, + const int degree, + const torch::Tensor& campos, + const bool prefiltered,//raster_settings + const bool debug, + const pybind11::dict &args); + std::tuple PreprocessGaussiansBackwardCUDA( const torch::Tensor& radii, From 5f05af5b0f5d689cf219afd2a5a868c7d360fc2a Mon Sep 17 00:00:00 2001 From: Prapti Devansh Trivedi Date: Tue, 23 Apr 2024 15:14:05 -0400 Subject: [PATCH 10/53] solved syntax errors --- cuda_rasterizer/forward.cu | 5 +++-- cuda_rasterizer/forward.h | 2 +- cuda_rasterizer/rasterizer.h | 25 +++++++++++++++++++++++++ cuda_rasterizer/rasterizer_impl.cu | 3 ++- rasterization_tests.py | 6 +----- 5 files changed, 32 insertions(+), 9 deletions(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 156985b..1e3e119 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -511,7 +511,7 @@ __global__ void preprocessCUDABatched( const int W, int H, const float* tan_fovx, const float* tan_fovy, int* radii, float2* points_xy_image, float* depths, float* cov3Ds, float* rgb, float4* conic_opacity, const dim3 grid, uint32_t* tiles_touched, - bool prefiltered, int num_viewpoints) + bool prefiltered, const int num_viewpoints) { auto point_idx = cg::this_grid().thread_rank(); auto viewpoint_idx = blockIdx.y; @@ -614,7 +614,7 @@ void FORWARD::preprocess_batch(int P, int D, int M, const dim3 grid, uint32_t* tiles_touched, bool prefiltered, - int num_viewpoints) + const int num_viewpoints) { preprocessCUDABatched<<>>( P, D, M, @@ -641,5 +641,6 @@ void FORWARD::preprocess_batch(int P, int D, int M, grid, tiles_touched, prefiltered, + num_viewpoints ); } \ No newline at end of file diff --git a/cuda_rasterizer/forward.h b/cuda_rasterizer/forward.h index 0f036d9..6c4a1da 100644 --- a/cuda_rasterizer/forward.h +++ b/cuda_rasterizer/forward.h @@ -72,7 +72,7 @@ namespace FORWARD const dim3 grid, uint32_t* tiles_touched, bool prefiltered, - int num_viewpoints + const int num_viewpoints ); // Main rasterization method. diff --git a/cuda_rasterizer/rasterizer.h b/cuda_rasterizer/rasterizer.h index ddc989c..4ad1bf8 100644 --- a/cuda_rasterizer/rasterizer.h +++ b/cuda_rasterizer/rasterizer.h @@ -65,6 +65,31 @@ namespace CudaRasterizer bool debug,//raster_settings const pybind11::dict &args); + static int preprocessForwardBatches( + float2* means2D, + float* depths, + int* radii, + float* cov3D, + float4* conic_opacity, + float* rgb, + bool* clamped,//the above are all per-Gaussian intemediate results. + const int P, int D, int M, + const int width, int height, + const float* means3D, + const float* scales, + const float* rotations, + const float* shs, + const float* opacities,//3dgs parameters + const float scale_modifier, + const float* viewmatrix, + const float* projmatrix, + const float* cam_pos, + const float* tan_fovx, float* tan_fovy, + const bool prefiltered, + const int num_viewpoints, + bool debug,//raster_settings + const pybind11::dict &args); + static void preprocessBackward( const int* radii, const float* cov3D, diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu index 4aca70e..2f43492 100644 --- a/cuda_rasterizer/rasterizer_impl.cu +++ b/cuda_rasterizer/rasterizer_impl.cu @@ -499,7 +499,8 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches( conic_opacity, tile_grid, tiles_touched_temp_buffer, - prefiltered + prefiltered, + num_viewpoints ), debug) timer.stop("10 preprocess"); diff --git a/rasterization_tests.py b/rasterization_tests.py index 10da643..cb65cdb 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -3,11 +3,7 @@ import torch -from diff_gaussian_rasterization import ( - GaussianRasterizationSettings, - GaussianRasterizer, - GaussianRasterizerBatches, -) +import diff_gaussian_rasterization import (GaussianRasterizationSettings,GaussianRasterizer,GaussianRasterizerBatches) def get_cuda_args(strategy, mode="train"): From 543d4b85d47491c1b91acd8ef7fad8eb8de71fb1 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Tue, 23 Apr 2024 22:20:28 -0400 Subject: [PATCH 11/53] fixed import syntax in test --- rasterization_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index cb65cdb..88277ef 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -3,7 +3,7 @@ import torch -import diff_gaussian_rasterization import (GaussianRasterizationSettings,GaussianRasterizer,GaussianRasterizerBatches) +from diff_gaussian_rasterization import (GaussianRasterizationSettings,GaussianRasterizer,GaussianRasterizerBatches) def get_cuda_args(strategy, mode="train"): @@ -333,4 +333,4 @@ def test_batched_gaussian_rasterizer_batch_processing(): if __name__ == "__main__": test_gaussian_rasterizer_time() test_batched_gaussian_rasterizer_batch_processing() - \ No newline at end of file + From 8ca5a9fb07215f0e52549dcaa6a115a845f23fa4 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Tue, 23 Apr 2024 23:00:23 -0400 Subject: [PATCH 12/53] formatting changes --- rasterization_tests.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 88277ef..90f18b9 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -3,7 +3,11 @@ import torch -from diff_gaussian_rasterization import (GaussianRasterizationSettings,GaussianRasterizer,GaussianRasterizerBatches) +from diff_gaussian_rasterization import ( + GaussianRasterizationSettings, + GaussianRasterizer, + GaussianRasterizerBatches, +) def get_cuda_args(strategy, mode="train"): From 0dbe8fd4f4b116146c5e8e3128047b54fea2d1be Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Tue, 23 Apr 2024 23:14:34 -0400 Subject: [PATCH 13/53] Refactor GaussianRasterizerBatches class to use torch.tensor instead of math.tan in test_batched_gaussian_rasterizer_batch_processing function --- rasterization_tests.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 90f18b9..0200210 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -234,8 +234,8 @@ def test_batched_gaussian_rasterizer(): cuda_args=cuda_args ) - if mode == "train": - means2D.retain_grad() + # if mode == "train": + # means2D.retain_grad() batched_means2D.append(means2D) screenspace_params = [means2D, rgb, conic_opacity, radii, depths] @@ -282,8 +282,8 @@ def test_batched_gaussian_rasterizer_batch_processing(): mode = "train" # Set up rasterization configuration for the batch - batched_tanfovx = torch.stack([math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras]) - batched_tanfovy = torch.stack([math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras]) + batched_tanfovx = torch.tensor([math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras]) + batched_tanfovy = torch.tensor([math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras]) batched_viewmatrix = torch.stack([camera.world_view_transform for camera in batched_viewpoint_cameras]) batched_projmatrix = torch.stack([camera.full_proj_transform for camera in batched_viewpoint_cameras]) batched_campos = torch.stack([camera.camera_center for camera in batched_viewpoint_cameras]) @@ -335,6 +335,6 @@ def test_batched_gaussian_rasterizer_batch_processing(): if __name__ == "__main__": - test_gaussian_rasterizer_time() + test_batched_gaussian_rasterizer() test_batched_gaussian_rasterizer_batch_processing() From 193fa821e57fdba31b7f309f00c4fdbe5f7d1b3b Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Tue, 23 Apr 2024 23:20:44 -0400 Subject: [PATCH 14/53] Refactor variable name in test_batched_gaussian_rasterizer_batch_processing function --- rasterization_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 0200210..7ba64a6 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -316,7 +316,7 @@ def test_batched_gaussian_rasterizer_batch_processing(): rotations=rotations, shs=shs, opacities=opacity, - cuda_args=cuda_args + batched_cuda_args=cuda_args ) if mode == "train": From ac43fc44625ea62a1e6fed753d08f1ba7db52caa Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Tue, 23 Apr 2024 23:26:12 -0400 Subject: [PATCH 15/53] Refactor preprocess_gaussians function to handle batched and non-batched inputs in __init__.py --- diff_gaussian_rasterization/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py index ca95f73..e73be82 100644 --- a/diff_gaussian_rasterization/__init__.py +++ b/diff_gaussian_rasterization/__init__.py @@ -9,7 +9,6 @@ # For inquiries contact george.drettakis@inria.fr # -import time from typing import NamedTuple import torch @@ -81,7 +80,7 @@ def forward( ) # TODO: update this. - batch_size = len(raster_settings.tanfovx) if isinstance(raster_settings.tanfovx, list) else 1 + batch_size = raster_settings.tanfovx.shape[0] if torch.is_tensor(raster_settings.tanfovx) else 1 if batch_size == 1: num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args) else: From 4115266c5a2f4d88bdddfa632945a8bc68d8cf1c Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Tue, 23 Apr 2024 23:26:19 -0400 Subject: [PATCH 16/53] Refactor test_batched_gaussian_rasterizer and test_batched_gaussian_rasterizer_batch_processing functions in rasterization_tests.py --- rasterization_tests.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/rasterization_tests.py b/rasterization_tests.py index 7ba64a6..93248df 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -199,6 +199,8 @@ def test_batched_gaussian_rasterizer(): batched_means2D = [] batched_radii = [] + start_time = time.time() + for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)): ########## [START] Prepare CUDA Rasterization Settings ########## cuda_args = get_cuda_args(strategy, mode) @@ -243,6 +245,9 @@ def test_batched_gaussian_rasterizer(): batched_screenspace_params.append(screenspace_params) batched_radii.append(radii) + end_time = time.time() + preprocess_time = end_time - start_time + print(f"Time taken by test_batched_gaussian_rasterizer: {preprocess_time:.4f} seconds") # Perform further operations with the batched results # Test results and performance @@ -256,6 +261,7 @@ def test_batched_gaussian_rasterizer_batch_processing(): shs = torch.randn(num_gaussians, 9).cuda() opacity = torch.randn(num_gaussians, 1).cuda() + start_time = time.time() # Set up the viewpoint cameras batched_viewpoint_cameras = [] for _ in range(num_batches): @@ -318,6 +324,9 @@ def test_batched_gaussian_rasterizer_batch_processing(): opacities=opacity, batched_cuda_args=cuda_args ) + end_time = time.time() + preprocess_time = end_time - start_time + print(f"Time taken by test_batched_gaussian_rasterizer_batch_processing: {preprocess_time:.4f} seconds") if mode == "train": batched_means2D.retain_grad() From 162e7d02cb38e701d666899cba3d9af4fd829065 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Tue, 23 Apr 2024 23:30:08 -0400 Subject: [PATCH 17/53] Refactor test_batched_gaussian_rasterizer and test_batched_gaussian_rasterizer_batch_processing functions in rasterization_tests.py --- rasterization_tests.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 93248df..93c0cd5 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -111,9 +111,7 @@ def test_improved_gaussian_rasterizer(): pipe.debug = False mode = "train" - batched_rasterizers = [] batched_cuda_args = [] - batched_screenspace_params = [] batched_means2D = [] batched_radii = [] raster_settings_list=[] @@ -236,6 +234,7 @@ def test_batched_gaussian_rasterizer(): cuda_args=cuda_args ) + # TODO: make the below work # if mode == "train": # means2D.retain_grad() @@ -328,17 +327,17 @@ def test_batched_gaussian_rasterizer_batch_processing(): preprocess_time = end_time - start_time print(f"Time taken by test_batched_gaussian_rasterizer_batch_processing: {preprocess_time:.4f} seconds") - if mode == "train": - batched_means2D.retain_grad() + # TODO: make the below work + # if mode == "train": + # batched_means2D.retain_grad() - batched_screenspace_params = [batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths] # Perform assertions on the preprocessed data - assert batched_means2D.shape == (num_gaussians, num_batches, 2) - assert batched_rgb.shape == (num_gaussians, num_batches, 3) - assert batched_conic_opacity.shape == (num_gaussians, num_batches, 1) - assert batched_radii.shape == (num_gaussians, num_batches) - assert batched_depths.shape == (num_gaussians, num_batches) + assert batched_means2D.shape == (num_batches, num_gaussians, 2) + assert batched_rgb.shape == (num_batches, num_gaussians, 3) + assert batched_conic_opacity.shape == (num_batches, num_gaussians, 1) + assert batched_radii.shape == (num_batches, num_gaussians) + assert batched_depths.shape == (num_batches, num_gaussians) From fdf3bf5b4429744d8ef20fd380343a9b64d09c8b Mon Sep 17 00:00:00 2001 From: prapti19 Date: Wed, 24 Apr 2024 23:19:25 -0400 Subject: [PATCH 18/53] add parity test --- diff_gaussian_rasterization/__init__.py | 4 +- rasterization_tests.py | 130 +++++------------------- 2 files changed, 31 insertions(+), 103 deletions(-) diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py index e73be82..633adba 100644 --- a/diff_gaussian_rasterization/__init__.py +++ b/diff_gaussian_rasterization/__init__.py @@ -81,9 +81,11 @@ def forward( # TODO: update this. batch_size = raster_settings.tanfovx.shape[0] if torch.is_tensor(raster_settings.tanfovx) else 1 - if batch_size == 1: + if not torch.is_tensor(raster_settings.tanfovx): + print("normal batch calling") num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args) else: + print("improved batch calling") num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians_batched(*args) # Keep relevant tensors for backward diff --git a/rasterization_tests.py b/rasterization_tests.py index 93c0cd5..84a5391 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -8,7 +8,13 @@ GaussianRasterizer, GaussianRasterizerBatches, ) - +num_gaussians = 10000 +num_batches=1 +means3D = torch.randn(num_gaussians, 3).cuda() +scales = torch.randn(num_gaussians, 3).cuda() +rotations = torch.randn(num_gaussians, 3, 3).cuda() +shs = torch.randn(num_gaussians, 9).cuda() +opacity = torch.randn(num_gaussians, 1).cuda() def get_cuda_args(strategy, mode="train"): cuda_args = { @@ -74,98 +80,13 @@ def test_gaussian_rasterizer_time(): preprocess_time = end_time - start_time print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds") - -def test_improved_gaussian_rasterizer(): - - # Set up the input data - num_gaussians = 10000 - num_batches = 4 - means3D = torch.randn(num_gaussians, 3).cuda() - scales = torch.randn(num_gaussians, 3).cuda() - rotations = torch.randn(num_gaussians, 3, 3).cuda() - shs = torch.randn(num_gaussians, 9).cuda() - opacity = torch.randn(num_gaussians, 1).cuda() - - # Set up the viewpoint cameras - batched_viewpoint_cameras = [] - for _ in range(num_batches): - viewpoint_camera = type('ViewpointCamera', (), {}) - viewpoint_camera.FoVx = math.radians(60) - viewpoint_camera.FoVy = math.radians(60) - viewpoint_camera.image_height = 512 - viewpoint_camera.image_width = 512 - viewpoint_camera.world_view_transform = torch.eye(4).cuda() - viewpoint_camera.full_proj_transform = torch.eye(4).cuda() - viewpoint_camera.camera_center = torch.zeros(3).cuda() - batched_viewpoint_cameras.append(viewpoint_camera) - - # Set up the strategies - batched_strategies = [None] * num_batches - - # Set up other parameters - bg_color = torch.ones(3).cuda() - scaling_modifier = 1.0 - pc = type('PC', (), {}) - pc.active_sh_degree = 2 - pipe = type('Pipe', (), {}) - pipe.debug = False - mode = "train" - - batched_cuda_args = [] - batched_means2D = [] - batched_radii = [] - raster_settings_list=[] - for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)): - ########## [START] Prepare CUDA Rasterization Settings ########## - cuda_args = get_cuda_args(strategy, mode) - batched_cuda_args.append(cuda_args) - - # Set up rasterization configuration - tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) - tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) - - raster_settings_list.append(GaussianRasterizationSettings( - image_height=int(viewpoint_camera.image_height), - image_width=int(viewpoint_camera.image_width), - tanfovx=tanfovx, - tanfovy=tanfovy, - bg=bg_color, - scale_modifier=scaling_modifier, - viewmatrix=viewpoint_camera.world_view_transform, - projmatrix=viewpoint_camera.full_proj_transform, - sh_degree=pc.active_sh_degree, - campos=viewpoint_camera.camera_center, - prefiltered=False, - debug=pipe.debug - )) - - - rasterizer=GaussianRasterizerBatches(raster_settings=raster_settings_list) - start_time = time.time() - batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians( - means3D=means3D, - scales=scales, - rotations=rotations, - shs=shs, - opacities=opacity, - cuda_args=batched_cuda_args - ) - end_time = time.time() - - preprocess_time = end_time - start_time - print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds") - + def test_batched_gaussian_rasterizer(): # Set up the input data num_gaussians = 10000 - num_batches = 4 - means3D = torch.randn(num_gaussians, 3).cuda() - scales = torch.randn(num_gaussians, 3).cuda() - rotations = torch.randn(num_gaussians, 3, 3).cuda() - shs = torch.randn(num_gaussians, 9).cuda() - opacity = torch.randn(num_gaussians, 1).cuda() - + + # Set up the viewpoint cameras batched_viewpoint_cameras = [] for _ in range(num_batches): @@ -244,22 +165,18 @@ def test_batched_gaussian_rasterizer(): batched_screenspace_params.append(screenspace_params) batched_radii.append(radii) + end_time = time.time() preprocess_time = end_time - start_time print(f"Time taken by test_batched_gaussian_rasterizer: {preprocess_time:.4f} seconds") # Perform further operations with the batched results # Test results and performance + + return torch.stack(batched_means2D,dim=0).clone().cpu() -def test_batched_gaussian_rasterizer_batch_processing(): + +def test_batched_gaussian_rasterizer_batch_processing(orig_means2D): # Set up the input data - num_gaussians = 10000 - num_batches = 4 - means3D = torch.randn(num_gaussians, 3).cuda() - scales = torch.randn(num_gaussians, 3).cuda() - rotations = torch.randn(num_gaussians, 3, 3).cuda() - shs = torch.randn(num_gaussians, 9).cuda() - opacity = torch.randn(num_gaussians, 1).cuda() - start_time = time.time() # Set up the viewpoint cameras batched_viewpoint_cameras = [] @@ -292,7 +209,7 @@ def test_batched_gaussian_rasterizer_batch_processing(): batched_viewmatrix = torch.stack([camera.world_view_transform for camera in batched_viewpoint_cameras]) batched_projmatrix = torch.stack([camera.full_proj_transform for camera in batched_viewpoint_cameras]) batched_campos = torch.stack([camera.camera_center for camera in batched_viewpoint_cameras]) - + batched_raster_settings = GaussianRasterizationSettings( image_height=int(batched_viewpoint_cameras[0].image_height), image_width=int(batched_viewpoint_cameras[0].image_width), @@ -333,16 +250,25 @@ def test_batched_gaussian_rasterizer_batch_processing(): # Perform assertions on the preprocessed data + assert batched_means2D.shape == (num_batches, num_gaussians, 2) assert batched_rgb.shape == (num_batches, num_gaussians, 3) - assert batched_conic_opacity.shape == (num_batches, num_gaussians, 1) + assert batched_conic_opacity.shape == (num_batches, num_gaussians,4) assert batched_radii.shape == (num_batches, num_gaussians) assert batched_depths.shape == (num_batches, num_gaussians) + torch.cuda.empty_cache() + new_batched_means2D=batched_means2D.clone().cpu() + + equal_elements = torch.eq(orig_means2D, new_batched_means2D) + all_equal = torch.all(equal_elements) + print(all_equal) + + assert(all_equal==True)#means2d if __name__ == "__main__": - test_batched_gaussian_rasterizer() - test_batched_gaussian_rasterizer_batch_processing() + means2D=test_batched_gaussian_rasterizer() + test_batched_gaussian_rasterizer_batch_processing(means2D) From cace4fd0c79dff9e9bbdbd4adcda549399ce4dca Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 00:21:13 -0400 Subject: [PATCH 19/53] Refactor preprocess_gaussians function to handle batched and non-batched inputs in __init__.py --- diff_gaussian_rasterization/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py index e73be82..8b041f9 100644 --- a/diff_gaussian_rasterization/__init__.py +++ b/diff_gaussian_rasterization/__init__.py @@ -80,8 +80,7 @@ def forward( ) # TODO: update this. - batch_size = raster_settings.tanfovx.shape[0] if torch.is_tensor(raster_settings.tanfovx) else 1 - if batch_size == 1: + if not torch.is_tensor(raster_settings.tanfovx): num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args) else: num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians_batched(*args) From eaf0d42a22f67aab5fc663bb84b66a8372c423d7 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 00:32:19 -0400 Subject: [PATCH 20/53] Refactor test_batched_gaussian_rasterizer and test_batched_gaussian_rasterizer_batch_processing functions in rasterization_tests.py --- rasterization_tests.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 84a5391..6fd9d4f 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -8,6 +8,7 @@ GaussianRasterizer, GaussianRasterizerBatches, ) + num_gaussians = 10000 num_batches=1 means3D = torch.randn(num_gaussians, 3).cuda() @@ -82,11 +83,7 @@ def test_gaussian_rasterizer_time(): print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds") -def test_batched_gaussian_rasterizer(): - # Set up the input data - num_gaussians = 10000 - - +def test_batched_gaussian_rasterizer(): # Set up the viewpoint cameras batched_viewpoint_cameras = [] for _ in range(num_batches): @@ -172,10 +169,10 @@ def test_batched_gaussian_rasterizer(): # Perform further operations with the batched results # Test results and performance - return torch.stack(batched_means2D,dim=0).clone().cpu() + return batched_means2D -def test_batched_gaussian_rasterizer_batch_processing(orig_means2D): +def test_batched_gaussian_rasterizer_batch_processing(): # Set up the input data start_time = time.time() # Set up the viewpoint cameras @@ -257,18 +254,18 @@ def test_batched_gaussian_rasterizer_batch_processing(orig_means2D): assert batched_radii.shape == (num_batches, num_gaussians) assert batched_depths.shape == (num_batches, num_gaussians) torch.cuda.empty_cache() - new_batched_means2D=batched_means2D.clone().cpu() - - equal_elements = torch.eq(orig_means2D, new_batched_means2D) - all_equal = torch.all(equal_elements) - print(all_equal) - - assert(all_equal==True)#means2d + + return batched_means2D +if __name__ == "__main__": + batched_means2D=test_batched_gaussian_rasterizer() + batched_means2D_batch_processed = test_batched_gaussian_rasterizer_batch_processing() + + equal_elements = torch.eq(batched_means2D, batched_means2D_batch_processed) + all_equal = torch.all(equal_elements) + print(all_equal) + assert(all_equal is True)#means2d -if __name__ == "__main__": - means2D=test_batched_gaussian_rasterizer() - test_batched_gaussian_rasterizer_batch_processing(means2D) From d9eb4e83913ff00e77fc24a9ac617f67c2284fda Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 00:38:42 -0400 Subject: [PATCH 21/53] Refactor test_batched_gaussian_rasterizer and test_batched_gaussian_rasterizer_batch_processing functions in rasterization_tests.py --- rasterization_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 6fd9d4f..16feb19 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -169,7 +169,7 @@ def test_batched_gaussian_rasterizer(): # Perform further operations with the batched results # Test results and performance - return batched_means2D + return torch.stack(batched_means2D, dim=0) def test_batched_gaussian_rasterizer_batch_processing(): From c38cfa9ad9bf6ed18568c5131ce376f0c31eff00 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 01:13:37 -0400 Subject: [PATCH 22/53] add debug flag to extra_compile_args --- setup.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 8c4d011..3d16c4d 100644 --- a/setup.py +++ b/setup.py @@ -9,9 +9,11 @@ # For inquiries contact george.drettakis@inria.fr # -from setuptools import setup -from torch.utils.cpp_extension import CUDAExtension, BuildExtension import os + +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + os.path.dirname(os.path.abspath(__file__)) setup( @@ -30,7 +32,7 @@ headers=[ "config.h" ], - extra_compile_args={"nvcc": ["-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/")]}) + extra_compile_args={"nvcc": ["-g", "-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/")]}) ], cmdclass={ 'build_ext': BuildExtension From 24905aa81a50720bc263222d81d47c4799b8c7b9 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 18:36:26 -0400 Subject: [PATCH 23/53] Refactor tan_fovy parameter to be const in CUDA rasterizer files --- cuda_rasterizer/forward.cu | 2 +- cuda_rasterizer/rasterizer_impl.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 1e3e119..16d33a1 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -604,7 +604,7 @@ void FORWARD::preprocess_batch(int P, int D, int M, const float* projmatrix, const glm::vec3* cam_pos, const int W, int H, - const float* tan_fovx, float* tan_fovy, + const float* tan_fovx, const float* tan_fovy, int* radii, float2* means2D, float* depths, diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu index 2f43492..c7fe67b 100644 --- a/cuda_rasterizer/rasterizer_impl.cu +++ b/cuda_rasterizer/rasterizer_impl.cu @@ -444,7 +444,7 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches( const float* viewmatrix, const float* projmatrix, const float* cam_pos, - const float* tan_fovx, float* tan_fovy, + const float* tan_fovx, const float* tan_fovy, const bool prefiltered, const int num_viewpoints, bool debug,//raster_settings From d376d418204a068daa4ca043e143060985f5e0a5 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 18:37:11 -0400 Subject: [PATCH 24/53] Refactor tan_fovy parameter to be const in CUDA rasterizer files --- cuda_rasterizer/forward.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_rasterizer/forward.h b/cuda_rasterizer/forward.h index 6c4a1da..902fa00 100644 --- a/cuda_rasterizer/forward.h +++ b/cuda_rasterizer/forward.h @@ -62,7 +62,7 @@ namespace FORWARD const float* projmatrix, const glm::vec3* cam_pos, const int W, int H, - const float* tan_fovx, float* tan_fovy, + const float* tan_fovx, const float* tan_fovy, int* radii, float2* means2D, float* depths, From 8c82fa74f885a647c1f65ba048511ca9ba4b2a21 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 18:46:07 -0400 Subject: [PATCH 25/53] Refactor tan_fovy parameter to be const in CUDA rasterizer files --- cuda_rasterizer/rasterizer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_rasterizer/rasterizer.h b/cuda_rasterizer/rasterizer.h index 4ad1bf8..b7f93fd 100644 --- a/cuda_rasterizer/rasterizer.h +++ b/cuda_rasterizer/rasterizer.h @@ -84,7 +84,7 @@ namespace CudaRasterizer const float* viewmatrix, const float* projmatrix, const float* cam_pos, - const float* tan_fovx, float* tan_fovy, + const float* tan_fovx, const float* tan_fovy, const bool prefiltered, const int num_viewpoints, bool debug,//raster_settings From 4cca1185ee555386bf4f374247770e65eb9086af Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 19:18:09 -0400 Subject: [PATCH 26/53] Refactor CUDA rasterizer files to use CUDA tensors for batched calculations --- rasterization_tests.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 16feb19..d55e34b 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -201,11 +201,11 @@ def test_batched_gaussian_rasterizer_batch_processing(): mode = "train" # Set up rasterization configuration for the batch - batched_tanfovx = torch.tensor([math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras]) - batched_tanfovy = torch.tensor([math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras]) - batched_viewmatrix = torch.stack([camera.world_view_transform for camera in batched_viewpoint_cameras]) - batched_projmatrix = torch.stack([camera.full_proj_transform for camera in batched_viewpoint_cameras]) - batched_campos = torch.stack([camera.camera_center for camera in batched_viewpoint_cameras]) + batched_tanfovx = torch.tensor([math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras]).cuda() + batched_tanfovy = torch.tensor([math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras]).cuda() + batched_viewmatrix = torch.stack([camera.world_view_transform for camera in batched_viewpoint_cameras]).cuda() + batched_projmatrix = torch.stack([camera.full_proj_transform for camera in batched_viewpoint_cameras]).cuda() + batched_campos = torch.stack([camera.camera_center for camera in batched_viewpoint_cameras]).cuda() batched_raster_settings = GaussianRasterizationSettings( image_height=int(batched_viewpoint_cameras[0].image_height), From 34ebced4edd04a3cd05a2a6a16b33ccae7302504 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 19:29:02 -0400 Subject: [PATCH 27/53] Refactor test_batched_gaussian_rasterizer and test_batched_gaussian_rasterizer_batch_processing functions in rasterization_tests.py --- rasterization_tests.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index d55e34b..35fd237 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -263,9 +263,7 @@ def test_batched_gaussian_rasterizer_batch_processing(): batched_means2D_batch_processed = test_batched_gaussian_rasterizer_batch_processing() equal_elements = torch.eq(batched_means2D, batched_means2D_batch_processed) - all_equal = torch.all(equal_elements) - print(all_equal) - assert(all_equal is True)#means2d + assert torch.all(equal_elements) From f6374d98c20f66aaefa5c73af0504811c8ab7b7b Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 19:37:01 -0400 Subject: [PATCH 28/53] Refactor test_batched_gaussian_rasterizer and test_batched_gaussian_rasterizer_batch_processing functions in rasterization_tests.py --- rasterization_tests.py | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 35fd237..231500e 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -169,7 +169,10 @@ def test_batched_gaussian_rasterizer(): # Perform further operations with the batched results # Test results and performance - return torch.stack(batched_means2D, dim=0) + batched_means2D = torch.stack(batched_means2D, dim=0) + batched_radii = torch.stack(batched_radii, dim=0) + + return batched_means2D, batched_radii, batched_screenspace_params def test_batched_gaussian_rasterizer_batch_processing(): @@ -255,15 +258,33 @@ def test_batched_gaussian_rasterizer_batch_processing(): assert batched_depths.shape == (num_batches, num_gaussians) torch.cuda.empty_cache() - return batched_means2D + batched_screenspace_params = [] + for i in range(num_batches): + means2D = batched_means2D[i] + rgb = batched_rgb[i] + conic_opacity = batched_conic_opacity[i] + radii = batched_radii[i] + depths = batched_depths[i] + + screenspace_params = [means2D, rgb, conic_opacity, radii, depths] + batched_screenspace_params.append(screenspace_params) + + return batched_means2D, batched_radii, batched_screenspace_params -if __name__ == "__main__": - batched_means2D=test_batched_gaussian_rasterizer() - batched_means2D_batch_processed = test_batched_gaussian_rasterizer_batch_processing() - - equal_elements = torch.eq(batched_means2D, batched_means2D_batch_processed) +def assert_tensor_equal(tensor1, tensor2): + return torch.all(torch.eq(tensor1, tensor2)) - assert torch.all(equal_elements) +if __name__ == "__main__": + batched_means2D, batched_radii, batched_screenspace_params = test_batched_gaussian_rasterizer() + batched_means2D_batch_processed, batched_radii_batch_processed, batched_screenspace_params_batch_processed = test_batched_gaussian_rasterizer_batch_processing() + + assert assert_tensor_equal(batched_means2D, batched_means2D_batch_processed) + assert assert_tensor_equal(batched_radii, batched_radii_batch_processed) + assert len(batched_screenspace_params) == len(batched_screenspace_params_batch_processed) + for i in range(len(batched_screenspace_params)): + assert len(batched_screenspace_params[i]) == len(batched_screenspace_params_batch_processed[i]) + for j in range(len(batched_screenspace_params[i])): + assert assert_tensor_equal(batched_screenspace_params[i][j], batched_screenspace_params_batch_processed[i][j]) From d0230a6b2181f0eb784d4705a5db05e8106f7a65 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 19:44:23 -0400 Subject: [PATCH 29/53] Refactor assert_tensor_equal function to compare_tensors in rasterization_tests.py --- rasterization_tests.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 231500e..a88c800 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -272,19 +272,36 @@ def test_batched_gaussian_rasterizer_batch_processing(): return batched_means2D, batched_radii, batched_screenspace_params -def assert_tensor_equal(tensor1, tensor2): - return torch.all(torch.eq(tensor1, tensor2)) +def compare_tensors(tensor1, tensor2): + if tensor1.shape != tensor2.shape: + print("Tensors have different shapes:") + print("Tensor 1 shape:", tensor1.shape) + print("Tensor 2 shape:", tensor2.shape) + return False + + equality_matrix = torch.eq(tensor1, tensor2) + if torch.all(equality_matrix): + print("All values in the tensors are equal.") + return True + else: + print("Tensors have non-matching values.") + non_matching_indices = torch.where(equality_matrix == False) + for idx in zip(*non_matching_indices): + value1 = tensor1[idx].item() + value2 = tensor2[idx].item() + print(f"Non-matching values at index {idx}: {value1} != {value2}") + return False if __name__ == "__main__": batched_means2D, batched_radii, batched_screenspace_params = test_batched_gaussian_rasterizer() batched_means2D_batch_processed, batched_radii_batch_processed, batched_screenspace_params_batch_processed = test_batched_gaussian_rasterizer_batch_processing() - assert assert_tensor_equal(batched_means2D, batched_means2D_batch_processed) - assert assert_tensor_equal(batched_radii, batched_radii_batch_processed) + assert compare_tensors(batched_means2D, batched_means2D_batch_processed) + assert compare_tensors(batched_radii, batched_radii_batch_processed) assert len(batched_screenspace_params) == len(batched_screenspace_params_batch_processed) for i in range(len(batched_screenspace_params)): assert len(batched_screenspace_params[i]) == len(batched_screenspace_params_batch_processed[i]) for j in range(len(batched_screenspace_params[i])): - assert assert_tensor_equal(batched_screenspace_params[i][j], batched_screenspace_params_batch_processed[i][j]) + assert compare_tensors(batched_screenspace_params[i][j], batched_screenspace_params_batch_processed[i][j]) From e529d2ac76dec0aea6d9a755393b9d388594bd62 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 20:12:22 -0400 Subject: [PATCH 30/53] tile_grid calculated before kernel launch --- cuda_rasterizer/forward.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 16d33a1..92fda9b 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -616,7 +616,8 @@ void FORWARD::preprocess_batch(int P, int D, int M, bool prefiltered, const int num_viewpoints) { - preprocessCUDABatched<<>>( + dim3 tile_grid(cdiv(P, ONE_DIM_BLOCK_SIZE), num_viewpoints); + preprocessCUDABatched<<>>( P, D, M, means3D, scales, From 09b853effb501077563d3231470cbcc98dd91abb Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 20:12:40 -0400 Subject: [PATCH 31/53] Fix indexing bug in preprocessCUDABatched function --- cuda_rasterizer/forward.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 92fda9b..0ec6dd0 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -513,12 +513,12 @@ __global__ void preprocessCUDABatched( float* rgb, float4* conic_opacity, const dim3 grid, uint32_t* tiles_touched, bool prefiltered, const int num_viewpoints) { - auto point_idx = cg::this_grid().thread_rank(); + auto point_idx = blockIdx.x * blockDim.x + threadIdx.x; auto viewpoint_idx = blockIdx.y; if (viewpoint_idx >= num_viewpoints || point_idx >= P) return; - auto idx = viewpoint_idx * P + point_idx; + auto idx = viewpoint_idx * num_viewpoints + point_idx; const float* viewmatrix = viewmatrix_arr + viewpoint_idx * 16; const float* projmatrix = projmatrix_arr + viewpoint_idx * 16; From 44f8fc19581bc5782525824088b5574c28018bc3 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 22:01:39 -0400 Subject: [PATCH 32/53] Refactor indexing in preprocessCUDABatched function in forward.cu --- cuda_rasterizer/forward.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 0ec6dd0..358ab5e 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -529,10 +529,10 @@ __global__ void preprocessCUDABatched( // Perform near culling, quit if outside. float3 p_view; - if (!in_frustum(point_idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) return; + if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) return; // Transform point by projecting - float3 p_orig = { orig_points[3 * point_idx], orig_points[3 * point_idx + 1], orig_points[3 * point_idx + 2] }; + float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; float4 p_hom = transformPoint4x4(p_orig, projmatrix); float p_w = 1.0f / (p_hom.w + 0.0000001f); float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; @@ -541,9 +541,9 @@ __global__ void preprocessCUDABatched( // from scaling and rotation parameters. const float* cov3D; if (cov3D_precomp != nullptr) { - cov3D = cov3D_precomp + point_idx * 6; + cov3D = cov3D_precomp + idx * 6; } else { - computeCov3D(scales[point_idx], scale_modifier, rotations[point_idx], cov3Ds + idx * 6); + computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6); cov3D = cov3Ds + idx * 6; } @@ -574,7 +574,7 @@ __global__ void preprocessCUDABatched( // If colors have been precomputed, use them, otherwise convert // spherical harmonics coefficients to RGB color. if (colors_precomp == nullptr) { - glm::vec3 result = computeColorFromSH(point_idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3); + glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3); rgb[idx * C + 0] = result.x; rgb[idx * C + 1] = result.y; rgb[idx * C + 2] = result.z; @@ -586,7 +586,7 @@ __global__ void preprocessCUDABatched( points_xy_image[idx] = point_image; // Inverse 2D covariance and opacity neatly pack into one float4 - conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[point_idx] }; + conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] }; tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x); } From a00921dfd5ae4e061dac171b55fa163e05071465 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Thu, 25 Apr 2024 22:11:59 -0400 Subject: [PATCH 33/53] Refactor tile_grid calculation in rasterizer_impl.cu --- cuda_rasterizer/rasterizer_impl.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu index c7fe67b..06e95db 100644 --- a/cuda_rasterizer/rasterizer_impl.cu +++ b/cuda_rasterizer/rasterizer_impl.cu @@ -464,7 +464,9 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches( //CONVERT ALL VECTORS TO FLOATSSSSSS PRAPTIIIIIII - dim3 tile_grid(cdiv(P, ONE_DIM_BLOCK_SIZE), num_viewpoints); + dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1); + dim3 block(BLOCK_X, BLOCK_Y, 1); + int tile_num = tile_grid.x * tile_grid.y; // allocate temporary buffer for tiles_touched. // In sep_rendering==True case, we will compute tiles_touched in the renderForward. From 890d95fc21bdfc7763e66ddbf712e69f81bff700 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Fri, 26 Apr 2024 10:36:34 -0400 Subject: [PATCH 34/53] Refactor indexing in preprocessCUDABatched function in forward.cu --- cuda_rasterizer/forward.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 358ab5e..5dbc9ff 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -529,10 +529,10 @@ __global__ void preprocessCUDABatched( // Perform near culling, quit if outside. float3 p_view; - if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) return; + if (!in_frustum(point_idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) return; // Transform point by projecting - float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; + float3 p_orig = { orig_points[3 * point_idx], orig_points[3 * point_idx + 1], orig_points[3 * point_idx + 2] }; float4 p_hom = transformPoint4x4(p_orig, projmatrix); float p_w = 1.0f / (p_hom.w + 0.0000001f); float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; @@ -543,7 +543,7 @@ __global__ void preprocessCUDABatched( if (cov3D_precomp != nullptr) { cov3D = cov3D_precomp + idx * 6; } else { - computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6); + computeCov3D(scales[point_idx], scale_modifier, rotations[point_idx], cov3Ds + idx * 6); cov3D = cov3Ds + idx * 6; } @@ -574,7 +574,7 @@ __global__ void preprocessCUDABatched( // If colors have been precomputed, use them, otherwise convert // spherical harmonics coefficients to RGB color. if (colors_precomp == nullptr) { - glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3); + glm::vec3 result = computeColorFromSH(point_idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3); rgb[idx * C + 0] = result.x; rgb[idx * C + 1] = result.y; rgb[idx * C + 2] = result.z; @@ -586,7 +586,7 @@ __global__ void preprocessCUDABatched( points_xy_image[idx] = point_image; // Inverse 2D covariance and opacity neatly pack into one float4 - conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] }; + conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[point_idx] }; tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x); } From b3ad196bbf3a9996fe980c70a917aca31ec9c88c Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Fri, 26 Apr 2024 14:05:12 -0400 Subject: [PATCH 35/53] Refactor compare_tensors function in rasterization_tests.py to handle non-matching values --- rasterization_tests.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index a88c800..ef3c23f 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -281,12 +281,11 @@ def compare_tensors(tensor1, tensor2): equality_matrix = torch.eq(tensor1, tensor2) if torch.all(equality_matrix): - print("All values in the tensors are equal.") return True else: - print("Tensors have non-matching values.") + print("Tensors have non-matching values. Some of them are") non_matching_indices = torch.where(equality_matrix == False) - for idx in zip(*non_matching_indices): + for idx in zip(*non_matching_indices[:5]): value1 = tensor1[idx].item() value2 = tensor2[idx].item() print(f"Non-matching values at index {idx}: {value1} != {value2}") From abfb8b4119e278b670a1c97235c46d27e0a69c99 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Fri, 26 Apr 2024 14:07:03 -0400 Subject: [PATCH 36/53] Fix indexing bug in preprocessCUDABatched function --- cuda_rasterizer/forward.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 5dbc9ff..807bb84 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -518,7 +518,7 @@ __global__ void preprocessCUDABatched( if (viewpoint_idx >= num_viewpoints || point_idx >= P) return; - auto idx = viewpoint_idx * num_viewpoints + point_idx; + auto idx = viewpoint_idx * P + point_idx; const float* viewmatrix = viewmatrix_arr + viewpoint_idx * 16; const float* projmatrix = projmatrix_arr + viewpoint_idx * 16; From 18f9c2022fc643aa6628458344f67b8acd94cdb0 Mon Sep 17 00:00:00 2001 From: Prapti Trivedi Date: Fri, 26 Apr 2024 14:24:43 -0400 Subject: [PATCH 37/53] Update rasterization_tests.py --- rasterization_tests.py | 43 ++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index ef3c23f..1295902 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -9,7 +9,7 @@ GaussianRasterizerBatches, ) -num_gaussians = 10000 +num_gaussians = 50 num_batches=1 means3D = torch.randn(num_gaussians, 3).cuda() scales = torch.randn(num_gaussians, 3).cuda() @@ -114,6 +114,9 @@ def test_batched_gaussian_rasterizer(): batched_screenspace_params = [] batched_means2D = [] batched_radii = [] + batched_conic_opacity=[] + batched_depths=[] + batched_rgb=[] start_time = time.time() @@ -161,6 +164,9 @@ def test_batched_gaussian_rasterizer(): batched_rasterizers.append(rasterizer) batched_screenspace_params.append(screenspace_params) batched_radii.append(radii) + batched_rgb.append(rgb) + batched_conic_opacity.append(conic_opacity) + batched_depths.append(depths) end_time = time.time() @@ -171,8 +177,11 @@ def test_batched_gaussian_rasterizer(): batched_means2D = torch.stack(batched_means2D, dim=0) batched_radii = torch.stack(batched_radii, dim=0) + batched_conic_opacity=torch.stack(batched_conic_opacity,dim=0) + batched_rgb=torch.stack(batched_rgb,dim=0) + batched_depths=torch.stack(batched_depths,dim=0) - return batched_means2D, batched_radii, batched_screenspace_params + return batched_means2D, batched_radii, batched_screenspace_params,batched_conic_opacity,batched_rgb,batched_depths def test_batched_gaussian_rasterizer_batch_processing(): @@ -269,7 +278,7 @@ def test_batched_gaussian_rasterizer_batch_processing(): screenspace_params = [means2D, rgb, conic_opacity, radii, depths] batched_screenspace_params.append(screenspace_params) - return batched_means2D, batched_radii, batched_screenspace_params + return batched_means2D, batched_radii, batched_screenspace_params, batched_conic_opacity,batched_rgb,batched_depths def compare_tensors(tensor1, tensor2): @@ -281,26 +290,36 @@ def compare_tensors(tensor1, tensor2): equality_matrix = torch.eq(tensor1, tensor2) if torch.all(equality_matrix): + print("All values in the tensors are equal.") return True else: - print("Tensors have non-matching values. Some of them are") + print("Tensors have non-matching values.") non_matching_indices = torch.where(equality_matrix == False) - for idx in zip(*non_matching_indices[:5]): + for idx in zip(*non_matching_indices): value1 = tensor1[idx].item() value2 = tensor2[idx].item() - print(f"Non-matching values at index {idx}: {value1} != {value2}") + # print(f"Non-matching values at index {idx}: {value1} != {value2}") return False if __name__ == "__main__": - batched_means2D, batched_radii, batched_screenspace_params = test_batched_gaussian_rasterizer() - batched_means2D_batch_processed, batched_radii_batch_processed, batched_screenspace_params_batch_processed = test_batched_gaussian_rasterizer_batch_processing() + batched_means2D, batched_radii, batched_screenspace_params,batched_conic_opacity,batched_rgb,batched_depths = test_batched_gaussian_rasterizer() + batched_means2D_batch_processed, batched_radii_batch_processed, batched_screenspace_params_batch_processed,batched_conic_opacity_batch_processed,batched_rgb_batch_processed,batched_depths_batch_processed = test_batched_gaussian_rasterizer_batch_processing() assert compare_tensors(batched_means2D, batched_means2D_batch_processed) assert compare_tensors(batched_radii, batched_radii_batch_processed) + assert compare_tensors(batched_conic_opacity, batched_conic_opacity_batch_processed) + print(batched_rgb.shape,batched_rgb_batch_processed.shape) + print(batched_rgb) + print('*****') + print(batched_rgb_batch_processed) + + assert compare_tensors(batched_rgb, batched_rgb_batch_processed) + assert compare_tensors(batched_depths, batched_depths_batch_processed) assert len(batched_screenspace_params) == len(batched_screenspace_params_batch_processed) - for i in range(len(batched_screenspace_params)): - assert len(batched_screenspace_params[i]) == len(batched_screenspace_params_batch_processed[i]) - for j in range(len(batched_screenspace_params[i])): - assert compare_tensors(batched_screenspace_params[i][j], batched_screenspace_params_batch_processed[i][j]) + # for i in range(len(batched_screenspace_params)): + # assert len(batched_screenspace_params[i]) == len(batched_screenspace_params_batch_processed[i]) + # for j in range(len(batched_screenspace_params[i])): + # print(i,j) + # assert compare_tensors(batched_screenspace_params[i][j], batched_screenspace_params_batch_processed[i][j]) From a04b34dd88544c7721b69c2e7383ec5e40c520b7 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Fri, 26 Apr 2024 14:28:51 -0400 Subject: [PATCH 38/53] Refactor compare_tensors function in rasterization_tests.py to handle non-matching values --- rasterization_tests.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 1295902..0613e80 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -290,15 +290,14 @@ def compare_tensors(tensor1, tensor2): equality_matrix = torch.eq(tensor1, tensor2) if torch.all(equality_matrix): - print("All values in the tensors are equal.") return True else: print("Tensors have non-matching values.") non_matching_indices = torch.where(equality_matrix == False) - for idx in zip(*non_matching_indices): + for idx in zip(*non_matching_indices)[:5]: value1 = tensor1[idx].item() value2 = tensor2[idx].item() - # print(f"Non-matching values at index {idx}: {value1} != {value2}") + print(f"Non-matching values at index {idx}: {value1} != {value2}") return False if __name__ == "__main__": From e593132761793c958fd481b266df94bc360d8a8e Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Fri, 26 Apr 2024 14:34:52 -0400 Subject: [PATCH 39/53] Refactor compare_tensors function to fix indexing bug and handle non-matching values --- rasterization_tests.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 0613e80..af252e6 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -294,7 +294,7 @@ def compare_tensors(tensor1, tensor2): else: print("Tensors have non-matching values.") non_matching_indices = torch.where(equality_matrix == False) - for idx in zip(*non_matching_indices)[:5]: + for idx in zip(*non_matching_indices[:5]): value1 = tensor1[idx].item() value2 = tensor2[idx].item() print(f"Non-matching values at index {idx}: {value1} != {value2}") @@ -307,7 +307,6 @@ def compare_tensors(tensor1, tensor2): assert compare_tensors(batched_means2D, batched_means2D_batch_processed) assert compare_tensors(batched_radii, batched_radii_batch_processed) assert compare_tensors(batched_conic_opacity, batched_conic_opacity_batch_processed) - print(batched_rgb.shape,batched_rgb_batch_processed.shape) print(batched_rgb) print('*****') print(batched_rgb_batch_processed) @@ -315,10 +314,4 @@ def compare_tensors(tensor1, tensor2): assert compare_tensors(batched_rgb, batched_rgb_batch_processed) assert compare_tensors(batched_depths, batched_depths_batch_processed) assert len(batched_screenspace_params) == len(batched_screenspace_params_batch_processed) - # for i in range(len(batched_screenspace_params)): - # assert len(batched_screenspace_params[i]) == len(batched_screenspace_params_batch_processed[i]) - # for j in range(len(batched_screenspace_params[i])): - # print(i,j) - # assert compare_tensors(batched_screenspace_params[i][j], batched_screenspace_params_batch_processed[i][j]) - From e109969dc0cca0a6c5eded09498b1742a77a04c5 Mon Sep 17 00:00:00 2001 From: Prapti Trivedi Date: Fri, 26 Apr 2024 17:17:49 -0400 Subject: [PATCH 40/53] Update forward.cu --- cuda_rasterizer/forward.cu | 71 ++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 807bb84..8d384e6 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -28,8 +28,9 @@ __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const dir = dir / glm::length(dir); glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; + glm::vec3 result = SH_C0 * sh[0]; - + if (deg > 0) { float x = dir.x; @@ -70,6 +71,59 @@ __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const clamped[3 * idx + 2] = (result.z < 0); return glm::max(result, 0.0f); } +__device__ glm::vec3 computeColorFromSHBatched(int idx, int view_idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped) +{ + // The implementation is loosely based on code for + // "Differentiable Point-Based Radiance Fields for + // Efficient View Synthesis" by Zhang et al. (2022) + glm::vec3 pos = means[idx]; + glm::vec3 dir = pos - campos; + dir = dir / glm::length(dir); + + glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; + + glm::vec3 result = SH_C0 * sh[0]; + + if (deg > 0) + { + float x = dir.x; + float y = dir.y; + float z = dir.z; + result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3]; + + if (deg > 1) + { + float xx = x * x, yy = y * y, zz = z * z; + float xy = x * y, yz = y * z, xz = x * z; + result = result + + SH_C2[0] * xy * sh[4] + + SH_C2[1] * yz * sh[5] + + SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] + + SH_C2[3] * xz * sh[7] + + SH_C2[4] * (xx - yy) * sh[8]; + + if (deg > 2) + { + result = result + + SH_C3[0] * y * (3.0f * xx - yy) * sh[9] + + SH_C3[1] * xy * z * sh[10] + + SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] + + SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] + + SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] + + SH_C3[5] * z * (xx - yy) * sh[14] + + SH_C3[6] * x * (xx - 3.0f * yy) * sh[15]; + } + } + } + result += 0.5f; + + // RGB colors are clamped to positive values. If values are + // clamped, we need to keep track of this for the backward pass. + clamped[3 * view_idx + 0] = (result.x < 0); + clamped[3 * view_idx + 1] = (result.y < 0); + clamped[3 * view_idx + 2] = (result.z < 0); + return glm::max(result, 0.0f); +} // Forward version of 2D covariance matrix computation __device__ float3 computeCov2D(const float3& mean, float focal_x, float focal_y, float tan_fovx, float tan_fovy, const float* cov3D, const float* viewmatrix) @@ -197,6 +251,7 @@ __global__ void preprocessCUDA(int P, int D, int M, // Transform point by projecting float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; + float4 p_hom = transformPoint4x4(p_orig, projmatrix); float p_w = 1.0f / (p_hom.w + 0.0000001f); float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; @@ -213,10 +268,9 @@ __global__ void preprocessCUDA(int P, int D, int M, computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6); cov3D = cov3Ds + idx * 6; } - // Compute 2D screen-space covariance matrix float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix); - + // Invert covariance (EWA algorithm) float det = (cov.x * cov.z - cov.y * cov.y); if (det == 0.0f) @@ -241,7 +295,7 @@ __global__ void preprocessCUDA(int P, int D, int M, // If colors have been precomputed, use them, otherwise convert // spherical harmonics coefficients to RGB color. if (colors_precomp == nullptr) - { + { glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped); rgb[idx * C + 0] = result.x; rgb[idx * C + 1] = result.y; @@ -533,6 +587,7 @@ __global__ void preprocessCUDABatched( // Transform point by projecting float3 p_orig = { orig_points[3 * point_idx], orig_points[3 * point_idx + 1], orig_points[3 * point_idx + 2] }; + float4 p_hom = transformPoint4x4(p_orig, projmatrix); float p_w = 1.0f / (p_hom.w + 0.0000001f); float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; @@ -546,12 +601,14 @@ __global__ void preprocessCUDABatched( computeCov3D(scales[point_idx], scale_modifier, rotations[point_idx], cov3Ds + idx * 6); cov3D = cov3Ds + idx * 6; } + // Compute 2D screen-space covariance matrix const float focal_x = W / (2.0f * tan_fovx[viewpoint_idx]); const float focal_y = H / (2.0f * tan_fovy[viewpoint_idx]); float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx[viewpoint_idx], tan_fovy[viewpoint_idx], cov3D, viewmatrix); + // Invert covariance (EWA algorithm) float det = (cov.x * cov.z - cov.y * cov.y); if (det == 0.0f) return; @@ -573,8 +630,10 @@ __global__ void preprocessCUDABatched( // If colors have been precomputed, use them, otherwise convert // spherical harmonics coefficients to RGB color. + if (colors_precomp == nullptr) { - glm::vec3 result = computeColorFromSH(point_idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3); + + glm::vec3 result = computeColorFromSHBatched(point_idx,idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3); rgb[idx * C + 0] = result.x; rgb[idx * C + 1] = result.y; rgb[idx * C + 2] = result.z; @@ -644,4 +703,4 @@ void FORWARD::preprocess_batch(int P, int D, int M, prefiltered, num_viewpoints ); -} \ No newline at end of file +} From edfea2eefc16fa890e0f92f4842b5b7e5ddb78cd Mon Sep 17 00:00:00 2001 From: Prapti Trivedi Date: Sat, 27 Apr 2024 00:54:29 -0400 Subject: [PATCH 41/53] Update rasterization_tests.py --- rasterization_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index af252e6..fab689a 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -13,8 +13,8 @@ num_batches=1 means3D = torch.randn(num_gaussians, 3).cuda() scales = torch.randn(num_gaussians, 3).cuda() -rotations = torch.randn(num_gaussians, 3, 3).cuda() -shs = torch.randn(num_gaussians, 9).cuda() +rotations = torch.randn(num_gaussians,4).cuda() +shs = torch.randn(num_gaussians, 16,3).cuda() opacity = torch.randn(num_gaussians, 1).cuda() def get_cuda_args(strategy, mode="train"): From 53a14e29740e331a537a3c66d5f63758ce503d15 Mon Sep 17 00:00:00 2001 From: Prapti Trivedi Date: Sat, 27 Apr 2024 00:55:05 -0400 Subject: [PATCH 42/53] Update forward.cu --- cuda_rasterizer/forward.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 8d384e6..567c7ae 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -633,7 +633,7 @@ __global__ void preprocessCUDABatched( if (colors_precomp == nullptr) { - glm::vec3 result = computeColorFromSHBatched(point_idx,idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3); + glm::vec3 result = computeColorFromSHBatched(point_idx,idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped); rgb[idx * C + 0] = result.x; rgb[idx * C + 1] = result.y; rgb[idx * C + 2] = result.z; From 32a601f41467c1d27ccb80698d831a786b2dd39d Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Sat, 27 Apr 2024 01:06:07 -0400 Subject: [PATCH 43/53] fixed sh_sdegree --- cuda_rasterizer/forward.cu | 2 +- rasterization_tests.py | 17 +++++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 8d384e6..567c7ae 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -633,7 +633,7 @@ __global__ void preprocessCUDABatched( if (colors_precomp == nullptr) { - glm::vec3 result = computeColorFromSHBatched(point_idx,idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3); + glm::vec3 result = computeColorFromSHBatched(point_idx,idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped); rgb[idx * C + 0] = result.x; rgb[idx * C + 1] = result.y; rgb[idx * C + 2] = result.z; diff --git a/rasterization_tests.py b/rasterization_tests.py index af252e6..afea6b9 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -9,12 +9,12 @@ GaussianRasterizerBatches, ) -num_gaussians = 50 -num_batches=1 +num_gaussians = 10000 +num_batches=32 means3D = torch.randn(num_gaussians, 3).cuda() scales = torch.randn(num_gaussians, 3).cuda() -rotations = torch.randn(num_gaussians, 3, 3).cuda() -shs = torch.randn(num_gaussians, 9).cuda() +rotations = torch.randn(num_gaussians, 4).cuda() +shs = torch.randn(num_gaussians, 16, 3).cuda() opacity = torch.randn(num_gaussians, 1).cuda() def get_cuda_args(strategy, mode="train"): @@ -104,7 +104,7 @@ def test_batched_gaussian_rasterizer(): bg_color = torch.ones(3).cuda() scaling_modifier = 1.0 pc = type('PC', (), {}) - pc.active_sh_degree = 2 + pc.active_sh_degree = 3 pipe = type('Pipe', (), {}) pipe.debug = False mode = "train" @@ -207,7 +207,7 @@ def test_batched_gaussian_rasterizer_batch_processing(): bg_color = torch.ones(3).cuda() scaling_modifier = 1.0 pc = type('PC', (), {}) - pc.active_sh_degree = 2 + pc.active_sh_degree = 3 pipe = type('Pipe', (), {}) pipe.debug = False mode = "train" @@ -265,7 +265,6 @@ def test_batched_gaussian_rasterizer_batch_processing(): assert batched_conic_opacity.shape == (num_batches, num_gaussians,4) assert batched_radii.shape == (num_batches, num_gaussians) assert batched_depths.shape == (num_batches, num_gaussians) - torch.cuda.empty_cache() batched_screenspace_params = [] for i in range(num_batches): @@ -307,11 +306,9 @@ def compare_tensors(tensor1, tensor2): assert compare_tensors(batched_means2D, batched_means2D_batch_processed) assert compare_tensors(batched_radii, batched_radii_batch_processed) assert compare_tensors(batched_conic_opacity, batched_conic_opacity_batch_processed) - print(batched_rgb) - print('*****') - print(batched_rgb_batch_processed) assert compare_tensors(batched_rgb, batched_rgb_batch_processed) assert compare_tensors(batched_depths, batched_depths_batch_processed) assert len(batched_screenspace_params) == len(batched_screenspace_params_batch_processed) + From 22eb043f05cf4d47256f12907873ca643d294408 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Sat, 27 Apr 2024 20:23:47 -0400 Subject: [PATCH 44/53] Refactor GaussianRasterizationSettings class to handle raster_settings as a batch --- diff_gaussian_rasterization/__init__.py | 23 +++++- rasterization_tests.py | 100 +++++++----------------- 2 files changed, 47 insertions(+), 76 deletions(-) diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py index f7860b4..6be6778 100644 --- a/diff_gaussian_rasterization/__init__.py +++ b/diff_gaussian_rasterization/__init__.py @@ -59,6 +59,19 @@ def forward( ): # Restructure arguments the way that the C++ lib expects them + if isinstance(raster_settings, list): + rs = raster_settings[0] + rs.viewmatrix, rs.projmatrix, rs.campos = [ + torch.stack(tensors) for tensors in zip( + *[(rs.viewmatrix, rs.projmatrix, rs.campos) for rs in raster_settings] + ) + ] + rs.tanfovx, rs.tanfovy = [ + torch.tensor(vals, device=means3D.device) + for vals in zip(*[(rs.tanfovx, rs.tanfovy) for rs in raster_settings]) + ] + raster_settings = rs + args = ( means3D, scales, @@ -314,15 +327,17 @@ class GaussianRasterizationSettings(NamedTuple): debug : bool class GaussianRasterizerBatches(nn.Module): - def __init__(self, raster_settings): + def __init__(self, raster_settings_batch): super().__init__() - self.raster_settings_list = raster_settings + self.raster_settings_batch = raster_settings_batch def markVisible(self, positions): # Mark visible points (based on frustum culling for camera) with a boolean with torch.no_grad(): visible = [] - for viewmatrix, projmatrix in zip(self.raster_settings.viewmatrix, self.raster_settings.projmatrix): + for raster_settings in self.raster_settings_batch: + viewmatrix = raster_settings.viewmatrix + projmatrix = raster_settings.projmatrix visible.append(_C.mark_visible(positions, viewmatrix, projmatrix)) return visible @@ -335,7 +350,7 @@ def preprocess_gaussians(self, means3D, scales, rotations, shs, opacities, batch rotations, shs, opacities, - self.raster_settings_list, + self.raster_settings_batch, batched_cuda_args) class GaussianRasterizer(nn.Module): diff --git a/rasterization_tests.py b/rasterization_tests.py index afea6b9..2452cbf 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -16,6 +16,7 @@ rotations = torch.randn(num_gaussians, 4).cuda() shs = torch.randn(num_gaussians, 16, 3).cuda() opacity = torch.randn(num_gaussians, 1).cuda() +SH_ACTIVE_DEGREE = 3 def get_cuda_args(strategy, mode="train"): cuda_args = { @@ -36,52 +37,6 @@ def get_cuda_args(strategy, mode="train"): } return cuda_args -def test_gaussian_rasterizer_time(): - # Set up the input data - num_gaussians = 10000 - means3D = torch.randn(num_gaussians, 3).cuda() - scales = torch.randn(num_gaussians, 3).cuda() - rotations = torch.randn(num_gaussians, 3, 3).cuda() - shs = torch.randn(num_gaussians, 9).cuda() - opacities = torch.randn(num_gaussians, 1).cuda() - - # Set up the rasterization settings - image_height = 512 - image_width = 512 - tanfovx = 1.0 - tanfovy = 1.0 - bg = torch.ones(3).cuda() - scale_modifier = 1.0 - viewmatrix = torch.eye(4).cuda() - projmatrix = torch.eye(4).cuda() - sh_degree = 2 - campos = torch.zeros(3).cuda() - prefiltered = False - debug = False - - # mode="train" - # strategy=None - # cuda_args = get_cuda_args(strategy, mode) - - raster_settings = GaussianRasterizationSettings( - image_height, image_width, tanfovx, tanfovy, bg, - scale_modifier, viewmatrix, projmatrix, sh_degree, - campos, prefiltered, debug - ) - - # Create the GaussianRasterizer - rasterizer = GaussianRasterizer(raster_settings) - - # Measure the time for preprocess_gaussians - start_time = time.time() - means2D, rgb, conic_opacity, radii, depths = rasterizer.preprocess_gaussians( - means3D, scales, rotations, shs, opacities - ) - end_time = time.time() - - preprocess_time = end_time - start_time - print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds") - def test_batched_gaussian_rasterizer(): # Set up the viewpoint cameras @@ -104,7 +59,7 @@ def test_batched_gaussian_rasterizer(): bg_color = torch.ones(3).cuda() scaling_modifier = 1.0 pc = type('PC', (), {}) - pc.active_sh_degree = 3 + pc.active_sh_degree = SH_ACTIVE_DEGREE pipe = type('Pipe', (), {}) pipe.debug = False mode = "train" @@ -207,39 +162,40 @@ def test_batched_gaussian_rasterizer_batch_processing(): bg_color = torch.ones(3).cuda() scaling_modifier = 1.0 pc = type('PC', (), {}) - pc.active_sh_degree = 3 + pc.active_sh_degree = SH_ACTIVE_DEGREE pipe = type('Pipe', (), {}) pipe.debug = False mode = "train" # Set up rasterization configuration for the batch - batched_tanfovx = torch.tensor([math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras]).cuda() - batched_tanfovy = torch.tensor([math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras]).cuda() - batched_viewmatrix = torch.stack([camera.world_view_transform for camera in batched_viewpoint_cameras]).cuda() - batched_projmatrix = torch.stack([camera.full_proj_transform for camera in batched_viewpoint_cameras]).cuda() - batched_campos = torch.stack([camera.camera_center for camera in batched_viewpoint_cameras]).cuda() - - batched_raster_settings = GaussianRasterizationSettings( - image_height=int(batched_viewpoint_cameras[0].image_height), - image_width=int(batched_viewpoint_cameras[0].image_width), - tanfovx=batched_tanfovx, - tanfovy=batched_tanfovy, - bg=bg_color, - scale_modifier=scaling_modifier, - viewmatrix=batched_viewmatrix, - projmatrix=batched_projmatrix, - sh_degree=pc.active_sh_degree, - campos=batched_campos, - prefiltered=False, - debug=pipe.debug - ) + batched_raster_settings = [] + batched_cuda_args = [] + for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)): + ########## [START] Prepare CUDA Rasterization Settings ########## + cuda_args = get_cuda_args(strategy, mode) + batched_cuda_args.append(cuda_args) + tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) + tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) + + raster_settings = GaussianRasterizationSettings( + image_height=int(batched_viewpoint_cameras[0].image_height), + image_width=int(batched_viewpoint_cameras[0].image_width), + tanfovx=tanfovx, + tanfovy=tanfovy, + bg=bg_color, + scale_modifier=scaling_modifier, + viewmatrix=viewpoint_camera.world_view_transform, + projmatrix=viewpoint_camera.full_proj_transform, + sh_degree=pc.active_sh_degree, + campos=viewpoint_camera.camera_center, + prefiltered=False, + debug=pipe.debug + ) + batched_raster_settings.append(raster_settings) # Create the GaussianRasterizer for the batch rasterizer = GaussianRasterizerBatches(raster_settings=batched_raster_settings) - # Set up CUDA arguments for the batch - cuda_args = get_cuda_args(batched_strategies[0], mode) # TODO: Check if this is correct for the batch - # Preprocess the Gaussians for the entire batch batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians( means3D=means3D, @@ -247,7 +203,7 @@ def test_batched_gaussian_rasterizer_batch_processing(): rotations=rotations, shs=shs, opacities=opacity, - batched_cuda_args=cuda_args + batched_cuda_args=batched_cuda_args ) end_time = time.time() preprocess_time = end_time - start_time From 7ff2fd31b038cafd942e680018793004fc6815ba Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Sat, 27 Apr 2024 20:33:22 -0400 Subject: [PATCH 45/53] Refactor rasterization_tests.py to use raster_settings_batch instead of batched_raster_settings --- rasterization_tests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 2452cbf..f69cc6e 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -168,7 +168,7 @@ def test_batched_gaussian_rasterizer_batch_processing(): mode = "train" # Set up rasterization configuration for the batch - batched_raster_settings = [] + raster_settings_batch = [] batched_cuda_args = [] for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)): ########## [START] Prepare CUDA Rasterization Settings ########## @@ -191,10 +191,10 @@ def test_batched_gaussian_rasterizer_batch_processing(): prefiltered=False, debug=pipe.debug ) - batched_raster_settings.append(raster_settings) + raster_settings_batch.append(raster_settings) # Create the GaussianRasterizer for the batch - rasterizer = GaussianRasterizerBatches(raster_settings=batched_raster_settings) + rasterizer = GaussianRasterizerBatches(raster_settings_batch=raster_settings_batch) # Preprocess the Gaussians for the entire batch batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians( From fc48eece6f933afda42ab7ca19690b667f71b186 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Sat, 27 Apr 2024 20:45:00 -0400 Subject: [PATCH 46/53] fixed namedtuple setting bug --- diff_gaussian_rasterization/__init__.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py index 6be6778..fe4a910 100644 --- a/diff_gaussian_rasterization/__init__.py +++ b/diff_gaussian_rasterization/__init__.py @@ -60,18 +60,23 @@ def forward( # Restructure arguments the way that the C++ lib expects them if isinstance(raster_settings, list): - rs = raster_settings[0] - rs.viewmatrix, rs.projmatrix, rs.campos = [ + viewmatrix, projmatrix, campos = [ torch.stack(tensors) for tensors in zip( *[(rs.viewmatrix, rs.projmatrix, rs.campos) for rs in raster_settings] ) ] - rs.tanfovx, rs.tanfovy = [ + tanfovx, tanfovy = [ torch.tensor(vals, device=means3D.device) for vals in zip(*[(rs.tanfovx, rs.tanfovy) for rs in raster_settings]) ] - raster_settings = rs - + raster_settings = raster_settings[0]._replace( + tanfovx=tanfovx, + tanfovy=tanfovy, + viewmatrix=viewmatrix, + projmatrix=projmatrix, + campos=campos + ) + args = ( means3D, scales, From 49c517978304a5c7d183bbd714fb616de46a7f85 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Sat, 27 Apr 2024 20:53:21 -0400 Subject: [PATCH 47/53] Refactor GaussianRasterizationSettings class to handle raster_settings as a batch --- diff_gaussian_rasterization/__init__.py | 2 -- rasterization_tests.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py index fe4a910..41b6a43 100644 --- a/diff_gaussian_rasterization/__init__.py +++ b/diff_gaussian_rasterization/__init__.py @@ -99,10 +99,8 @@ def forward( # TODO: update this. if not torch.is_tensor(raster_settings.tanfovx): - print("normal batch calling") num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args) else: - print("improved batch calling") num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians_batched(*args) # Keep relevant tensors for backward diff --git a/rasterization_tests.py b/rasterization_tests.py index f69cc6e..7501854 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -203,7 +203,7 @@ def test_batched_gaussian_rasterizer_batch_processing(): rotations=rotations, shs=shs, opacities=opacity, - batched_cuda_args=batched_cuda_args + batched_cuda_args=batched_cuda_args[0] #TODO: look into sending list of cuda_args/strategies ) end_time = time.time() preprocess_time = end_time - start_time From a0d7127c4d80844d4566a779c520c1c284f6a543 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Sat, 27 Apr 2024 21:01:03 -0400 Subject: [PATCH 48/53] Update setup.py to remove debug flag from extra_compile_args --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3d16c4d..03b2df8 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ headers=[ "config.h" ], - extra_compile_args={"nvcc": ["-g", "-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/")]}) + extra_compile_args={"nvcc": ["-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/")]}) ], cmdclass={ 'build_ext': BuildExtension From a21c4b9303e56a901ea77555f8e767f1a0940a17 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Sat, 27 Apr 2024 21:01:27 -0400 Subject: [PATCH 49/53] Fix formatting issues in forward.cu and __init__.py --- cuda_rasterizer/forward.cu | 8 +++----- diff_gaussian_rasterization/__init__.py | 2 -- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 567c7ae..fd1732a 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -28,9 +28,8 @@ __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const dir = dir / glm::length(dir); glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; - glm::vec3 result = SH_C0 * sh[0]; - + if (deg > 0) { float x = dir.x; @@ -251,7 +250,6 @@ __global__ void preprocessCUDA(int P, int D, int M, // Transform point by projecting float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; - float4 p_hom = transformPoint4x4(p_orig, projmatrix); float p_w = 1.0f / (p_hom.w + 0.0000001f); float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; @@ -270,7 +268,7 @@ __global__ void preprocessCUDA(int P, int D, int M, } // Compute 2D screen-space covariance matrix float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix); - + // Invert covariance (EWA algorithm) float det = (cov.x * cov.z - cov.y * cov.y); if (det == 0.0f) @@ -295,7 +293,7 @@ __global__ void preprocessCUDA(int P, int D, int M, // If colors have been precomputed, use them, otherwise convert // spherical harmonics coefficients to RGB color. if (colors_precomp == nullptr) - { + { glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped); rgb[idx * C + 0] = result.x; rgb[idx * C + 1] = result.y; diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py index 41b6a43..7667063 100644 --- a/diff_gaussian_rasterization/__init__.py +++ b/diff_gaussian_rasterization/__init__.py @@ -115,8 +115,6 @@ def forward( # means2D = torch.cat((means2D, means2D_pad), dim = 1).contiguous() return means2D, rgb, conic_opacity, radii, depths - - @staticmethod # TODO: gradient for conic_opacity is tricky. because cuda render backward generate dL_dconic and dL_dopacity sperately. def backward(ctx, grad_means2D, grad_rgb, grad_conic_opacity, grad_radii, grad_depths): # grad_radii, grad_depths should be all None. From 25c6812f1eb4b654588df3f6d1df664528c256bd Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Sat, 27 Apr 2024 21:11:10 -0400 Subject: [PATCH 50/53] Refactor computeColorFromSH function in forward.cu to use point_idx and result_idx instead of only idx. --- cuda_rasterizer/forward.cu | 69 +++++--------------------------------- 1 file changed, 8 insertions(+), 61 deletions(-) diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index fd1732a..5f83847 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -18,16 +18,16 @@ namespace cg = cooperative_groups; // Forward method for converting the input spherical harmonics // coefficients of each Gaussian to a simple RGB color. -__device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped) +__device__ glm::vec3 computeColorFromSH(int point_idx, int result_idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped) { // The implementation is loosely based on code for // "Differentiable Point-Based Radiance Fields for // Efficient View Synthesis" by Zhang et al. (2022) - glm::vec3 pos = means[idx]; + glm::vec3 pos = means[point_idx]; glm::vec3 dir = pos - campos; dir = dir / glm::length(dir); - glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; + glm::vec3* sh = ((glm::vec3*)shs) + point_idx * max_coeffs; glm::vec3 result = SH_C0 * sh[0]; if (deg > 0) @@ -65,62 +65,9 @@ __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const // RGB colors are clamped to positive values. If values are // clamped, we need to keep track of this for the backward pass. - clamped[3 * idx + 0] = (result.x < 0); - clamped[3 * idx + 1] = (result.y < 0); - clamped[3 * idx + 2] = (result.z < 0); - return glm::max(result, 0.0f); -} -__device__ glm::vec3 computeColorFromSHBatched(int idx, int view_idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped) -{ - // The implementation is loosely based on code for - // "Differentiable Point-Based Radiance Fields for - // Efficient View Synthesis" by Zhang et al. (2022) - glm::vec3 pos = means[idx]; - glm::vec3 dir = pos - campos; - dir = dir / glm::length(dir); - - glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; - - glm::vec3 result = SH_C0 * sh[0]; - - if (deg > 0) - { - float x = dir.x; - float y = dir.y; - float z = dir.z; - result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3]; - - if (deg > 1) - { - float xx = x * x, yy = y * y, zz = z * z; - float xy = x * y, yz = y * z, xz = x * z; - result = result + - SH_C2[0] * xy * sh[4] + - SH_C2[1] * yz * sh[5] + - SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] + - SH_C2[3] * xz * sh[7] + - SH_C2[4] * (xx - yy) * sh[8]; - - if (deg > 2) - { - result = result + - SH_C3[0] * y * (3.0f * xx - yy) * sh[9] + - SH_C3[1] * xy * z * sh[10] + - SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] + - SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] + - SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] + - SH_C3[5] * z * (xx - yy) * sh[14] + - SH_C3[6] * x * (xx - 3.0f * yy) * sh[15]; - } - } - } - result += 0.5f; - - // RGB colors are clamped to positive values. If values are - // clamped, we need to keep track of this for the backward pass. - clamped[3 * view_idx + 0] = (result.x < 0); - clamped[3 * view_idx + 1] = (result.y < 0); - clamped[3 * view_idx + 2] = (result.z < 0); + clamped[3 * result_idx + 0] = (result.x < 0); + clamped[3 * result_idx + 1] = (result.y < 0); + clamped[3 * result_idx + 2] = (result.z < 0); return glm::max(result, 0.0f); } @@ -294,7 +241,7 @@ __global__ void preprocessCUDA(int P, int D, int M, // spherical harmonics coefficients to RGB color. if (colors_precomp == nullptr) { - glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped); + glm::vec3 result = computeColorFromSH(idx, idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped); rgb[idx * C + 0] = result.x; rgb[idx * C + 1] = result.y; rgb[idx * C + 2] = result.z; @@ -631,7 +578,7 @@ __global__ void preprocessCUDABatched( if (colors_precomp == nullptr) { - glm::vec3 result = computeColorFromSHBatched(point_idx,idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped); + glm::vec3 result = computeColorFromSH(point_idx, idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped); rgb[idx * C + 0] = result.x; rgb[idx * C + 1] = result.y; rgb[idx * C + 2] = result.z; From 1b7fdc4bdc4358db42b89f2bc4f1c9979f2438f6 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Tue, 7 May 2024 23:33:04 -0400 Subject: [PATCH 51/53] replaced python time with torch event records --- rasterization_tests.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/rasterization_tests.py b/rasterization_tests.py index 7501854..b140aab 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -9,7 +9,7 @@ GaussianRasterizerBatches, ) -num_gaussians = 10000 +num_gaussians = 20000 num_batches=32 means3D = torch.randn(num_gaussians, 3).cuda() scales = torch.randn(num_gaussians, 3).cuda() @@ -73,8 +73,10 @@ def test_batched_gaussian_rasterizer(): batched_depths=[] batched_rgb=[] - start_time = time.time() - + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + start_event.record() for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)): ########## [START] Prepare CUDA Rasterization Settings ########## cuda_args = get_cuda_args(strategy, mode) @@ -124,9 +126,10 @@ def test_batched_gaussian_rasterizer(): batched_depths.append(depths) - end_time = time.time() - preprocess_time = end_time - start_time - print(f"Time taken by test_batched_gaussian_rasterizer: {preprocess_time:.4f} seconds") + end_event.record() + torch.cuda.synchronize() + elapsed_time_ms = start_event.elapsed_time(end_event) + print(f"Time taken by test_batched_gaussian_rasterizer: {elapsed_time_ms:.4f} ms") # Perform further operations with the batched results # Test results and performance @@ -141,7 +144,10 @@ def test_batched_gaussian_rasterizer(): def test_batched_gaussian_rasterizer_batch_processing(): # Set up the input data - start_time = time.time() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() # Wait for the events to be recorded! + start_event.record() # Set up the viewpoint cameras batched_viewpoint_cameras = [] for _ in range(num_batches): @@ -205,9 +211,10 @@ def test_batched_gaussian_rasterizer_batch_processing(): opacities=opacity, batched_cuda_args=batched_cuda_args[0] #TODO: look into sending list of cuda_args/strategies ) - end_time = time.time() - preprocess_time = end_time - start_time - print(f"Time taken by test_batched_gaussian_rasterizer_batch_processing: {preprocess_time:.4f} seconds") + end_event.record() + torch.cuda.synchronize() # Wait for the events to be recorded! + elapsed_time_ms = start_event.elapsed_time(end_event) + print(f"Time taken by test_batched_gaussian_rasterizer_batch_processing: {elapsed_time_ms:.4f} ms") # TODO: make the below work # if mode == "train": From 3c4c66775ed187e19c462d75e30598d70a0d9f94 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Wed, 8 May 2024 00:13:11 -0400 Subject: [PATCH 52/53] fixed cuda illegal memory bug and can run for 1M gaussians --- cuda_rasterizer/rasterizer_impl.cu | 4 ++-- rasterization_tests.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu index 06e95db..44d09f4 100644 --- a/cuda_rasterizer/rasterizer_impl.cu +++ b/cuda_rasterizer/rasterizer_impl.cu @@ -472,8 +472,8 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches( // In sep_rendering==True case, we will compute tiles_touched in the renderForward. // TODO: remove it later by modifying FORWARD::preprocess when we deprecate sep_rendering==False case uint32_t* tiles_touched_temp_buffer; - CHECK_CUDA(cudaMalloc(&tiles_touched_temp_buffer, P * sizeof(uint32_t)), debug); - CHECK_CUDA(cudaMemset(tiles_touched_temp_buffer, 0, P * sizeof(uint32_t)), debug); + CHECK_CUDA(cudaMalloc(&tiles_touched_temp_buffer, num_viewpoints * P * sizeof(uint32_t)), debug); + CHECK_CUDA(cudaMemset(tiles_touched_temp_buffer, 0, num_viewpoints * P * sizeof(uint32_t)), debug); timer.start("10 preprocess"); // Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB) diff --git a/rasterization_tests.py b/rasterization_tests.py index b140aab..90afdcd 100644 --- a/rasterization_tests.py +++ b/rasterization_tests.py @@ -1,5 +1,4 @@ import math -import time import torch @@ -9,8 +8,8 @@ GaussianRasterizerBatches, ) -num_gaussians = 20000 -num_batches=32 +num_gaussians = 1000000 +num_batches=64 means3D = torch.randn(num_gaussians, 3).cuda() scales = torch.randn(num_gaussians, 3).cuda() rotations = torch.randn(num_gaussians, 4).cuda() From 1e4cbc9845807daef10358252322d9314d56c4c2 Mon Sep 17 00:00:00 2001 From: Sandeep Menon Date: Wed, 8 May 2024 00:15:59 -0400 Subject: [PATCH 53/53] chore: Update .gitignore to ignore *.pyc files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index aa1ae78..77db517 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ diff_gaussian_rasterization.egg-info/ dist/ diff_gaussian_rasterization/__pycache__/ *so +*.pyc \ No newline at end of file