From 6aed776aa54c810bc6fdf5fe6ceba10c39665a00 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Sat, 20 Apr 2024 19:17:58 -0400
Subject: [PATCH 01/53] test function for rasterization tests

---
 rasterization_tests.py | 167 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 rasterization_tests.py

diff --git a/rasterization_tests.py b/rasterization_tests.py
new file mode 100644
index 0000000..055727c
--- /dev/null
+++ b/rasterization_tests.py
@@ -0,0 +1,167 @@
+import math
+import time
+
+import torch
+
+from diff_gaussian_rasterization import (
+    GaussianRasterizationSettings,
+    GaussianRasterizer,
+)
+
+
+def get_cuda_args(strategy, mode="train"):
+    cuda_args = {
+        "mode": mode,
+        "world_size": "1",
+        "global_rank": "0",
+        "local_rank": "0",
+        "mp_world_size": "1",
+        "mp_rank": "0",
+        "log_folder": "./logs",
+        "log_interval": "10",
+        "iteration": "0",
+        "zhx_debug": "False",
+        "zhx_time": "False",
+        "dist_global_strategy": "default",
+        "avoid_pixel_all2all": False,
+        "stats_collector": {},
+    }
+    return cuda_args
+
+def test_gaussian_rasterizer_time():
+    # Set up the input data
+    num_gaussians = 10000
+    means3D = torch.randn(num_gaussians, 3).cuda()
+    scales = torch.randn(num_gaussians, 3).cuda()
+    rotations = torch.randn(num_gaussians, 3, 3).cuda()
+    shs = torch.randn(num_gaussians, 9).cuda()
+    opacities = torch.randn(num_gaussians, 1).cuda()
+
+    # Set up the rasterization settings
+    image_height = 512
+    image_width = 512
+    tanfovx = 1.0
+    tanfovy = 1.0
+    bg = torch.ones(3).cuda()
+    scale_modifier = 1.0
+    viewmatrix = torch.eye(4).cuda()
+    projmatrix = torch.eye(4).cuda()
+    sh_degree = 2
+    campos = torch.zeros(3).cuda()
+    prefiltered = False
+    debug = False
+    
+    # mode="train"
+    # strategy=None
+    # cuda_args = get_cuda_args(strategy, mode)
+
+    raster_settings = GaussianRasterizationSettings(
+        image_height, image_width, tanfovx, tanfovy, bg,
+        scale_modifier, viewmatrix, projmatrix, sh_degree,
+        campos, prefiltered, debug
+    )
+
+    # Create the GaussianRasterizer
+    rasterizer = GaussianRasterizer(raster_settings)
+
+    # Measure the time for preprocess_gaussians
+    start_time = time.time()
+    means2D, rgb, conic_opacity, radii, depths = rasterizer.preprocess_gaussians(
+        means3D, scales, rotations, shs, opacities
+    )
+    end_time = time.time()
+
+    preprocess_time = end_time - start_time
+    print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds")
+
+
+def test_batched_gaussian_rasterizer():
+    # Set up the input data
+    num_gaussians = 10000
+    num_batches = 4
+    means3D = torch.randn(num_gaussians, 3).cuda()
+    scales = torch.randn(num_gaussians, 3).cuda()
+    rotations = torch.randn(num_gaussians, 3, 3).cuda()
+    shs = torch.randn(num_gaussians, 9).cuda()
+    opacity = torch.randn(num_gaussians, 1).cuda()
+
+    # Set up the viewpoint cameras
+    batched_viewpoint_cameras = []
+    for _ in range(num_batches):
+        viewpoint_camera = type('ViewpointCamera', (), {})
+        viewpoint_camera.FoVx = math.radians(60)
+        viewpoint_camera.FoVy = math.radians(60)
+        viewpoint_camera.image_height = 512
+        viewpoint_camera.image_width = 512
+        viewpoint_camera.world_view_transform = torch.eye(4).cuda()
+        viewpoint_camera.full_proj_transform = torch.eye(4).cuda()
+        viewpoint_camera.camera_center = torch.zeros(3).cuda()
+        batched_viewpoint_cameras.append(viewpoint_camera)
+
+    # Set up the strategies
+    batched_strategies = [None] * num_batches
+
+    # Set up other parameters
+    bg_color = torch.ones(3).cuda()
+    scaling_modifier = 1.0
+    pc = type('PC', (), {})
+    pc.active_sh_degree = 2
+    pipe = type('Pipe', (), {})
+    pipe.debug = False
+    mode = "train"
+
+    batched_rasterizers = []
+    batched_cuda_args = []
+    batched_screenspace_params = []
+    batched_means2D = []
+    batched_radii = []
+
+    for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)):
+        ########## [START] Prepare CUDA Rasterization Settings ##########
+        cuda_args = get_cuda_args(strategy, mode)
+        batched_cuda_args.append(cuda_args)
+
+        # Set up rasterization configuration
+        tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
+        tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)
+        raster_settings = GaussianRasterizationSettings(
+            image_height=int(viewpoint_camera.image_height),
+            image_width=int(viewpoint_camera.image_width),
+            tanfovx=tanfovx,
+            tanfovy=tanfovy,
+            bg=bg_color,
+            scale_modifier=scaling_modifier,
+            viewmatrix=viewpoint_camera.world_view_transform,
+            projmatrix=viewpoint_camera.full_proj_transform,
+            sh_degree=pc.active_sh_degree,
+            campos=viewpoint_camera.camera_center,
+            prefiltered=False,
+            debug=pipe.debug
+        )
+        rasterizer = GaussianRasterizer(raster_settings=raster_settings)
+        ########## [END] Prepare CUDA Rasterization Settings ##########
+
+        #[3DGS-wise preprocess]
+        means2D, rgb, conic_opacity, radii, depths = rasterizer.preprocess_gaussians(
+            means3D=means3D,
+            scales=scales,
+            rotations=rotations,
+            shs=shs,
+            opacities=opacity,
+            cuda_args=cuda_args
+        )
+
+        if mode == "train":
+            means2D.retain_grad()
+
+        batched_means2D.append(means2D)
+        screenspace_params = [means2D, rgb, conic_opacity, radii, depths]
+        batched_rasterizers.append(rasterizer)
+        batched_screenspace_params.append(screenspace_params)
+        batched_radii.append(radii)
+
+    # Perform further operations with the batched results
+    # ...
+
+if __name__ == "__main__":
+    test_gaussian_rasterizer_time()
\ No newline at end of file

From b7b08baa5c6bcaf9c2bd38fb4b274fcc5198237a Mon Sep 17 00:00:00 2001
From: Prapti Devansh Trivedi <pt2295@log-2.hpc.nyu.edu>
Date: Sat, 20 Apr 2024 19:51:45 -0400
Subject: [PATCH 02/53] add mock of improved preproc

---
 diff_gaussian_rasterization/__init__.py | 25 ++++++++
 rasterization_tests.py                  | 84 +++++++++++++++++++++++++
 2 files changed, 109 insertions(+)

diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py
index 7e3ad04..73ab7e9 100644
--- a/diff_gaussian_rasterization/__init__.py
+++ b/diff_gaussian_rasterization/__init__.py
@@ -304,6 +304,31 @@ class GaussianRasterizationSettings(NamedTuple):
     prefiltered : bool
     debug : bool
 
+class GaussianRasterizerBatches(nn.Module):
+    def __init__(self, raster_settings):
+        super().__init__()
+        self.raster_settings_list = raster_settings
+
+    def markVisible(self, positions):
+        # Mark visible points (based on frustum culling for camera) with a boolean 
+        with torch.no_grad():
+            visible = []
+            for viewmatrix, projmatrix in zip(self.raster_settings.viewmatrix, self.raster_settings.projmatrix):
+                visible.append(_C.mark_visible(positions, viewmatrix, projmatrix))
+        return visible
+
+    def preprocess_gaussians(self, means3D, scales, rotations, shs, opacities, batched_cuda_args=None):
+        # Invoke C++/CUDA rasterization routine
+        
+            return preprocess_gaussians_batches(
+                means3D,
+                scales,
+                rotations,
+                shs,
+                opacities,
+                self.raster_settings_list,
+                batched_cuda_args)
+
 class GaussianRasterizer(nn.Module):
     def __init__(self, raster_settings):
         super().__init__()
diff --git a/rasterization_tests.py b/rasterization_tests.py
index 055727c..1dcc91a 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -74,6 +74,88 @@ def test_gaussian_rasterizer_time():
     preprocess_time = end_time - start_time
     print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds")
 
+def test_improved_gaussian_rasterizer():
+
+    # Set up the input data
+    num_gaussians = 10000
+    num_batches = 4
+    means3D = torch.randn(num_gaussians, 3).cuda()
+    scales = torch.randn(num_gaussians, 3).cuda()
+    rotations = torch.randn(num_gaussians, 3, 3).cuda()
+    shs = torch.randn(num_gaussians, 9).cuda()
+    opacity = torch.randn(num_gaussians, 1).cuda()
+
+    # Set up the viewpoint cameras
+    batched_viewpoint_cameras = []
+    for _ in range(num_batches):
+        viewpoint_camera = type('ViewpointCamera', (), {})
+        viewpoint_camera.FoVx = math.radians(60)
+        viewpoint_camera.FoVy = math.radians(60)
+        viewpoint_camera.image_height = 512
+        viewpoint_camera.image_width = 512
+        viewpoint_camera.world_view_transform = torch.eye(4).cuda()
+        viewpoint_camera.full_proj_transform = torch.eye(4).cuda()
+        viewpoint_camera.camera_center = torch.zeros(3).cuda()
+        batched_viewpoint_cameras.append(viewpoint_camera)
+
+    # Set up the strategies
+    batched_strategies = [None] * num_batches
+
+    # Set up other parameters
+    bg_color = torch.ones(3).cuda()
+    scaling_modifier = 1.0
+    pc = type('PC', (), {})
+    pc.active_sh_degree = 2
+    pipe = type('Pipe', (), {})
+    pipe.debug = False
+    mode = "train"
+
+    batched_rasterizers = []
+    batched_cuda_args = []
+    batched_screenspace_params = []
+    batched_means2D = []
+    batched_radii = []
+    raster_settings_list=[]
+    for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)):
+        ########## [START] Prepare CUDA Rasterization Settings ##########
+        cuda_args = get_cuda_args(strategy, mode)
+        batched_cuda_args.append(cuda_args)
+
+        # Set up rasterization configuration
+        tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
+        tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)
+
+        raster_settings_list.append(GaussianRasterizationSettings(
+            image_height=int(viewpoint_camera.image_height),
+            image_width=int(viewpoint_camera.image_width),
+            tanfovx=tanfovx,
+            tanfovy=tanfovy,
+            bg=bg_color,
+            scale_modifier=scaling_modifier,
+            viewmatrix=viewpoint_camera.world_view_transform,
+            projmatrix=viewpoint_camera.full_proj_transform,
+            sh_degree=pc.active_sh_degree,
+            campos=viewpoint_camera.camera_center,
+            prefiltered=False,
+            debug=pipe.debug
+        ))
+
+
+    rasterizer=GaussianRasterizerBatches(raster_settings=raster_settings_list)
+    start_time = time.time()
+    batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians_batches(
+            means3D=means3D,
+            scales=scales,
+            rotations=rotations,
+            shs=shs,
+            opacities=opacity,
+            cuda_args=batched_cuda_args
+        )
+    end_time = time.time()
+
+    preprocess_time = end_time - start_time
+    print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds")
+
 
 def test_batched_gaussian_rasterizer():
     # Set up the input data
@@ -163,5 +245,7 @@ def test_batched_gaussian_rasterizer():
     # Perform further operations with the batched results
     # ...
 
+
+
 if __name__ == "__main__":
     test_gaussian_rasterizer_time()
\ No newline at end of file

From 54302e4381903500d89ee8e8914d4a3dd638b812 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Sun, 21 Apr 2024 10:20:52 -0400
Subject: [PATCH 03/53] batched rasterization

---
 rasterization_tests.py | 92 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 90 insertions(+), 2 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 055727c..1aca6de 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -161,7 +161,95 @@ def test_batched_gaussian_rasterizer():
         batched_radii.append(radii)
 
     # Perform further operations with the batched results
-    # ...
+    # Test results and performance
+    
+def test_batched_gaussian_rasterizer_batch_processing():
+    # Set up the input data
+    num_gaussians = 10000
+    num_batches = 4
+    means3D = torch.randn(num_gaussians, 3).cuda()
+    scales = torch.randn(num_gaussians, 3).cuda()
+    rotations = torch.randn(num_gaussians, 3, 3).cuda()
+    shs = torch.randn(num_gaussians, 9).cuda()
+    opacity = torch.randn(num_gaussians, 1).cuda()
+
+    # Set up the viewpoint cameras
+    batched_viewpoint_cameras = []
+    for _ in range(num_batches):
+        viewpoint_camera = type('ViewpointCamera', (), {})
+        viewpoint_camera.FoVx = math.radians(60)
+        viewpoint_camera.FoVy = math.radians(60)
+        viewpoint_camera.image_height = 512
+        viewpoint_camera.image_width = 512
+        viewpoint_camera.world_view_transform = torch.eye(4).cuda()
+        viewpoint_camera.full_proj_transform = torch.eye(4).cuda()
+        viewpoint_camera.camera_center = torch.zeros(3).cuda()
+        batched_viewpoint_cameras.append(viewpoint_camera)
+
+    # Set up the strategies
+    batched_strategies = [None] * num_batches
+
+    # Set up other parameters
+    bg_color = torch.ones(3).cuda()
+    scaling_modifier = 1.0
+    pc = type('PC', (), {})
+    pc.active_sh_degree = 2
+    pipe = type('Pipe', (), {})
+    pipe.debug = False
+    mode = "train"
+
+    # Set up rasterization configuration for the batch
+    batched_tanfovx = [math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras]
+    batched_tanfovy = [math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras]
+    batched_viewmatrix = [camera.world_view_transform for camera in batched_viewpoint_cameras]
+    batched_projmatrix = [camera.full_proj_transform for camera in batched_viewpoint_cameras]
+    batched_campos = [camera.camera_center for camera in batched_viewpoint_cameras]
+
+    batched_raster_settings = GaussianRasterizationSettings(
+        image_height=int(batched_viewpoint_cameras[0].image_height),
+        image_width=int(batched_viewpoint_cameras[0].image_width),
+        tanfovx=batched_tanfovx,
+        tanfovy=batched_tanfovy,
+        bg=bg_color,
+        scale_modifier=scaling_modifier,
+        viewmatrix=batched_viewmatrix,
+        projmatrix=batched_projmatrix,
+        sh_degree=pc.active_sh_degree,
+        campos=batched_campos,
+        prefiltered=False,
+        debug=pipe.debug
+    )
+
+    # Create the GaussianRasterizer for the batch
+    rasterizer = GaussianRasterizer(raster_settings=batched_raster_settings)
+
+    # Set up CUDA arguments for the batch
+    cuda_args = get_cuda_args(batched_strategies[0], mode)  # TODO: Check if this is correct for the batch
+
+    # Preprocess the Gaussians for the entire batch
+    batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians(
+        means3D=means3D,
+        scales=scales,
+        rotations=rotations,
+        shs=shs,
+        opacities=opacity,
+        cuda_args=cuda_args
+    )
+
+    if mode == "train":
+        batched_means2D.retain_grad()
+
+    batched_screenspace_params = [batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths]
+
+    # Perform assertions on the preprocessed data
+    assert batched_means2D.shape == (num_gaussians, num_batches, 2)
+    assert batched_rgb.shape == (num_gaussians, num_batches, 3)
+    assert batched_conic_opacity.shape == (num_gaussians, num_batches, 1)
+    assert batched_radii.shape == (num_gaussians, num_batches)
+    assert batched_depths.shape == (num_gaussians, num_batches)
+
 
 if __name__ == "__main__":
-    test_gaussian_rasterizer_time()
\ No newline at end of file
+    test_gaussian_rasterizer_time()
+    test_batched_gaussian_rasterizer_batch_processing()
+    
\ No newline at end of file

From 53e12d2067a0bf3fe7eb00f6e85e53ec35ac07e3 Mon Sep 17 00:00:00 2001
From: Prapti Devansh Trivedi <pt2295@log-2.hpc.nyu.edu>
Date: Sun, 21 Apr 2024 19:32:02 -0400
Subject: [PATCH 04/53] add rough idea for kernel

---
 cuda_rasterizer/rasterizer_impl.cu      |  95 +++++++++++++++++++
 diff_gaussian_rasterization/__init__.py | 119 ++++++++++++++++--------
 ext.cpp                                 |   1 +
 rasterization_tests.py                  |   2 +-
 rasterize_points.cu                     |  82 ++++++++++++++++
 5 files changed, 260 insertions(+), 39 deletions(-)

diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu
index dec20fa..1215c25 100644
--- a/cuda_rasterizer/rasterizer_impl.cu
+++ b/cuda_rasterizer/rasterizer_impl.cu
@@ -424,6 +424,101 @@ int CudaRasterizer::Rasterizer::preprocessForward(
 	return num_rendered;
 }
 
+
+int CudaRasterizer::Rasterizer::preprocessForwardBatches(
+	float2* means2D,
+	float* depths,
+	int* radii,
+	float* cov3D,
+	float4* conic_opacity,
+	float* rgb,
+	bool* clamped,//the above are all per-Gaussian intemediate results.
+	const int P, int D, int M,
+	const std::vector<int>& width, std::vector<int>& height,
+	const float* means3D,
+	const float* scales,
+	const float* rotations,
+	const float* shs,
+	const float* opacities,//3dgs parameters
+	const std::vector<float>& scale_modifier,
+	const std::vector<torch::Tensor>& viewmatrix,
+	const std::vector<torch::Tensor>& projmatrix,
+	const std::vector<float>& cam_pos,
+	const std::vector<float>& tan_fovx, std::vector<float>& tan_fovy,
+	const std::vector<bool>& prefiltered,
+	std::vector<bool>& debug,//raster_settings
+	const std::vector<pybind11::dict> &args)
+{
+	auto [global_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);
+	char* log_tmp = new char[500];
+
+	// print out the environment variables
+	if (mode == "train" && zhx_debug && iteration % log_interval == 1) {
+		sprintf(log_tmp, "world_size: %d, global_rank: %d, iteration: %d, log_folder: %s, zhx_debug: %d, zhx_time: %d, device: %d, log_interval: %d, dist_division_mode: %s", 
+				world_size, global_rank, iteration, log_folder.c_str(), zhx_debug, zhx_time, device, log_interval, dist_division_mode.c_str());
+		save_log_in_file(iteration, global_rank, world_size, log_folder, "cuda", log_tmp);
+	}
+
+	MyTimerOnGPU timer;
+	// const float focal_y = height / (2.0f * tan_fovy);
+	// const float focal_x = width / (2.0f * tan_fovx);
+	const int num_viewpoints=viewmatrix.size();
+
+	//CONVERT ALL VECTORS TO FLOATSSSSSS PRAPTIIIIIII
+
+	dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, num_viewpoints);
+	dim3 block(BLOCK_X, BLOCK_Y, num_viewpoints);
+	int tile_num = tile_grid.x * tile_grid.y*tile_grid.z;
+
+	// allocate temporary buffer for tiles_touched.
+	// In sep_rendering==True case, we will compute tiles_touched in the renderForward. 
+	// TODO: remove it later by modifying FORWARD::preprocess when we deprecate sep_rendering==False case
+	uint32_t* tiles_touched_temp_buffer;
+	CHECK_CUDA(cudaMalloc(&tiles_touched_temp_buffer, P * sizeof(uint32_t)), debug);
+	CHECK_CUDA(cudaMemset(tiles_touched_temp_buffer, 0, P * sizeof(uint32_t)), debug);
+
+	timer.start("10 preprocess");
+	// Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB)
+	CHECK_CUDA(FORWARD::preprocess(
+		P, D, M,
+		means3D,
+		(glm::vec3*)scales,
+		scale_modifier,
+		(glm::vec4*)rotations,
+		opacities,
+		shs,
+		clamped,
+		nullptr,//cov3D_precomp,
+		nullptr,//colors_precomp,TODO: this is correct?
+		viewmatrix, projmatrix,
+		(glm::vec3*)cam_pos,
+		width, height,
+		focal_x, focal_y,
+		tan_fovx, tan_fovy,
+		radii,
+		means2D,
+		depths,
+		cov3D,
+		rgb,
+		conic_opacity,
+		tile_grid,
+		tiles_touched_temp_buffer,
+		prefiltered
+	), debug)
+	timer.stop("10 preprocess");
+
+	int num_rendered = 0;//TODO: should I calculate this here?
+
+	// Print out timing information
+	if (zhx_time && iteration % log_interval == 1) {
+		timer.printAllTimes(iteration, world_size, global_rank, log_folder, true);
+	}
+	delete log_tmp;
+	// free temporary buffer for tiles_touched. TODO: remove it. 
+	CHECK_CUDA(cudaFree(tiles_touched_temp_buffer), debug);
+	return num_rendered;
+}
+
 void CudaRasterizer::Rasterizer::preprocessBackward(
 	const int* radii,
 	const float* cov3D,
diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py
index 73ab7e9..957d9cd 100644
--- a/diff_gaussian_rasterization/__init__.py
+++ b/diff_gaussian_rasterization/__init__.py
@@ -31,7 +31,7 @@ def preprocess_gaussians(
     sh,
     opacities,
     raster_settings,
-    cuda_args,
+    cuda_args,flag_batched=False
 ):
     return _PreprocessGaussians.apply(
         means3D,
@@ -40,7 +40,7 @@ def preprocess_gaussians(
         sh,
         opacities,
         raster_settings,
-        cuda_args,
+        cuda_args,flag_batched
     )
 
 class _PreprocessGaussians(torch.autograd.Function):
@@ -52,45 +52,88 @@ def forward(
         rotations,
         sh,
         opacities,
-        raster_settings,
-        cuda_args,
+        raster_settings_list,
+        batched_cuda_args,flag_batched
     ):
 
         # Restructure arguments the way that the C++ lib expects them
-        args = (
-            means3D,
-            scales,
-            rotations,
-            sh,
-            opacities,# 3dgs' parametes.
-            raster_settings.scale_modifier,
-            raster_settings.viewmatrix,
-            raster_settings.projmatrix,
-            raster_settings.tanfovx,
-            raster_settings.tanfovy,
-            raster_settings.image_height,
-            raster_settings.image_width,
-            raster_settings.sh_degree,
-            raster_settings.campos,
-            raster_settings.prefiltered,
-            raster_settings.debug,#raster_settings
-            cuda_args
-        )
-
-        # TODO: update this. 
-        num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args)
+        if flag_batched==False:
+            args = (
+                means3D,
+                scales,
+                rotations,
+                sh,
+                opacities,# 3dgs' parametes.
+                raster_settings.scale_modifier,
+                raster_settings.viewmatrix,
+                raster_settings.projmatrix,
+                raster_settings.tanfovx,
+                raster_settings.tanfovy,
+                raster_settings.image_height,
+                raster_settings.image_width,
+                raster_settings.sh_degree,
+                raster_settings.campos,
+                raster_settings.prefiltered,
+                raster_settings.debug,#raster_settings
+                cuda_args
+            )
+
+            # TODO: update this. 
+            num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args)
+
+            # Keep relevant tensors for backward
+            ctx.raster_settings = raster_settings
+            ctx.cuda_args = cuda_args
+            ctx.num_rendered = num_rendered
+            ctx.save_for_backward(means3D, scales, rotations, sh, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped)
+            ctx.mark_non_differentiable(radii, depths)
+
+            # # TODO: double check. means2D is padded to (P, 3) in python. It is (P, 2) in cuda code.
+            # means2D_pad = torch.zeros((means2D.shape[0], 1), dtype = means2D.dtype, device = means2D.device)
+            # means2D = torch.cat((means2D, means2D_pad), dim = 1).contiguous()
+            return means2D, rgb, conic_opacity, radii, depths
+
+        else:
+            args_list=[]
+            for raster_settings,cuda_args in zip(raster_settings_list,batched_cuda_args):
+
+                args = (
+                    means3D,
+                    scales,
+                    rotations,
+                    sh,
+                    opacities,# 3dgs' parametes.
+                    raster_settings.scale_modifier,
+                    raster_settings.viewmatrix,
+                    raster_settings.projmatrix,
+                    raster_settings.tanfovx,
+                    raster_settings.tanfovy,
+                    raster_settings.image_height,
+                    raster_settings.image_width,
+                    raster_settings.sh_degree,
+                    raster_settings.campos,
+                    raster_settings.prefiltered,
+                    raster_settings.debug,#raster_settings
+                    cuda_args
+                )
+                args_list.append(args)
+
+            # TODO: update this. 
+            num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians_batches(*args_list)
+
+            # Keep relevant tensors for backward
+            ctx.raster_settings = raster_settings_list
+            ctx.cuda_args = batched_cuda_args
+            ctx.num_rendered = num_rendered
+            ctx.save_for_backward(means3D, scales, rotations, sh, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped)
+            ctx.mark_non_differentiable(radii, depths)
+
+            # # TODO: double check. means2D is padded to (P, 3) in python. It is (P, 2) in cuda code.
+            # means2D_pad = torch.zeros((means2D.shape[0], 1), dtype = means2D.dtype, device = means2D.device)
+            # means2D = torch.cat((means2D, means2D_pad), dim = 1).contiguous()
+            return means2D, rgb, conic_opacity, radii, depths
 
-        # Keep relevant tensors for backward
-        ctx.raster_settings = raster_settings
-        ctx.cuda_args = cuda_args
-        ctx.num_rendered = num_rendered
-        ctx.save_for_backward(means3D, scales, rotations, sh, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped)
-        ctx.mark_non_differentiable(radii, depths)
 
-        # # TODO: double check. means2D is padded to (P, 3) in python. It is (P, 2) in cuda code.
-        # means2D_pad = torch.zeros((means2D.shape[0], 1), dtype = means2D.dtype, device = means2D.device)
-        # means2D = torch.cat((means2D, means2D_pad), dim = 1).contiguous()
-        return means2D, rgb, conic_opacity, radii, depths
 
     @staticmethod # TODO: gradient for conic_opacity is tricky. because cuda render backward generate dL_dconic and dL_dopacity sperately. 
     def backward(ctx, grad_means2D, grad_rgb, grad_conic_opacity, grad_radii, grad_depths):
@@ -320,14 +363,14 @@ def markVisible(self, positions):
     def preprocess_gaussians(self, means3D, scales, rotations, shs, opacities, batched_cuda_args=None):
         # Invoke C++/CUDA rasterization routine
         
-            return preprocess_gaussians_batches(
+            return preprocess_gaussians(
                 means3D,
                 scales,
                 rotations,
                 shs,
                 opacities,
                 self.raster_settings_list,
-                batched_cuda_args)
+                batched_cuda_args,True)
 
 class GaussianRasterizer(nn.Module):
     def __init__(self, raster_settings):
diff --git a/ext.cpp b/ext.cpp
index a957cd2..e4249bb 100644
--- a/ext.cpp
+++ b/ext.cpp
@@ -16,6 +16,7 @@
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("mark_visible", &markVisible);
   m.def("preprocess_gaussians", &PreprocessGaussiansCUDA);
+  m.def("preprocess_gaussians_batched", &PreprocessGaussiansCUDABatches);
   m.def("preprocess_gaussians_backward", &PreprocessGaussiansBackwardCUDA);
   m.def("get_distribution_strategy", &GetDistributionStrategyCUDA);
   m.def("render_gaussians", &RenderGaussiansCUDA);
diff --git a/rasterization_tests.py b/rasterization_tests.py
index 1dcc91a..b26f3cd 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -143,7 +143,7 @@ def test_improved_gaussian_rasterizer():
 
     rasterizer=GaussianRasterizerBatches(raster_settings=raster_settings_list)
     start_time = time.time()
-    batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians_batches(
+    batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians(
             means3D=means3D,
             scales=scales,
             rotations=rotations,
diff --git a/rasterize_points.cu b/rasterize_points.cu
index e4400a6..9a3fb7b 100644
--- a/rasterize_points.cu
+++ b/rasterize_points.cu
@@ -142,6 +142,88 @@ PreprocessGaussiansCUDA(
 	return std::make_tuple(rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped);
 }
 
+std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
+PreprocessGaussiansCUDABatches(
+	const torch::Tensor& means3D,
+	const torch::Tensor& scales,
+	const torch::Tensor& rotations,
+	const torch::Tensor& sh,
+    const torch::Tensor& opacity,//3dgs' parametes.
+	const std::vector<float>& scale_modifier,
+	const std::vector<torch::Tensor>& viewmatrix,
+	const std::vector<torch::Tensor>& projmatrix,
+	const std::vector<float>& tan_fovx, 
+	const std::vector<float>& tan_fovy,
+    const std::vector<int>& image_height,
+    const std::vector<int>& image_width,
+	const std::vector<int>& degree,
+	const std::vector<torch::Tensor>& campos,
+	const std::vector<bool>& prefiltered,//raster_settings
+	const std::vector<bool>& debug,
+	const std::vector<pybind11::dict> &args) {
+
+	if (means3D.ndimension() != 2 || means3D.size(1) != 3) {
+		AT_ERROR("means3D must have dimensions (num_points, 3)");
+	}
+
+	const int P = means3D.size(0);
+	// const int H = image_height;
+	// const int W = image_width;
+
+	// of shape (P, 2). means2D is (P, 2) in cuda. It will be converted to (P, 3) when is sent back to python to meet torch graph's requirement.
+	torch::Tensor means2D = torch::full({P, 2}, 0.0, means3D.options());//TODO: what about require_grads?
+	// of shape (P)
+	torch::Tensor depths = torch::full({P}, 0.0, means3D.options());
+	// of shape (P)
+	torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32));
+	// of shape (P, 6)
+	torch::Tensor cov3D = torch::full({P, 6}, 0.0, means3D.options());
+	// of shape (P, 4)
+	torch::Tensor conic_opacity = torch::full({P, 4}, 0.0, means3D.options());
+	// of shape (P, 3)
+	torch::Tensor rgb = torch::full({P, 3}, 0.0, means3D.options());
+	// of shape (P)
+	torch::Tensor clamped = torch::full({P, 3}, false, means3D.options().dtype(at::kBool));
+	//TODO: compare to original GeometryState implementation, this one does not explicitly do gpu memory alignment. 
+	//That may lead to problems. However, pytorch does implicit memory alignment.
+
+	int rendered = 0;//TODO: I could compute rendered here by summing up geomState.tiles_touched. 
+	if(P != 0)
+	{
+		int M = 0;
+		if(sh.size(0) != 0)
+		{
+			M = sh.size(1);
+		}
+
+		rendered = CudaRasterizer::Rasterizer::preprocessForwardBatches(
+			reinterpret_cast<float2*>(means2D.contiguous().data<float>()),//TODO: check whether it supports float2?
+			depths.contiguous().data<float>(),
+			radii.contiguous().data<int>(),
+			cov3D.contiguous().data<float>(),
+			reinterpret_cast<float4*>(conic_opacity.contiguous().data<float>()),
+			rgb.contiguous().data<float>(),
+			clamped.contiguous().data<bool>(),
+			P, degree, M,
+			image_width, image_height,
+			means3D,
+			scales,
+			rotations,
+			sh,
+			opacity, 
+			scale_modifier,
+			viewmatrix, 
+			projmatrix,
+			campos,
+			tan_fovx,
+			tan_fovy,
+			prefiltered,
+			debug,
+			args);
+	}
+	return std::make_tuple(rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped);
+}
+
 
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
   PreprocessGaussiansBackwardCUDA(

From f0e0469a7141a53cdb6cde62eba59f173b468beb Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Sun, 21 Apr 2024 22:23:59 -0400
Subject: [PATCH 05/53] Refactor rasterizer import in rasterization_tests.py

---
 rasterization_tests.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index f5100f7..ccbde17 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -6,6 +6,7 @@
 from diff_gaussian_rasterization import (
     GaussianRasterizationSettings,
     GaussianRasterizer,
+    GaussianRasterizerBatches,
 )
 
 
@@ -303,7 +304,7 @@ def test_batched_gaussian_rasterizer_batch_processing():
     )
 
     # Create the GaussianRasterizer for the batch
-    rasterizer = GaussianRasterizer(raster_settings=batched_raster_settings)
+    rasterizer = GaussianRasterizerBatches(raster_settings=batched_raster_settings)
 
     # Set up CUDA arguments for the batch
     cuda_args = get_cuda_args(batched_strategies[0], mode)  # TODO: Check if this is correct for the batch

From a16acd034afe77f91f29528d708cc94078e68dfa Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Sun, 21 Apr 2024 22:46:43 -0400
Subject: [PATCH 06/53] Refactor GaussianRasterizerBatches class to support
 batched preprocess_gaussians function.

---
 diff_gaussian_rasterization/__init__.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py
index 73ab7e9..64a826f 100644
--- a/diff_gaussian_rasterization/__init__.py
+++ b/diff_gaussian_rasterization/__init__.py
@@ -9,11 +9,14 @@
 # For inquiries contact  george.drettakis@inria.fr
 #
 
+import time
 from typing import NamedTuple
-import torch.nn as nn
+
 import torch
+import torch.nn as nn
+
 from . import _C
-import time
+
 
 def cpu_deep_copy_tuple(input_tuple):
     copied_tensors = [item.cpu().clone() if isinstance(item, torch.Tensor) else item for item in input_tuple]
@@ -78,7 +81,11 @@ def forward(
         )
 
         # TODO: update this. 
-        num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args)
+        batch_size = len(raster_settings.tanfovx) if isinstance(raster_settings.tanfovx, list) else 1
+        if batch_size == 1:
+            num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args)
+        else:
+            num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians_batched(*args)
 
         # Keep relevant tensors for backward
         ctx.raster_settings = raster_settings
@@ -320,7 +327,7 @@ def markVisible(self, positions):
     def preprocess_gaussians(self, means3D, scales, rotations, shs, opacities, batched_cuda_args=None):
         # Invoke C++/CUDA rasterization routine
         
-            return preprocess_gaussians_batches(
+            return preprocess_gaussians(
                 means3D,
                 scales,
                 rotations,

From 268f46a5cd05f0608de1b7aef6755da3bc8c0765 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Sun, 21 Apr 2024 22:54:13 -0400
Subject: [PATCH 07/53] Refactor preprocess_gaussians function to remove
 flag_batched parameter in __init__.py

---
 diff_gaussian_rasterization/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py
index 8152d7f..ca95f73 100644
--- a/diff_gaussian_rasterization/__init__.py
+++ b/diff_gaussian_rasterization/__init__.py
@@ -34,7 +34,7 @@ def preprocess_gaussians(
     sh,
     opacities,
     raster_settings,
-    cuda_args,flag_batched=False
+    cuda_args
 ):
     return _PreprocessGaussians.apply(
         means3D,
@@ -43,7 +43,7 @@ def preprocess_gaussians(
         sh,
         opacities,
         raster_settings,
-        cuda_args,flag_batched
+        cuda_args
     )
 
 class _PreprocessGaussians(torch.autograd.Function):
@@ -336,7 +336,7 @@ def preprocess_gaussians(self, means3D, scales, rotations, shs, opacities, batch
                 shs,
                 opacities,
                 self.raster_settings_list,
-                batched_cuda_args,True)
+                batched_cuda_args)
 
 class GaussianRasterizer(nn.Module):
     def __init__(self, raster_settings):

From 7361323a6c84edc4b17673e763d8731e3e7bb066 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Mon, 22 Apr 2024 17:12:36 -0400
Subject: [PATCH 08/53] batched forward pass kernel

---
 cuda_rasterizer/forward.cu         | 149 +++++++++++++++++++++++++++++
 cuda_rasterizer/rasterizer_impl.cu |  24 ++---
 rasterization_tests.py             |  10 +-
 rasterize_points.cu                |  68 +++++++------
 4 files changed, 196 insertions(+), 55 deletions(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 9362275..102c6d9 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -498,4 +498,153 @@ void FORWARD::preprocess(int P, int D, int M,
 		tiles_touched,
 		prefiltered
 		);
+}
+
+
+template<int C>
+__global__ void preprocessCUDABatched(
+    int P, int D, int M,
+    const float* orig_points, const glm::vec3* scales, const float scale_modifier,
+    const glm::vec4* rotations, const float* opacities, const float* shs,
+    bool* clamped, const float* cov3D_precomp, const float* colors_precomp,
+    const float* viewmatrix_arr, const float* projmatrix_arr, const glm::vec3* cam_pos,
+    const int W, int H, const float* focal_x, const float* focal_y,
+    const float* tan_fovx, const float* tan_fovy,
+    int* radii, float2* points_xy_image, float* depths, float* cov3Ds,
+    float* rgb, float4* conic_opacity, const dim3 grid, uint32_t* tiles_touched,
+    bool prefiltered, int num_viewpoints)
+{
+    auto point_idx = cg::this_grid().thread_rank();
+    auto viewpoint_idx = blockIdx.z;
+
+    if (viewpoint_idx >= num_viewpoints || point_idx >= P) return;
+
+    auto idx = viewpoint_idx * P + point_idx;
+    const float* viewmatrix = viewmatrix_arr + viewpoint_idx * 16;
+    const float* projmatrix = projmatrix_arr + viewpoint_idx * 16;
+
+    // Initialize radius and touched tiles to 0. If this isn't changed,
+    // this Gaussian will not be processed further.
+    radii[idx] = 0;
+    tiles_touched[idx] = 0;
+
+    // Perform near culling, quit if outside.
+    float3 p_view;
+    if (!in_frustum(point_idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) return;
+
+    // Transform point by projecting
+    float3 p_orig = { orig_points[3 * point_idx], orig_points[3 * point_idx + 1], orig_points[3 * point_idx + 2] };
+    float4 p_hom = transformPoint4x4(p_orig, projmatrix);
+    float p_w = 1.0f / (p_hom.w + 0.0000001f);
+    float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
+
+    // If 3D covariance matrix is precomputed, use it, otherwise compute
+    // from scaling and rotation parameters.
+    const float* cov3D;
+    if (cov3D_precomp != nullptr) {
+        cov3D = cov3D_precomp + point_idx * 6;
+    } else {
+        computeCov3D(scales[point_idx], scale_modifier, rotations[point_idx], cov3Ds + idx * 6);
+        cov3D = cov3Ds + idx * 6;
+    }
+
+    // Compute 2D screen-space covariance matrix
+    const float focal_x = W / (2.0f * tan_fovx[viewpoint_idx]);
+    const float focal_y = H / (2.0f * tan_fovy[viewpoint_idx]);
+    float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx[viewpoint_idx], tan_fovy[viewpoint_idx], cov3D, viewmatrix);
+
+    // Invert covariance (EWA algorithm)
+    float det = (cov.x * cov.z - cov.y * cov.y);
+    if (det == 0.0f) return;
+    float det_inv = 1.f / det;
+    float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv };
+
+    // Compute extent in screen space (by finding eigenvalues of
+    // 2D covariance matrix). Use extent to compute a bounding rectangle
+    // of screen-space tiles that this Gaussian overlaps with. Quit if
+    // rectangle covers 0 tiles.
+    float mid = 0.5f * (cov.x + cov.z);
+    float lambda1 = mid + sqrt(max(0.1f, mid * mid - det));
+    float lambda2 = mid - sqrt(max(0.1f, mid * mid - det));
+    float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2)));
+    float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) };
+    uint2 rect_min, rect_max;
+    getRect(point_image, my_radius, rect_min, rect_max, grid);
+    if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0) return;
+
+    // If colors have been precomputed, use them, otherwise convert
+    // spherical harmonics coefficients to RGB color.
+    if (colors_precomp == nullptr) {
+        glm::vec3 result = computeColorFromSH(point_idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3);
+        rgb[idx * C + 0] = result.x;
+        rgb[idx * C + 1] = result.y;
+        rgb[idx * C + 2] = result.z;
+    }
+
+    // Store some useful helper data for the next steps.
+    depths[idx] = p_view.z;
+    radii[idx] = my_radius;
+    points_xy_image[idx] = point_image;
+
+    // Inverse 2D covariance and opacity neatly pack into one float4
+    conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[point_idx] };
+    tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x);
+}
+
+void FORWARD::preprocess(int P, int D, int M,
+	const float* means3D,
+	const glm::vec3* scales,
+	const float scale_modifier,
+	const glm::vec4* rotations,
+	const float* opacities,
+	const float* shs,
+	bool* clamped,
+	const float* cov3D_precomp,
+	const float* colors_precomp,
+	const float* viewmatrix,
+	const float* projmatrix,
+	const glm::vec3* cam_pos,
+	const int W, int H,
+	const float focal_x, float focal_y,
+	const float tan_fovx, float tan_fovy,
+	int* radii,
+	float2* means2D,
+	float* depths,
+	float* cov3Ds,
+	float* rgb,
+	float4* conic_opacity,
+	const dim3 grid,
+	uint32_t* tiles_touched,
+	bool prefiltered,
+    int num_viewpoints)
+{
+    dim3 block(BLOCK_X, BLOCK_Y, 1);
+    dim3 grid((P + BLOCK_X - 1) / BLOCK_X, 1, num_viewpoints);
+    preprocessCUDABatched<NUM_CHANNELS><<<grid, block>>>(
+		P, D, M,
+		means3D,
+		scales,
+		scale_modifier,
+		rotations,
+		opacities,
+		shs,
+		clamped,
+		cov3D_precomp,
+		colors_precomp,
+		viewmatrix, 
+		projmatrix,
+		cam_pos,
+		W, H,
+		tan_fovx, tan_fovy,
+		focal_x, focal_y,
+		radii,
+		means2D,
+		depths,
+		cov3Ds,
+		rgb,
+		conic_opacity,
+		grid,
+		tiles_touched,
+		prefiltered,
+		);
 }
\ No newline at end of file
diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu
index 1215c25..09a82b6 100644
--- a/cuda_rasterizer/rasterizer_impl.cu
+++ b/cuda_rasterizer/rasterizer_impl.cu
@@ -434,20 +434,21 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches(
 	float* rgb,
 	bool* clamped,//the above are all per-Gaussian intemediate results.
 	const int P, int D, int M,
-	const std::vector<int>& width, std::vector<int>& height,
+	const int width, int height,
 	const float* means3D,
 	const float* scales,
 	const float* rotations,
 	const float* shs,
 	const float* opacities,//3dgs parameters
-	const std::vector<float>& scale_modifier,
-	const std::vector<torch::Tensor>& viewmatrix,
-	const std::vector<torch::Tensor>& projmatrix,
-	const std::vector<float>& cam_pos,
-	const std::vector<float>& tan_fovx, std::vector<float>& tan_fovy,
-	const std::vector<bool>& prefiltered,
-	std::vector<bool>& debug,//raster_settings
-	const std::vector<pybind11::dict> &args)
+	const float scale_modifier,
+	const float* viewmatrix,
+	const float* projmatrix,
+	const float* cam_pos,
+	const float* tan_fovx, float* tan_fovy,
+	const bool prefiltered,
+    const int num_viewpoints,
+	bool debug,//raster_settings
+	const pybind11::dict &args)
 {
 	auto [global_rank, world_size, iteration, log_interval, device, zhx_debug, zhx_time, mode, dist_division_mode, log_folder] = prepareArgs(args);
 	char* log_tmp = new char[500];
@@ -462,14 +463,9 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches(
 	MyTimerOnGPU timer;
 	// const float focal_y = height / (2.0f * tan_fovy);
 	// const float focal_x = width / (2.0f * tan_fovx);
-	const int num_viewpoints=viewmatrix.size();
 
 	//CONVERT ALL VECTORS TO FLOATSSSSSS PRAPTIIIIIII
 
-	dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, num_viewpoints);
-	dim3 block(BLOCK_X, BLOCK_Y, num_viewpoints);
-	int tile_num = tile_grid.x * tile_grid.y*tile_grid.z;
-
 	// allocate temporary buffer for tiles_touched.
 	// In sep_rendering==True case, we will compute tiles_touched in the renderForward. 
 	// TODO: remove it later by modifying FORWARD::preprocess when we deprecate sep_rendering==False case
diff --git a/rasterization_tests.py b/rasterization_tests.py
index 7c8a93b..10da643 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -282,11 +282,11 @@ def test_batched_gaussian_rasterizer_batch_processing():
     mode = "train"
 
     # Set up rasterization configuration for the batch
-    batched_tanfovx = [math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras]
-    batched_tanfovy = [math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras]
-    batched_viewmatrix = [camera.world_view_transform for camera in batched_viewpoint_cameras]
-    batched_projmatrix = [camera.full_proj_transform for camera in batched_viewpoint_cameras]
-    batched_campos = [camera.camera_center for camera in batched_viewpoint_cameras]
+    batched_tanfovx = torch.stack([math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras])
+    batched_tanfovy = torch.stack([math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras])
+    batched_viewmatrix = torch.stack([camera.world_view_transform for camera in batched_viewpoint_cameras])
+    batched_projmatrix = torch.stack([camera.full_proj_transform for camera in batched_viewpoint_cameras])
+    batched_campos = torch.stack([camera.camera_center for camera in batched_viewpoint_cameras])
 
     batched_raster_settings = GaussianRasterizationSettings(
         image_height=int(batched_viewpoint_cameras[0].image_height),
diff --git a/rasterize_points.cu b/rasterize_points.cu
index 9a3fb7b..e8eb8a7 100644
--- a/rasterize_points.cu
+++ b/rasterize_points.cu
@@ -149,52 +149,47 @@ PreprocessGaussiansCUDABatches(
 	const torch::Tensor& rotations,
 	const torch::Tensor& sh,
     const torch::Tensor& opacity,//3dgs' parametes.
-	const std::vector<float>& scale_modifier,
-	const std::vector<torch::Tensor>& viewmatrix,
-	const std::vector<torch::Tensor>& projmatrix,
-	const std::vector<float>& tan_fovx, 
-	const std::vector<float>& tan_fovy,
-    const std::vector<int>& image_height,
-    const std::vector<int>& image_width,
-	const std::vector<int>& degree,
-	const std::vector<torch::Tensor>& campos,
-	const std::vector<bool>& prefiltered,//raster_settings
-	const std::vector<bool>& debug,
-	const std::vector<pybind11::dict> &args) {
+	const float scale_modifier,
+	const torch::Tensor& viewmatrix,
+	const torch::Tensor& projmatrix,
+	const torch::Tensor& tan_fovx, 
+	const torch::Tensor& tan_fovy,
+    const int image_height,
+    const int image_width,
+	const int degree,
+	const torch::Tensor& campos,
+    const bool prefiltered,//raster_settings
+	const bool debug,
+	const pybind11::dict &args) {
 
 	if (means3D.ndimension() != 2 || means3D.size(1) != 3) {
 		AT_ERROR("means3D must have dimensions (num_points, 3)");
 	}
 
 	const int P = means3D.size(0);
-	// const int H = image_height;
-	// const int W = image_width;
+    const int num_viewpoints = viewmatrix.size(0);
 
 	// of shape (P, 2). means2D is (P, 2) in cuda. It will be converted to (P, 3) when is sent back to python to meet torch graph's requirement.
-	torch::Tensor means2D = torch::full({P, 2}, 0.0, means3D.options());//TODO: what about require_grads?
+	torch::Tensor means2D = torch::full({num_viewpoints, P, 2}, 0.0, means3D.options());//TODO: what about require_grads?
 	// of shape (P)
-	torch::Tensor depths = torch::full({P}, 0.0, means3D.options());
+	torch::Tensor depths = torch::full({num_viewpoints, P}, 0.0, means3D.options());
 	// of shape (P)
-	torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32));
+	torch::Tensor radii = torch::full({num_viewpoints, P}, 0, means3D.options().dtype(torch::kInt32));
 	// of shape (P, 6)
-	torch::Tensor cov3D = torch::full({P, 6}, 0.0, means3D.options());
+	torch::Tensor cov3D = torch::full({num_viewpoints, P, 6}, 0.0, means3D.options());
 	// of shape (P, 4)
-	torch::Tensor conic_opacity = torch::full({P, 4}, 0.0, means3D.options());
+	torch::Tensor conic_opacity = torch::full({num_viewpoints, P, 4}, 0.0, means3D.options());
 	// of shape (P, 3)
-	torch::Tensor rgb = torch::full({P, 3}, 0.0, means3D.options());
+	torch::Tensor rgb = torch::full({num_viewpoints, P, 3}, 0.0, means3D.options());
 	// of shape (P)
-	torch::Tensor clamped = torch::full({P, 3}, false, means3D.options().dtype(at::kBool));
+	torch::Tensor clamped = torch::full({num_viewpoints, P, 3}, false, means3D.options().dtype(at::kBool));
 	//TODO: compare to original GeometryState implementation, this one does not explicitly do gpu memory alignment. 
 	//That may lead to problems. However, pytorch does implicit memory alignment.
 
 	int rendered = 0;//TODO: I could compute rendered here by summing up geomState.tiles_touched. 
 	if(P != 0)
 	{
-		int M = 0;
-		if(sh.size(0) != 0)
-		{
-			M = sh.size(1);
-		}
+        int M = sh.size(0) != 0 ? sh.size(1) : 0;
 
 		rendered = CudaRasterizer::Rasterizer::preprocessForwardBatches(
 			reinterpret_cast<float2*>(means2D.contiguous().data<float>()),//TODO: check whether it supports float2?
@@ -206,18 +201,19 @@ PreprocessGaussiansCUDABatches(
 			clamped.contiguous().data<bool>(),
 			P, degree, M,
 			image_width, image_height,
-			means3D,
-			scales,
-			rotations,
-			sh,
-			opacity, 
+			means3D.contiguous().data<float>(),
+			scales.contiguous().data_ptr<float>(),
+			rotations.contiguous().data_ptr<float>(),
+			sh.contiguous().data_ptr<float>(),
+			opacity.contiguous().data<float>(), 
 			scale_modifier,
-			viewmatrix, 
-			projmatrix,
-			campos,
-			tan_fovx,
-			tan_fovy,
+			viewmatrix.contiguous().data<float>(), 
+			projmatrix.contiguous().data<float>(),
+			campos.contiguous().data<float>(),
+			tan_fovx.contiguous().data<float>(),
+			tan_fovy.contiguous().data<float>(),
 			prefiltered,
+            num_viewpoints,
 			debug,
 			args);
 	}

From 7f4935d86ca48772e57e1356b9ef57a5a5c5e0b0 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Tue, 23 Apr 2024 00:21:58 -0400
Subject: [PATCH 09/53] added headers and changed kernel structure to 1d block

---
 cuda_rasterizer/forward.cu         | 17 ++++++---------
 cuda_rasterizer/forward.h          | 33 ++++++++++++++++++++++++++++--
 cuda_rasterizer/rasterizer_impl.cu | 10 ++++-----
 rasterize_points.h                 | 20 ++++++++++++++++++
 4 files changed, 62 insertions(+), 18 deletions(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 102c6d9..156985b 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -471,7 +471,7 @@ void FORWARD::preprocess(int P, int D, int M,
 	uint32_t* tiles_touched,
 	bool prefiltered)
 {
-	preprocessCUDA<NUM_CHANNELS> << <(P + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >> > (
+	preprocessCUDA<NUM_CHANNELS> << <cdiv(P, ONE_DIM_BLOCK_SIZE), ONE_DIM_BLOCK_SIZE >> > (
 		P, D, M,
 		means3D,
 		scales,
@@ -508,14 +508,13 @@ __global__ void preprocessCUDABatched(
     const glm::vec4* rotations, const float* opacities, const float* shs,
     bool* clamped, const float* cov3D_precomp, const float* colors_precomp,
     const float* viewmatrix_arr, const float* projmatrix_arr, const glm::vec3* cam_pos,
-    const int W, int H, const float* focal_x, const float* focal_y,
-    const float* tan_fovx, const float* tan_fovy,
+    const int W, int H, const float* tan_fovx, const float* tan_fovy,
     int* radii, float2* points_xy_image, float* depths, float* cov3Ds,
     float* rgb, float4* conic_opacity, const dim3 grid, uint32_t* tiles_touched,
     bool prefiltered, int num_viewpoints)
 {
     auto point_idx = cg::this_grid().thread_rank();
-    auto viewpoint_idx = blockIdx.z;
+    auto viewpoint_idx = blockIdx.y;
 
     if (viewpoint_idx >= num_viewpoints || point_idx >= P) return;
 
@@ -591,7 +590,7 @@ __global__ void preprocessCUDABatched(
     tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x);
 }
 
-void FORWARD::preprocess(int P, int D, int M,
+void FORWARD::preprocess_batch(int P, int D, int M,
 	const float* means3D,
 	const glm::vec3* scales,
 	const float scale_modifier,
@@ -605,8 +604,7 @@ void FORWARD::preprocess(int P, int D, int M,
 	const float* projmatrix,
 	const glm::vec3* cam_pos,
 	const int W, int H,
-	const float focal_x, float focal_y,
-	const float tan_fovx, float tan_fovy,
+	const float* tan_fovx, float* tan_fovy,
 	int* radii,
 	float2* means2D,
 	float* depths,
@@ -618,9 +616,7 @@ void FORWARD::preprocess(int P, int D, int M,
 	bool prefiltered,
     int num_viewpoints)
 {
-    dim3 block(BLOCK_X, BLOCK_Y, 1);
-    dim3 grid((P + BLOCK_X - 1) / BLOCK_X, 1, num_viewpoints);
-    preprocessCUDABatched<NUM_CHANNELS><<<grid, block>>>(
+    preprocessCUDABatched<NUM_CHANNELS><<<grid, ONE_DIM_BLOCK_SIZE>>>(
 		P, D, M,
 		means3D,
 		scales,
@@ -636,7 +632,6 @@ void FORWARD::preprocess(int P, int D, int M,
 		cam_pos,
 		W, H,
 		tan_fovx, tan_fovy,
-		focal_x, focal_y,
 		radii,
 		means2D,
 		depths,
diff --git a/cuda_rasterizer/forward.h b/cuda_rasterizer/forward.h
index 86e5cb9..0f036d9 100644
--- a/cuda_rasterizer/forward.h
+++ b/cuda_rasterizer/forward.h
@@ -45,7 +45,35 @@ namespace FORWARD
 		float4* conic_opacity,
 		const dim3 grid,
 		uint32_t* tiles_touched,
-		bool prefiltered);
+		bool prefiltered
+    );
+
+    void preprocess_batch(int P, int D, int M,
+        const float* means3D,
+        const glm::vec3* scales,
+        const float scale_modifier,
+        const glm::vec4* rotations,
+        const float* opacities,
+        const float* shs,
+        bool* clamped,
+        const float* cov3D_precomp,
+        const float* colors_precomp,
+        const float* viewmatrix,
+        const float* projmatrix,
+        const glm::vec3* cam_pos,
+        const int W, int H,
+        const float* tan_fovx, float* tan_fovy,
+        int* radii,
+        float2* means2D,
+        float* depths,
+        float* cov3Ds,
+        float* rgb,
+        float4* conic_opacity,
+        const dim3 grid,
+        uint32_t* tiles_touched,
+        bool prefiltered,
+        int num_viewpoints
+    );
 
 	// Main rasterization method.
 	void render(
@@ -61,7 +89,8 @@ namespace FORWARD
 		uint32_t* n_contrib2loss,
         const int* compute_locally_1D_2D_map,
 		const float* bg_color,
-		float* out_color);
+		float* out_color
+    );
 }
 
 
diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu
index 09a82b6..4aca70e 100644
--- a/cuda_rasterizer/rasterizer_impl.cu
+++ b/cuda_rasterizer/rasterizer_impl.cu
@@ -461,11 +461,11 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches(
 	}
 
 	MyTimerOnGPU timer;
-	// const float focal_y = height / (2.0f * tan_fovy);
-	// const float focal_x = width / (2.0f * tan_fovx);
 
 	//CONVERT ALL VECTORS TO FLOATSSSSSS PRAPTIIIIIII
 
+    dim3 tile_grid(cdiv(P, ONE_DIM_BLOCK_SIZE), num_viewpoints);
+
 	// allocate temporary buffer for tiles_touched.
 	// In sep_rendering==True case, we will compute tiles_touched in the renderForward. 
 	// TODO: remove it later by modifying FORWARD::preprocess when we deprecate sep_rendering==False case
@@ -475,7 +475,7 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches(
 
 	timer.start("10 preprocess");
 	// Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB)
-	CHECK_CUDA(FORWARD::preprocess(
+	CHECK_CUDA(FORWARD::preprocess_batch(
 		P, D, M,
 		means3D,
 		(glm::vec3*)scales,
@@ -486,10 +486,10 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches(
 		clamped,
 		nullptr,//cov3D_precomp,
 		nullptr,//colors_precomp,TODO: this is correct?
-		viewmatrix, projmatrix,
+		viewmatrix, 
+        projmatrix,
 		(glm::vec3*)cam_pos,
 		width, height,
-		focal_x, focal_y,
 		tan_fovx, tan_fovy,
 		radii,
 		means2D,
diff --git a/rasterize_points.h b/rasterize_points.h
index 86798ec..3700126 100644
--- a/rasterize_points.h
+++ b/rasterize_points.h
@@ -49,6 +49,26 @@ PreprocessGaussiansCUDA(
 	const bool debug,
 	const pybind11::dict &args);
 
+std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
+PreprocessGaussiansCUDABatches(
+	const torch::Tensor& means3D,
+	const torch::Tensor& scales,
+	const torch::Tensor& rotations,
+	const torch::Tensor& sh,
+    const torch::Tensor& opacity,//3dgs' parametes.
+	const float scale_modifier,
+	const torch::Tensor& viewmatrix,
+	const torch::Tensor& projmatrix,
+	const torch::Tensor& tan_fovx, 
+	const torch::Tensor& tan_fovy,
+    const int image_height,
+    const int image_width,
+	const int degree,
+	const torch::Tensor& campos,
+    const bool prefiltered,//raster_settings
+	const bool debug,
+	const pybind11::dict &args);
+
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
   PreprocessGaussiansBackwardCUDA(
 	const torch::Tensor& radii,

From 5f05af5b0f5d689cf219afd2a5a868c7d360fc2a Mon Sep 17 00:00:00 2001
From: Prapti Devansh Trivedi <pt2295@gv009.hpc.nyu.edu>
Date: Tue, 23 Apr 2024 15:14:05 -0400
Subject: [PATCH 10/53] solved syntax errors

---
 cuda_rasterizer/forward.cu         |  5 +++--
 cuda_rasterizer/forward.h          |  2 +-
 cuda_rasterizer/rasterizer.h       | 25 +++++++++++++++++++++++++
 cuda_rasterizer/rasterizer_impl.cu |  3 ++-
 rasterization_tests.py             |  6 +-----
 5 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 156985b..1e3e119 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -511,7 +511,7 @@ __global__ void preprocessCUDABatched(
     const int W, int H, const float* tan_fovx, const float* tan_fovy,
     int* radii, float2* points_xy_image, float* depths, float* cov3Ds,
     float* rgb, float4* conic_opacity, const dim3 grid, uint32_t* tiles_touched,
-    bool prefiltered, int num_viewpoints)
+    bool prefiltered, const int num_viewpoints)
 {
     auto point_idx = cg::this_grid().thread_rank();
     auto viewpoint_idx = blockIdx.y;
@@ -614,7 +614,7 @@ void FORWARD::preprocess_batch(int P, int D, int M,
 	const dim3 grid,
 	uint32_t* tiles_touched,
 	bool prefiltered,
-    int num_viewpoints)
+    const int num_viewpoints)
 {
     preprocessCUDABatched<NUM_CHANNELS><<<grid, ONE_DIM_BLOCK_SIZE>>>(
 		P, D, M,
@@ -641,5 +641,6 @@ void FORWARD::preprocess_batch(int P, int D, int M,
 		grid,
 		tiles_touched,
 		prefiltered,
+		num_viewpoints
 		);
 }
\ No newline at end of file
diff --git a/cuda_rasterizer/forward.h b/cuda_rasterizer/forward.h
index 0f036d9..6c4a1da 100644
--- a/cuda_rasterizer/forward.h
+++ b/cuda_rasterizer/forward.h
@@ -72,7 +72,7 @@ namespace FORWARD
         const dim3 grid,
         uint32_t* tiles_touched,
         bool prefiltered,
-        int num_viewpoints
+        const int num_viewpoints
     );
 
 	// Main rasterization method.
diff --git a/cuda_rasterizer/rasterizer.h b/cuda_rasterizer/rasterizer.h
index ddc989c..4ad1bf8 100644
--- a/cuda_rasterizer/rasterizer.h
+++ b/cuda_rasterizer/rasterizer.h
@@ -65,6 +65,31 @@ namespace CudaRasterizer
 			bool debug,//raster_settings
 			const pybind11::dict &args);
 
+		static int preprocessForwardBatches(
+			float2* means2D,
+			float* depths,
+			int* radii,
+			float* cov3D,
+			float4* conic_opacity,
+			float* rgb,
+			bool* clamped,//the above are all per-Gaussian intemediate results.
+			const int P, int D, int M,
+			const int width, int height,
+			const float* means3D,
+			const float* scales,
+			const float* rotations,
+			const float* shs,
+			const float* opacities,//3dgs parameters
+			const float scale_modifier,
+			const float* viewmatrix,
+			const float* projmatrix,
+			const float* cam_pos,
+			const float* tan_fovx, float* tan_fovy,
+			const bool prefiltered,
+			const int num_viewpoints,
+			bool debug,//raster_settings
+			const pybind11::dict &args);
+
 		static void preprocessBackward(
 			const int* radii,
 			const float* cov3D,
diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu
index 4aca70e..2f43492 100644
--- a/cuda_rasterizer/rasterizer_impl.cu
+++ b/cuda_rasterizer/rasterizer_impl.cu
@@ -499,7 +499,8 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches(
 		conic_opacity,
 		tile_grid,
 		tiles_touched_temp_buffer,
-		prefiltered
+		prefiltered,
+		num_viewpoints
 	), debug)
 	timer.stop("10 preprocess");
 
diff --git a/rasterization_tests.py b/rasterization_tests.py
index 10da643..cb65cdb 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -3,11 +3,7 @@
 
 import torch
 
-from diff_gaussian_rasterization import (
-    GaussianRasterizationSettings,
-    GaussianRasterizer,
-    GaussianRasterizerBatches,
-)
+import diff_gaussian_rasterization import (GaussianRasterizationSettings,GaussianRasterizer,GaussianRasterizerBatches)
 
 
 def get_cuda_args(strategy, mode="train"):

From 543d4b85d47491c1b91acd8ef7fad8eb8de71fb1 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <snm6477@b-3-28.c.hpc-slurm-9c75.internal>
Date: Tue, 23 Apr 2024 22:20:28 -0400
Subject: [PATCH 11/53] fixed import syntax in test

---
 rasterization_tests.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index cb65cdb..88277ef 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -3,7 +3,7 @@
 
 import torch
 
-import diff_gaussian_rasterization import (GaussianRasterizationSettings,GaussianRasterizer,GaussianRasterizerBatches)
+from diff_gaussian_rasterization import (GaussianRasterizationSettings,GaussianRasterizer,GaussianRasterizerBatches)
 
 
 def get_cuda_args(strategy, mode="train"):
@@ -333,4 +333,4 @@ def test_batched_gaussian_rasterizer_batch_processing():
 if __name__ == "__main__":
     test_gaussian_rasterizer_time()
     test_batched_gaussian_rasterizer_batch_processing()
-    
\ No newline at end of file
+    

From 8ca5a9fb07215f0e52549dcaa6a115a845f23fa4 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Tue, 23 Apr 2024 23:00:23 -0400
Subject: [PATCH 12/53] formatting changes

---
 rasterization_tests.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 88277ef..90f18b9 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -3,7 +3,11 @@
 
 import torch
 
-from diff_gaussian_rasterization import (GaussianRasterizationSettings,GaussianRasterizer,GaussianRasterizerBatches)
+from diff_gaussian_rasterization import (
+    GaussianRasterizationSettings,
+    GaussianRasterizer,
+    GaussianRasterizerBatches,
+)
 
 
 def get_cuda_args(strategy, mode="train"):

From 0dbe8fd4f4b116146c5e8e3128047b54fea2d1be Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Tue, 23 Apr 2024 23:14:34 -0400
Subject: [PATCH 13/53] Build batched tan-FoV values with torch.tensor
 instead of torch.stack in
 test_batched_gaussian_rasterizer_batch_processing function

---
 rasterization_tests.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 90f18b9..0200210 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -234,8 +234,8 @@ def test_batched_gaussian_rasterizer():
             cuda_args=cuda_args
         )
 
-        if mode == "train":
-            means2D.retain_grad()
+        # if mode == "train":
+        #     means2D.retain_grad()
 
         batched_means2D.append(means2D)
         screenspace_params = [means2D, rgb, conic_opacity, radii, depths]
@@ -282,8 +282,8 @@ def test_batched_gaussian_rasterizer_batch_processing():
     mode = "train"
 
     # Set up rasterization configuration for the batch
-    batched_tanfovx = torch.stack([math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras])
-    batched_tanfovy = torch.stack([math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras])
+    batched_tanfovx = torch.tensor([math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras])
+    batched_tanfovy = torch.tensor([math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras])
     batched_viewmatrix = torch.stack([camera.world_view_transform for camera in batched_viewpoint_cameras])
     batched_projmatrix = torch.stack([camera.full_proj_transform for camera in batched_viewpoint_cameras])
     batched_campos = torch.stack([camera.camera_center for camera in batched_viewpoint_cameras])
@@ -335,6 +335,6 @@ def test_batched_gaussian_rasterizer_batch_processing():
 
 
 if __name__ == "__main__":
-    test_gaussian_rasterizer_time()
+    test_batched_gaussian_rasterizer()
     test_batched_gaussian_rasterizer_batch_processing()
     

From 193fa821e57fdba31b7f309f00c4fdbe5f7d1b3b Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Tue, 23 Apr 2024 23:20:44 -0400
Subject: [PATCH 14/53] Pass cuda_args via the batched_cuda_args keyword in
 test_batched_gaussian_rasterizer_batch_processing

---
 rasterization_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 0200210..7ba64a6 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -316,7 +316,7 @@ def test_batched_gaussian_rasterizer_batch_processing():
         rotations=rotations,
         shs=shs,
         opacities=opacity,
-        cuda_args=cuda_args
+        batched_cuda_args=cuda_args
     )
 
     if mode == "train":

From ac43fc44625ea62a1e6fed753d08f1ba7db52caa Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Tue, 23 Apr 2024 23:26:12 -0400
Subject: [PATCH 15/53] Refactor preprocess_gaussians function to handle
 batched and non-batched inputs in __init__.py

---
 diff_gaussian_rasterization/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py
index ca95f73..e73be82 100644
--- a/diff_gaussian_rasterization/__init__.py
+++ b/diff_gaussian_rasterization/__init__.py
@@ -9,7 +9,6 @@
 # For inquiries contact  george.drettakis@inria.fr
 #
 
-import time
 from typing import NamedTuple
 
 import torch
@@ -81,7 +80,7 @@ def forward(
         )
 
         # TODO: update this. 
-        batch_size = len(raster_settings.tanfovx) if isinstance(raster_settings.tanfovx, list) else 1
+        batch_size = raster_settings.tanfovx.shape[0] if torch.is_tensor(raster_settings.tanfovx) else 1
         if batch_size == 1:
             num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args)
         else:

From 4115266c5a2f4d88bdddfa632945a8bc68d8cf1c Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Tue, 23 Apr 2024 23:26:19 -0400
Subject: [PATCH 16/53] Refactor test_batched_gaussian_rasterizer and
 test_batched_gaussian_rasterizer_batch_processing functions in
 rasterization_tests.py

---
 rasterization_tests.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 7ba64a6..93248df 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -199,6 +199,8 @@ def test_batched_gaussian_rasterizer():
     batched_means2D = []
     batched_radii = []
 
+    start_time = time.time()
+    
     for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)):
         ########## [START] Prepare CUDA Rasterization Settings ##########
         cuda_args = get_cuda_args(strategy, mode)
@@ -243,6 +245,9 @@ def test_batched_gaussian_rasterizer():
         batched_screenspace_params.append(screenspace_params)
         batched_radii.append(radii)
 
+    end_time = time.time()
+    preprocess_time = end_time - start_time
+    print(f"Time taken by test_batched_gaussian_rasterizer: {preprocess_time:.4f} seconds")
     # Perform further operations with the batched results
     # Test results and performance
     
@@ -256,6 +261,7 @@ def test_batched_gaussian_rasterizer_batch_processing():
     shs = torch.randn(num_gaussians, 9).cuda()
     opacity = torch.randn(num_gaussians, 1).cuda()
 
+    start_time = time.time()
     # Set up the viewpoint cameras
     batched_viewpoint_cameras = []
     for _ in range(num_batches):
@@ -318,6 +324,9 @@ def test_batched_gaussian_rasterizer_batch_processing():
         opacities=opacity,
         batched_cuda_args=cuda_args
     )
+    end_time = time.time()
+    preprocess_time = end_time - start_time
+    print(f"Time taken by test_batched_gaussian_rasterizer_batch_processing: {preprocess_time:.4f} seconds")
 
     if mode == "train":
         batched_means2D.retain_grad()

From 162e7d02cb38e701d666899cba3d9af4fd829065 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Tue, 23 Apr 2024 23:30:08 -0400
Subject: [PATCH 17/53] Refactor test_batched_gaussian_rasterizer and
 test_batched_gaussian_rasterizer_batch_processing functions in
 rasterization_tests.py

---
 rasterization_tests.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 93248df..93c0cd5 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -111,9 +111,7 @@ def test_improved_gaussian_rasterizer():
     pipe.debug = False
     mode = "train"
 
-    batched_rasterizers = []
     batched_cuda_args = []
-    batched_screenspace_params = []
     batched_means2D = []
     batched_radii = []
     raster_settings_list=[]
@@ -236,6 +234,7 @@ def test_batched_gaussian_rasterizer():
             cuda_args=cuda_args
         )
 
+        # TODO: make the below work
         # if mode == "train":
         #     means2D.retain_grad()
 
@@ -328,17 +327,17 @@ def test_batched_gaussian_rasterizer_batch_processing():
     preprocess_time = end_time - start_time
     print(f"Time taken by test_batched_gaussian_rasterizer_batch_processing: {preprocess_time:.4f} seconds")
 
-    if mode == "train":
-        batched_means2D.retain_grad()
+    # TODO: make the below work
+    # if mode == "train":
+    #     batched_means2D.retain_grad()
 
-    batched_screenspace_params = [batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths]
 
     # Perform assertions on the preprocessed data
-    assert batched_means2D.shape == (num_gaussians, num_batches, 2)
-    assert batched_rgb.shape == (num_gaussians, num_batches, 3)
-    assert batched_conic_opacity.shape == (num_gaussians, num_batches, 1)
-    assert batched_radii.shape == (num_gaussians, num_batches)
-    assert batched_depths.shape == (num_gaussians, num_batches)
+    assert batched_means2D.shape == (num_batches, num_gaussians, 2)
+    assert batched_rgb.shape == (num_batches, num_gaussians, 3)
+    assert batched_conic_opacity.shape == (num_batches, num_gaussians, 1)
+    assert batched_radii.shape == (num_batches, num_gaussians)
+    assert batched_depths.shape == (num_batches, num_gaussians)
 
 
 

From fdf3bf5b4429744d8ef20fd380343a9b64d09c8b Mon Sep 17 00:00:00 2001
From: prapti19 <prapti.trivedi19@gmail.com>
Date: Wed, 24 Apr 2024 23:19:25 -0400
Subject: [PATCH 18/53] add parity test

---
 diff_gaussian_rasterization/__init__.py |   4 +-
 rasterization_tests.py                  | 130 +++++-------------------
 2 files changed, 31 insertions(+), 103 deletions(-)

diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py
index e73be82..633adba 100644
--- a/diff_gaussian_rasterization/__init__.py
+++ b/diff_gaussian_rasterization/__init__.py
@@ -81,9 +81,11 @@ def forward(
 
         # TODO: update this. 
         batch_size = raster_settings.tanfovx.shape[0] if torch.is_tensor(raster_settings.tanfovx) else 1
-        if batch_size == 1:
+        if  not torch.is_tensor(raster_settings.tanfovx):
+            print("normal batch calling")
             num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args)
         else:
+            print("improved batch calling")
             num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians_batched(*args)
 
         # Keep relevant tensors for backward
diff --git a/rasterization_tests.py b/rasterization_tests.py
index 93c0cd5..84a5391 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -8,7 +8,13 @@
     GaussianRasterizer,
     GaussianRasterizerBatches,
 )
-
+num_gaussians = 10000
+num_batches=1
+means3D = torch.randn(num_gaussians, 3).cuda()
+scales = torch.randn(num_gaussians, 3).cuda()
+rotations = torch.randn(num_gaussians, 3, 3).cuda()
+shs = torch.randn(num_gaussians, 9).cuda()
+opacity = torch.randn(num_gaussians, 1).cuda()
 
 def get_cuda_args(strategy, mode="train"):
     cuda_args = {
@@ -74,98 +80,13 @@ def test_gaussian_rasterizer_time():
 
     preprocess_time = end_time - start_time
     print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds")
-
-def test_improved_gaussian_rasterizer():
-
-    # Set up the input data
-    num_gaussians = 10000
-    num_batches = 4
-    means3D = torch.randn(num_gaussians, 3).cuda()
-    scales = torch.randn(num_gaussians, 3).cuda()
-    rotations = torch.randn(num_gaussians, 3, 3).cuda()
-    shs = torch.randn(num_gaussians, 9).cuda()
-    opacity = torch.randn(num_gaussians, 1).cuda()
-
-    # Set up the viewpoint cameras
-    batched_viewpoint_cameras = []
-    for _ in range(num_batches):
-        viewpoint_camera = type('ViewpointCamera', (), {})
-        viewpoint_camera.FoVx = math.radians(60)
-        viewpoint_camera.FoVy = math.radians(60)
-        viewpoint_camera.image_height = 512
-        viewpoint_camera.image_width = 512
-        viewpoint_camera.world_view_transform = torch.eye(4).cuda()
-        viewpoint_camera.full_proj_transform = torch.eye(4).cuda()
-        viewpoint_camera.camera_center = torch.zeros(3).cuda()
-        batched_viewpoint_cameras.append(viewpoint_camera)
-
-    # Set up the strategies
-    batched_strategies = [None] * num_batches
-
-    # Set up other parameters
-    bg_color = torch.ones(3).cuda()
-    scaling_modifier = 1.0
-    pc = type('PC', (), {})
-    pc.active_sh_degree = 2
-    pipe = type('Pipe', (), {})
-    pipe.debug = False
-    mode = "train"
-
-    batched_cuda_args = []
-    batched_means2D = []
-    batched_radii = []
-    raster_settings_list=[]
-    for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)):
-        ########## [START] Prepare CUDA Rasterization Settings ##########
-        cuda_args = get_cuda_args(strategy, mode)
-        batched_cuda_args.append(cuda_args)
-
-        # Set up rasterization configuration
-        tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
-        tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)
-
-        raster_settings_list.append(GaussianRasterizationSettings(
-            image_height=int(viewpoint_camera.image_height),
-            image_width=int(viewpoint_camera.image_width),
-            tanfovx=tanfovx,
-            tanfovy=tanfovy,
-            bg=bg_color,
-            scale_modifier=scaling_modifier,
-            viewmatrix=viewpoint_camera.world_view_transform,
-            projmatrix=viewpoint_camera.full_proj_transform,
-            sh_degree=pc.active_sh_degree,
-            campos=viewpoint_camera.camera_center,
-            prefiltered=False,
-            debug=pipe.debug
-        ))
-
-
-    rasterizer=GaussianRasterizerBatches(raster_settings=raster_settings_list)
-    start_time = time.time()
-    batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians(
-            means3D=means3D,
-            scales=scales,
-            rotations=rotations,
-            shs=shs,
-            opacities=opacity,
-            cuda_args=batched_cuda_args
-        )
-    end_time = time.time()
-
-    preprocess_time = end_time - start_time
-    print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds")
-
+    
 
 def test_batched_gaussian_rasterizer():
     # Set up the input data
     num_gaussians = 10000
-    num_batches = 4
-    means3D = torch.randn(num_gaussians, 3).cuda()
-    scales = torch.randn(num_gaussians, 3).cuda()
-    rotations = torch.randn(num_gaussians, 3, 3).cuda()
-    shs = torch.randn(num_gaussians, 9).cuda()
-    opacity = torch.randn(num_gaussians, 1).cuda()
-
+    
+   
     # Set up the viewpoint cameras
     batched_viewpoint_cameras = []
     for _ in range(num_batches):
@@ -244,22 +165,18 @@ def test_batched_gaussian_rasterizer():
         batched_screenspace_params.append(screenspace_params)
         batched_radii.append(radii)
 
+
     end_time = time.time()
     preprocess_time = end_time - start_time
     print(f"Time taken by test_batched_gaussian_rasterizer: {preprocess_time:.4f} seconds")
     # Perform further operations with the batched results
     # Test results and performance
+   
+    return torch.stack(batched_means2D,dim=0).clone().cpu()
     
-def test_batched_gaussian_rasterizer_batch_processing():
+    
+def test_batched_gaussian_rasterizer_batch_processing(orig_means2D):
     # Set up the input data
-    num_gaussians = 10000
-    num_batches = 4
-    means3D = torch.randn(num_gaussians, 3).cuda()
-    scales = torch.randn(num_gaussians, 3).cuda()
-    rotations = torch.randn(num_gaussians, 3, 3).cuda()
-    shs = torch.randn(num_gaussians, 9).cuda()
-    opacity = torch.randn(num_gaussians, 1).cuda()
-
     start_time = time.time()
     # Set up the viewpoint cameras
     batched_viewpoint_cameras = []
@@ -292,7 +209,7 @@ def test_batched_gaussian_rasterizer_batch_processing():
     batched_viewmatrix = torch.stack([camera.world_view_transform for camera in batched_viewpoint_cameras])
     batched_projmatrix = torch.stack([camera.full_proj_transform for camera in batched_viewpoint_cameras])
     batched_campos = torch.stack([camera.camera_center for camera in batched_viewpoint_cameras])
-
+    
     batched_raster_settings = GaussianRasterizationSettings(
         image_height=int(batched_viewpoint_cameras[0].image_height),
         image_width=int(batched_viewpoint_cameras[0].image_width),
@@ -333,16 +250,25 @@ def test_batched_gaussian_rasterizer_batch_processing():
 
 
     # Perform assertions on the preprocessed data
+    
     assert batched_means2D.shape == (num_batches, num_gaussians, 2)
     assert batched_rgb.shape == (num_batches, num_gaussians, 3)
-    assert batched_conic_opacity.shape == (num_batches, num_gaussians, 1)
+    assert batched_conic_opacity.shape == (num_batches, num_gaussians,4)
     assert batched_radii.shape == (num_batches, num_gaussians)
     assert batched_depths.shape == (num_batches, num_gaussians)
+    torch.cuda.empty_cache()
+    new_batched_means2D=batched_means2D.clone().cpu()
+   
+    equal_elements = torch.eq(orig_means2D, new_batched_means2D)
+    all_equal = torch.all(equal_elements)
+    print(all_equal)
+
+    assert(all_equal==True)#means2d
 
 
 
 
 if __name__ == "__main__":
-    test_batched_gaussian_rasterizer()
-    test_batched_gaussian_rasterizer_batch_processing()
+    means2D=test_batched_gaussian_rasterizer()
+    test_batched_gaussian_rasterizer_batch_processing(means2D)
     

From cace4fd0c79dff9e9bbdbd4adcda549399ce4dca Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 00:21:13 -0400
Subject: [PATCH 19/53] Refactor preprocess_gaussians function to handle
 batched and non-batched inputs in __init__.py

---
 diff_gaussian_rasterization/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py
index e73be82..8b041f9 100644
--- a/diff_gaussian_rasterization/__init__.py
+++ b/diff_gaussian_rasterization/__init__.py
@@ -80,8 +80,7 @@ def forward(
         )
 
         # TODO: update this. 
-        batch_size = raster_settings.tanfovx.shape[0] if torch.is_tensor(raster_settings.tanfovx) else 1
-        if batch_size == 1:
+        if not torch.is_tensor(raster_settings.tanfovx):
             num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args)
         else:
             num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians_batched(*args)

From eaf0d42a22f67aab5fc663bb84b66a8372c423d7 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 00:32:19 -0400
Subject: [PATCH 20/53] Refactor test_batched_gaussian_rasterizer and
 test_batched_gaussian_rasterizer_batch_processing functions in
 rasterization_tests.py

---
 rasterization_tests.py | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 84a5391..6fd9d4f 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -8,6 +8,7 @@
     GaussianRasterizer,
     GaussianRasterizerBatches,
 )
+
 num_gaussians = 10000
 num_batches=1
 means3D = torch.randn(num_gaussians, 3).cuda()
@@ -82,11 +83,7 @@ def test_gaussian_rasterizer_time():
     print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds")
     
 
-def test_batched_gaussian_rasterizer():
-    # Set up the input data
-    num_gaussians = 10000
-    
-   
+def test_batched_gaussian_rasterizer():       
     # Set up the viewpoint cameras
     batched_viewpoint_cameras = []
     for _ in range(num_batches):
@@ -172,10 +169,10 @@ def test_batched_gaussian_rasterizer():
     # Perform further operations with the batched results
     # Test results and performance
    
-    return torch.stack(batched_means2D,dim=0).clone().cpu()
+    return batched_means2D
     
     
-def test_batched_gaussian_rasterizer_batch_processing(orig_means2D):
+def test_batched_gaussian_rasterizer_batch_processing():
     # Set up the input data
     start_time = time.time()
     # Set up the viewpoint cameras
@@ -257,18 +254,18 @@ def test_batched_gaussian_rasterizer_batch_processing(orig_means2D):
     assert batched_radii.shape == (num_batches, num_gaussians)
     assert batched_depths.shape == (num_batches, num_gaussians)
     torch.cuda.empty_cache()
-    new_batched_means2D=batched_means2D.clone().cpu()
-   
-    equal_elements = torch.eq(orig_means2D, new_batched_means2D)
-    all_equal = torch.all(equal_elements)
-    print(all_equal)
-
-    assert(all_equal==True)#means2d
+    
+    return batched_means2D
 
 
+if __name__ == "__main__":
+    batched_means2D=test_batched_gaussian_rasterizer()
+    batched_means2D_batch_processed = test_batched_gaussian_rasterizer_batch_processing()
+    
+    equal_elements = torch.eq(batched_means2D, batched_means2D_batch_processed)
+    all_equal = torch.all(equal_elements)
+    print(all_equal)
 
+    assert(all_equal is True)#means2d
 
-if __name__ == "__main__":
-    means2D=test_batched_gaussian_rasterizer()
-    test_batched_gaussian_rasterizer_batch_processing(means2D)
     

From d9eb4e83913ff00e77fc24a9ac617f67c2284fda Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 00:38:42 -0400
Subject: [PATCH 21/53] Refactor test_batched_gaussian_rasterizer and
 test_batched_gaussian_rasterizer_batch_processing functions in
 rasterization_tests.py

---
 rasterization_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 6fd9d4f..16feb19 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -169,7 +169,7 @@ def test_batched_gaussian_rasterizer():
     # Perform further operations with the batched results
     # Test results and performance
    
-    return batched_means2D
+    return torch.stack(batched_means2D, dim=0)
     
     
 def test_batched_gaussian_rasterizer_batch_processing():

From c38cfa9ad9bf6ed18568c5131ce376f0c31eff00 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 01:13:37 -0400
Subject: [PATCH 22/53] add debug flag to extra_compile_args

---
 setup.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 8c4d011..3d16c4d 100644
--- a/setup.py
+++ b/setup.py
@@ -9,9 +9,11 @@
 # For inquiries contact  george.drettakis@inria.fr
 #
 
-from setuptools import setup
-from torch.utils.cpp_extension import CUDAExtension, BuildExtension
 import os
+
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
 os.path.dirname(os.path.abspath(__file__))
 
 setup(
@@ -30,7 +32,7 @@
             headers=[
                 "config.h"
             ],
-            extra_compile_args={"nvcc": ["-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/")]})
+            extra_compile_args={"nvcc": ["-g", "-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/")]})
         ],
     cmdclass={
         'build_ext': BuildExtension

From 24905aa81a50720bc263222d81d47c4799b8c7b9 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 18:36:26 -0400
Subject: [PATCH 23/53] Refactor tan_fovy parameter to be const in CUDA
 rasterizer files

---
 cuda_rasterizer/forward.cu         | 2 +-
 cuda_rasterizer/rasterizer_impl.cu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 1e3e119..16d33a1 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -604,7 +604,7 @@ void FORWARD::preprocess_batch(int P, int D, int M,
 	const float* projmatrix,
 	const glm::vec3* cam_pos,
 	const int W, int H,
-	const float* tan_fovx, float* tan_fovy,
+	const float* tan_fovx, const float* tan_fovy,
 	int* radii,
 	float2* means2D,
 	float* depths,
diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu
index 2f43492..c7fe67b 100644
--- a/cuda_rasterizer/rasterizer_impl.cu
+++ b/cuda_rasterizer/rasterizer_impl.cu
@@ -444,7 +444,7 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches(
 	const float* viewmatrix,
 	const float* projmatrix,
 	const float* cam_pos,
-	const float* tan_fovx, float* tan_fovy,
+	const float* tan_fovx, const float* tan_fovy,
 	const bool prefiltered,
     const int num_viewpoints,
 	bool debug,//raster_settings

From d376d418204a068daa4ca043e143060985f5e0a5 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 18:37:11 -0400
Subject: [PATCH 24/53] Refactor tan_fovy parameter to be const in CUDA
 rasterizer files

---
 cuda_rasterizer/forward.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_rasterizer/forward.h b/cuda_rasterizer/forward.h
index 6c4a1da..902fa00 100644
--- a/cuda_rasterizer/forward.h
+++ b/cuda_rasterizer/forward.h
@@ -62,7 +62,7 @@ namespace FORWARD
         const float* projmatrix,
         const glm::vec3* cam_pos,
         const int W, int H,
-        const float* tan_fovx, float* tan_fovy,
+        const float* tan_fovx, const float* tan_fovy,
         int* radii,
         float2* means2D,
         float* depths,

From 8c82fa74f885a647c1f65ba048511ca9ba4b2a21 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 18:46:07 -0400
Subject: [PATCH 25/53] Refactor tan_fovy parameter to be const in CUDA
 rasterizer files

---
 cuda_rasterizer/rasterizer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_rasterizer/rasterizer.h b/cuda_rasterizer/rasterizer.h
index 4ad1bf8..b7f93fd 100644
--- a/cuda_rasterizer/rasterizer.h
+++ b/cuda_rasterizer/rasterizer.h
@@ -84,7 +84,7 @@ namespace CudaRasterizer
 			const float* viewmatrix,
 			const float* projmatrix,
 			const float* cam_pos,
-			const float* tan_fovx, float* tan_fovy,
+			const float* tan_fovx, const float* tan_fovy,
 			const bool prefiltered,
 			const int num_viewpoints,
 			bool debug,//raster_settings

From 4cca1185ee555386bf4f374247770e65eb9086af Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 19:18:09 -0400
Subject: [PATCH 26/53] Move batched camera tensors to the GPU in
 rasterization_tests.py

---
 rasterization_tests.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 16feb19..d55e34b 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -201,11 +201,11 @@ def test_batched_gaussian_rasterizer_batch_processing():
     mode = "train"
 
     # Set up rasterization configuration for the batch
-    batched_tanfovx = torch.tensor([math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras])
-    batched_tanfovy = torch.tensor([math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras])
-    batched_viewmatrix = torch.stack([camera.world_view_transform for camera in batched_viewpoint_cameras])
-    batched_projmatrix = torch.stack([camera.full_proj_transform for camera in batched_viewpoint_cameras])
-    batched_campos = torch.stack([camera.camera_center for camera in batched_viewpoint_cameras])
+    batched_tanfovx = torch.tensor([math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras]).cuda()
+    batched_tanfovy = torch.tensor([math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras]).cuda()
+    batched_viewmatrix = torch.stack([camera.world_view_transform for camera in batched_viewpoint_cameras]).cuda()
+    batched_projmatrix = torch.stack([camera.full_proj_transform for camera in batched_viewpoint_cameras]).cuda()
+    batched_campos = torch.stack([camera.camera_center for camera in batched_viewpoint_cameras]).cuda()
     
     batched_raster_settings = GaussianRasterizationSettings(
         image_height=int(batched_viewpoint_cameras[0].image_height),

From 34ebced4edd04a3cd05a2a6a16b33ccae7302504 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 19:29:02 -0400
Subject: [PATCH 27/53] Refactor test_batched_gaussian_rasterizer and
 test_batched_gaussian_rasterizer_batch_processing functions in
 rasterization_tests.py

---
 rasterization_tests.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index d55e34b..35fd237 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -263,9 +263,7 @@ def test_batched_gaussian_rasterizer_batch_processing():
     batched_means2D_batch_processed = test_batched_gaussian_rasterizer_batch_processing()
     
     equal_elements = torch.eq(batched_means2D, batched_means2D_batch_processed)
-    all_equal = torch.all(equal_elements)
-    print(all_equal)
 
-    assert(all_equal is True)#means2d
+    assert torch.all(equal_elements)
 
     

From f6374d98c20f66aaefa5c73af0504811c8ab7b7b Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 19:37:01 -0400
Subject: [PATCH 28/53] Refactor test_batched_gaussian_rasterizer and
 test_batched_gaussian_rasterizer_batch_processing functions in
 rasterization_tests.py

---
 rasterization_tests.py | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 35fd237..231500e 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -169,7 +169,10 @@ def test_batched_gaussian_rasterizer():
     # Perform further operations with the batched results
     # Test results and performance
    
-    return torch.stack(batched_means2D, dim=0)
+    batched_means2D = torch.stack(batched_means2D, dim=0)
+    batched_radii = torch.stack(batched_radii, dim=0)
+    
+    return batched_means2D, batched_radii, batched_screenspace_params
     
     
 def test_batched_gaussian_rasterizer_batch_processing():
@@ -255,15 +258,33 @@ def test_batched_gaussian_rasterizer_batch_processing():
     assert batched_depths.shape == (num_batches, num_gaussians)
     torch.cuda.empty_cache()
     
-    return batched_means2D
+    batched_screenspace_params = []
+    for i in range(num_batches):
+        means2D = batched_means2D[i]
+        rgb = batched_rgb[i]
+        conic_opacity = batched_conic_opacity[i]
+        radii = batched_radii[i]
+        depths = batched_depths[i]
+        
+        screenspace_params = [means2D, rgb, conic_opacity, radii, depths]
+        batched_screenspace_params.append(screenspace_params)
+    
+    return batched_means2D, batched_radii, batched_screenspace_params
 
 
-if __name__ == "__main__":
-    batched_means2D=test_batched_gaussian_rasterizer()
-    batched_means2D_batch_processed = test_batched_gaussian_rasterizer_batch_processing()
-    
-    equal_elements = torch.eq(batched_means2D, batched_means2D_batch_processed)
+def assert_tensor_equal(tensor1, tensor2):
+    return torch.all(torch.eq(tensor1, tensor2))
 
-    assert torch.all(equal_elements)
+if __name__ == "__main__":
+    batched_means2D, batched_radii, batched_screenspace_params = test_batched_gaussian_rasterizer()
+    batched_means2D_batch_processed, batched_radii_batch_processed, batched_screenspace_params_batch_processed = test_batched_gaussian_rasterizer_batch_processing()
+        
+    assert assert_tensor_equal(batched_means2D, batched_means2D_batch_processed)
+    assert assert_tensor_equal(batched_radii, batched_radii_batch_processed)
+    assert len(batched_screenspace_params) == len(batched_screenspace_params_batch_processed)
+    for i in range(len(batched_screenspace_params)):
+        assert len(batched_screenspace_params[i]) == len(batched_screenspace_params_batch_processed[i])
+        for j in range(len(batched_screenspace_params[i])):
+            assert assert_tensor_equal(batched_screenspace_params[i][j], batched_screenspace_params_batch_processed[i][j])
 
     

From d0230a6b2181f0eb784d4705a5db05e8106f7a65 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 19:44:23 -0400
Subject: [PATCH 29/53] Refactor assert_tensor_equal function to
 compare_tensors in rasterization_tests.py

---
 rasterization_tests.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 231500e..a88c800 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -272,19 +272,36 @@ def test_batched_gaussian_rasterizer_batch_processing():
     return batched_means2D, batched_radii, batched_screenspace_params
 
 
-def assert_tensor_equal(tensor1, tensor2):
-    return torch.all(torch.eq(tensor1, tensor2))
+def compare_tensors(tensor1, tensor2):
+    if tensor1.shape != tensor2.shape:
+        print("Tensors have different shapes:")
+        print("Tensor 1 shape:", tensor1.shape)
+        print("Tensor 2 shape:", tensor2.shape)
+        return False
+
+    equality_matrix = torch.eq(tensor1, tensor2)
+    if torch.all(equality_matrix):
+        print("All values in the tensors are equal.")
+        return True
+    else:
+        print("Tensors have non-matching values.")
+        non_matching_indices = torch.where(equality_matrix == False)
+        for idx in zip(*non_matching_indices):
+            value1 = tensor1[idx].item()
+            value2 = tensor2[idx].item()
+            print(f"Non-matching values at index {idx}: {value1} != {value2}")
+        return False
 
 if __name__ == "__main__":
     batched_means2D, batched_radii, batched_screenspace_params = test_batched_gaussian_rasterizer()
     batched_means2D_batch_processed, batched_radii_batch_processed, batched_screenspace_params_batch_processed = test_batched_gaussian_rasterizer_batch_processing()
         
-    assert assert_tensor_equal(batched_means2D, batched_means2D_batch_processed)
-    assert assert_tensor_equal(batched_radii, batched_radii_batch_processed)
+    assert compare_tensors(batched_means2D, batched_means2D_batch_processed)
+    assert compare_tensors(batched_radii, batched_radii_batch_processed)
     assert len(batched_screenspace_params) == len(batched_screenspace_params_batch_processed)
     for i in range(len(batched_screenspace_params)):
         assert len(batched_screenspace_params[i]) == len(batched_screenspace_params_batch_processed[i])
         for j in range(len(batched_screenspace_params[i])):
-            assert assert_tensor_equal(batched_screenspace_params[i][j], batched_screenspace_params_batch_processed[i][j])
+            assert compare_tensors(batched_screenspace_params[i][j], batched_screenspace_params_batch_processed[i][j])
 
     

From e529d2ac76dec0aea6d9a755393b9d388594bd62 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 20:12:22 -0400
Subject: [PATCH 30/53] tile_grid calculated before kernel launch

---
 cuda_rasterizer/forward.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 16d33a1..92fda9b 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -616,7 +616,8 @@ void FORWARD::preprocess_batch(int P, int D, int M,
 	bool prefiltered,
     const int num_viewpoints)
 {
-    preprocessCUDABatched<NUM_CHANNELS><<<grid, ONE_DIM_BLOCK_SIZE>>>(
+    dim3 tile_grid(cdiv(P, ONE_DIM_BLOCK_SIZE), num_viewpoints);
+    preprocessCUDABatched<NUM_CHANNELS><<<tile_grid, ONE_DIM_BLOCK_SIZE>>>(
 		P, D, M,
 		means3D,
 		scales,

From 09b853effb501077563d3231470cbcc98dd91abb Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 20:12:40 -0400
Subject: [PATCH 31/53] Fix indexing bug in preprocessCUDABatched function

---
 cuda_rasterizer/forward.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 92fda9b..0ec6dd0 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -513,12 +513,12 @@ __global__ void preprocessCUDABatched(
     float* rgb, float4* conic_opacity, const dim3 grid, uint32_t* tiles_touched,
     bool prefiltered, const int num_viewpoints)
 {
-    auto point_idx = cg::this_grid().thread_rank();
+    auto point_idx = blockIdx.x * blockDim.x + threadIdx.x;
     auto viewpoint_idx = blockIdx.y;
 
     if (viewpoint_idx >= num_viewpoints || point_idx >= P) return;
 
-    auto idx = viewpoint_idx * P + point_idx;
+    auto idx = viewpoint_idx * num_viewpoints + point_idx;
     const float* viewmatrix = viewmatrix_arr + viewpoint_idx * 16;
     const float* projmatrix = projmatrix_arr + viewpoint_idx * 16;
 

From 44f8fc19581bc5782525824088b5574c28018bc3 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 22:01:39 -0400
Subject: [PATCH 32/53] Refactor indexing in preprocessCUDABatched function in
 forward.cu

---
 cuda_rasterizer/forward.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 0ec6dd0..358ab5e 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -529,10 +529,10 @@ __global__ void preprocessCUDABatched(
 
     // Perform near culling, quit if outside.
     float3 p_view;
-    if (!in_frustum(point_idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) return;
+    if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) return;
 
     // Transform point by projecting
-    float3 p_orig = { orig_points[3 * point_idx], orig_points[3 * point_idx + 1], orig_points[3 * point_idx + 2] };
+    float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
     float4 p_hom = transformPoint4x4(p_orig, projmatrix);
     float p_w = 1.0f / (p_hom.w + 0.0000001f);
     float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
@@ -541,9 +541,9 @@ __global__ void preprocessCUDABatched(
     // from scaling and rotation parameters.
     const float* cov3D;
     if (cov3D_precomp != nullptr) {
-        cov3D = cov3D_precomp + point_idx * 6;
+        cov3D = cov3D_precomp + idx * 6;
     } else {
-        computeCov3D(scales[point_idx], scale_modifier, rotations[point_idx], cov3Ds + idx * 6);
+        computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6);
         cov3D = cov3Ds + idx * 6;
     }
 
@@ -574,7 +574,7 @@ __global__ void preprocessCUDABatched(
     // If colors have been precomputed, use them, otherwise convert
     // spherical harmonics coefficients to RGB color.
     if (colors_precomp == nullptr) {
-        glm::vec3 result = computeColorFromSH(point_idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3);
+        glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3);
         rgb[idx * C + 0] = result.x;
         rgb[idx * C + 1] = result.y;
         rgb[idx * C + 2] = result.z;
@@ -586,7 +586,7 @@ __global__ void preprocessCUDABatched(
     points_xy_image[idx] = point_image;
 
     // Inverse 2D covariance and opacity neatly pack into one float4
-    conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[point_idx] };
+    conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] };
     tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x);
 }
 

From a00921dfd5ae4e061dac171b55fa163e05071465 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Thu, 25 Apr 2024 22:11:59 -0400
Subject: [PATCH 33/53] Refactor tile_grid calculation in rasterizer_impl.cu

---
 cuda_rasterizer/rasterizer_impl.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu
index c7fe67b..06e95db 100644
--- a/cuda_rasterizer/rasterizer_impl.cu
+++ b/cuda_rasterizer/rasterizer_impl.cu
@@ -464,7 +464,9 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches(
 
 	//CONVERT ALL VECTORS TO FLOATSSSSSS PRAPTIIIIIII
 
-    dim3 tile_grid(cdiv(P, ONE_DIM_BLOCK_SIZE), num_viewpoints);
+    dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
+	dim3 block(BLOCK_X, BLOCK_Y, 1);
+	int tile_num = tile_grid.x * tile_grid.y;
 
 	// allocate temporary buffer for tiles_touched.
 	// In sep_rendering==True case, we will compute tiles_touched in the renderForward. 

From 890d95fc21bdfc7763e66ddbf712e69f81bff700 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Fri, 26 Apr 2024 10:36:34 -0400
Subject: [PATCH 34/53] Refactor indexing in preprocessCUDABatched function in
 forward.cu

---
 cuda_rasterizer/forward.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 358ab5e..5dbc9ff 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -529,10 +529,10 @@ __global__ void preprocessCUDABatched(
 
     // Perform near culling, quit if outside.
     float3 p_view;
-    if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) return;
+    if (!in_frustum(point_idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) return;
 
     // Transform point by projecting
-    float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
+    float3 p_orig = { orig_points[3 * point_idx], orig_points[3 * point_idx + 1], orig_points[3 * point_idx + 2] };
     float4 p_hom = transformPoint4x4(p_orig, projmatrix);
     float p_w = 1.0f / (p_hom.w + 0.0000001f);
     float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
@@ -543,7 +543,7 @@ __global__ void preprocessCUDABatched(
     if (cov3D_precomp != nullptr) {
         cov3D = cov3D_precomp + idx * 6;
     } else {
-        computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6);
+        computeCov3D(scales[point_idx], scale_modifier, rotations[point_idx], cov3Ds + idx * 6);
         cov3D = cov3Ds + idx * 6;
     }
 
@@ -574,7 +574,7 @@ __global__ void preprocessCUDABatched(
     // If colors have been precomputed, use them, otherwise convert
     // spherical harmonics coefficients to RGB color.
     if (colors_precomp == nullptr) {
-        glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3);
+        glm::vec3 result = computeColorFromSH(point_idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3);
         rgb[idx * C + 0] = result.x;
         rgb[idx * C + 1] = result.y;
         rgb[idx * C + 2] = result.z;
@@ -586,7 +586,7 @@ __global__ void preprocessCUDABatched(
     points_xy_image[idx] = point_image;
 
     // Inverse 2D covariance and opacity neatly pack into one float4
-    conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] };
+    conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[point_idx] };
     tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x);
 }
 

From b3ad196bbf3a9996fe980c70a917aca31ec9c88c Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Fri, 26 Apr 2024 14:05:12 -0400
Subject: [PATCH 35/53] Refactor compare_tensors function in
 rasterization_tests.py to handle non-matching values

---
 rasterization_tests.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index a88c800..ef3c23f 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -281,12 +281,11 @@ def compare_tensors(tensor1, tensor2):
 
     equality_matrix = torch.eq(tensor1, tensor2)
     if torch.all(equality_matrix):
-        print("All values in the tensors are equal.")
         return True
     else:
-        print("Tensors have non-matching values.")
+        print("Tensors have non-matching values. Some of them are")
         non_matching_indices = torch.where(equality_matrix == False)
-        for idx in zip(*non_matching_indices):
+        for idx in zip(*non_matching_indices[:5]):
             value1 = tensor1[idx].item()
             value2 = tensor2[idx].item()
             print(f"Non-matching values at index {idx}: {value1} != {value2}")

From abfb8b4119e278b670a1c97235c46d27e0a69c99 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Fri, 26 Apr 2024 14:07:03 -0400
Subject: [PATCH 36/53] Fix indexing bug in preprocessCUDABatched function

---
 cuda_rasterizer/forward.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 5dbc9ff..807bb84 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -518,7 +518,7 @@ __global__ void preprocessCUDABatched(
 
     if (viewpoint_idx >= num_viewpoints || point_idx >= P) return;
 
-    auto idx = viewpoint_idx * num_viewpoints + point_idx;
+    auto idx = viewpoint_idx * P + point_idx;
     const float* viewmatrix = viewmatrix_arr + viewpoint_idx * 16;
     const float* projmatrix = projmatrix_arr + viewpoint_idx * 16;
 

From 18f9c2022fc643aa6628458344f67b8acd94cdb0 Mon Sep 17 00:00:00 2001
From: Prapti Trivedi <prapti.trivedi19@gmail.com>
Date: Fri, 26 Apr 2024 14:24:43 -0400
Subject: [PATCH 37/53] Update rasterization_tests.py

---
 rasterization_tests.py | 43 ++++++++++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index ef3c23f..1295902 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -9,7 +9,7 @@
     GaussianRasterizerBatches,
 )
 
-num_gaussians = 10000
+num_gaussians = 50
 num_batches=1
 means3D = torch.randn(num_gaussians, 3).cuda()
 scales = torch.randn(num_gaussians, 3).cuda()
@@ -114,6 +114,9 @@ def test_batched_gaussian_rasterizer():
     batched_screenspace_params = []
     batched_means2D = []
     batched_radii = []
+    batched_conic_opacity=[]
+    batched_depths=[]
+    batched_rgb=[]
 
     start_time = time.time()
     
@@ -161,6 +164,9 @@ def test_batched_gaussian_rasterizer():
         batched_rasterizers.append(rasterizer)
         batched_screenspace_params.append(screenspace_params)
         batched_radii.append(radii)
+        batched_rgb.append(rgb)
+        batched_conic_opacity.append(conic_opacity)
+        batched_depths.append(depths)
 
 
     end_time = time.time()
@@ -171,8 +177,11 @@ def test_batched_gaussian_rasterizer():
    
     batched_means2D = torch.stack(batched_means2D, dim=0)
     batched_radii = torch.stack(batched_radii, dim=0)
+    batched_conic_opacity=torch.stack(batched_conic_opacity,dim=0)
+    batched_rgb=torch.stack(batched_rgb,dim=0)
+    batched_depths=torch.stack(batched_depths,dim=0)
     
-    return batched_means2D, batched_radii, batched_screenspace_params
+    return batched_means2D, batched_radii, batched_screenspace_params,batched_conic_opacity,batched_rgb,batched_depths
     
     
 def test_batched_gaussian_rasterizer_batch_processing():
@@ -269,7 +278,7 @@ def test_batched_gaussian_rasterizer_batch_processing():
         screenspace_params = [means2D, rgb, conic_opacity, radii, depths]
         batched_screenspace_params.append(screenspace_params)
     
-    return batched_means2D, batched_radii, batched_screenspace_params
+    return batched_means2D, batched_radii, batched_screenspace_params, batched_conic_opacity,batched_rgb,batched_depths
 
 
 def compare_tensors(tensor1, tensor2):
@@ -281,26 +290,36 @@ def compare_tensors(tensor1, tensor2):
 
     equality_matrix = torch.eq(tensor1, tensor2)
     if torch.all(equality_matrix):
+        print("All values in the tensors are equal.")
         return True
     else:
-        print("Tensors have non-matching values. Some of them are")
+        print("Tensors have non-matching values.")
         non_matching_indices = torch.where(equality_matrix == False)
-        for idx in zip(*non_matching_indices[:5]):
+        for idx in zip(*non_matching_indices):
             value1 = tensor1[idx].item()
             value2 = tensor2[idx].item()
-            print(f"Non-matching values at index {idx}: {value1} != {value2}")
+            # print(f"Non-matching values at index {idx}: {value1} != {value2}")
         return False
 
 if __name__ == "__main__":
-    batched_means2D, batched_radii, batched_screenspace_params = test_batched_gaussian_rasterizer()
-    batched_means2D_batch_processed, batched_radii_batch_processed, batched_screenspace_params_batch_processed = test_batched_gaussian_rasterizer_batch_processing()
+    batched_means2D, batched_radii, batched_screenspace_params,batched_conic_opacity,batched_rgb,batched_depths = test_batched_gaussian_rasterizer()
+    batched_means2D_batch_processed, batched_radii_batch_processed, batched_screenspace_params_batch_processed,batched_conic_opacity_batch_processed,batched_rgb_batch_processed,batched_depths_batch_processed = test_batched_gaussian_rasterizer_batch_processing()
         
     assert compare_tensors(batched_means2D, batched_means2D_batch_processed)
     assert compare_tensors(batched_radii, batched_radii_batch_processed)
+    assert compare_tensors(batched_conic_opacity, batched_conic_opacity_batch_processed)
+    print(batched_rgb.shape,batched_rgb_batch_processed.shape)
+    print(batched_rgb)
+    print('*****')
+    print(batched_rgb_batch_processed)
+
+    assert compare_tensors(batched_rgb, batched_rgb_batch_processed)
+    assert compare_tensors(batched_depths, batched_depths_batch_processed)
     assert len(batched_screenspace_params) == len(batched_screenspace_params_batch_processed)
-    for i in range(len(batched_screenspace_params)):
-        assert len(batched_screenspace_params[i]) == len(batched_screenspace_params_batch_processed[i])
-        for j in range(len(batched_screenspace_params[i])):
-            assert compare_tensors(batched_screenspace_params[i][j], batched_screenspace_params_batch_processed[i][j])
+    # for i in range(len(batched_screenspace_params)):
+    #     assert len(batched_screenspace_params[i]) == len(batched_screenspace_params_batch_processed[i])
+    #     for j in range(len(batched_screenspace_params[i])):
+    #         print(i,j)
+    #         assert compare_tensors(batched_screenspace_params[i][j], batched_screenspace_params_batch_processed[i][j])
 
     

From a04b34dd88544c7721b69c2e7383ec5e40c520b7 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Fri, 26 Apr 2024 14:28:51 -0400
Subject: [PATCH 38/53] Refactor compare_tensors function in
 rasterization_tests.py to handle non-matching values

---
 rasterization_tests.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 1295902..0613e80 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -290,15 +290,14 @@ def compare_tensors(tensor1, tensor2):
 
     equality_matrix = torch.eq(tensor1, tensor2)
     if torch.all(equality_matrix):
-        print("All values in the tensors are equal.")
         return True
     else:
         print("Tensors have non-matching values.")
         non_matching_indices = torch.where(equality_matrix == False)
-        for idx in zip(*non_matching_indices):
+        for idx in zip(*non_matching_indices)[:5]:
             value1 = tensor1[idx].item()
             value2 = tensor2[idx].item()
-            # print(f"Non-matching values at index {idx}: {value1} != {value2}")
+            print(f"Non-matching values at index {idx}: {value1} != {value2}")
         return False
 
 if __name__ == "__main__":

From e593132761793c958fd481b266df94bc360d8a8e Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Fri, 26 Apr 2024 14:34:52 -0400
Subject: [PATCH 39/53] Refactor compare_tensors function to fix indexing bug
 and handle non-matching values

---
 rasterization_tests.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 0613e80..af252e6 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -294,7 +294,7 @@ def compare_tensors(tensor1, tensor2):
     else:
         print("Tensors have non-matching values.")
         non_matching_indices = torch.where(equality_matrix == False)
-        for idx in zip(*non_matching_indices)[:5]:
+        for idx in zip(*non_matching_indices[:5]):
             value1 = tensor1[idx].item()
             value2 = tensor2[idx].item()
             print(f"Non-matching values at index {idx}: {value1} != {value2}")
@@ -307,7 +307,6 @@ def compare_tensors(tensor1, tensor2):
     assert compare_tensors(batched_means2D, batched_means2D_batch_processed)
     assert compare_tensors(batched_radii, batched_radii_batch_processed)
     assert compare_tensors(batched_conic_opacity, batched_conic_opacity_batch_processed)
-    print(batched_rgb.shape,batched_rgb_batch_processed.shape)
     print(batched_rgb)
     print('*****')
     print(batched_rgb_batch_processed)
@@ -315,10 +314,4 @@ def compare_tensors(tensor1, tensor2):
     assert compare_tensors(batched_rgb, batched_rgb_batch_processed)
     assert compare_tensors(batched_depths, batched_depths_batch_processed)
     assert len(batched_screenspace_params) == len(batched_screenspace_params_batch_processed)
-    # for i in range(len(batched_screenspace_params)):
-    #     assert len(batched_screenspace_params[i]) == len(batched_screenspace_params_batch_processed[i])
-    #     for j in range(len(batched_screenspace_params[i])):
-    #         print(i,j)
-    #         assert compare_tensors(batched_screenspace_params[i][j], batched_screenspace_params_batch_processed[i][j])
-
     

From e109969dc0cca0a6c5eded09498b1742a77a04c5 Mon Sep 17 00:00:00 2001
From: Prapti Trivedi <prapti.trivedi19@gmail.com>
Date: Fri, 26 Apr 2024 17:17:49 -0400
Subject: [PATCH 40/53] Update forward.cu

---
 cuda_rasterizer/forward.cu | 71 ++++++++++++++++++++++++++++++++++----
 1 file changed, 65 insertions(+), 6 deletions(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 807bb84..8d384e6 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -28,8 +28,9 @@ __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const
 	dir = dir / glm::length(dir);
 
 	glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs;
+	
 	glm::vec3 result = SH_C0 * sh[0];
-
+	
 	if (deg > 0)
 	{
 		float x = dir.x;
@@ -70,6 +71,59 @@ __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const
 	clamped[3 * idx + 2] = (result.z < 0);
 	return glm::max(result, 0.0f);
 }
+__device__ glm::vec3 computeColorFromSHBatched(int idx, int view_idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped)
+{
+	// The implementation is loosely based on code for 
+	// "Differentiable Point-Based Radiance Fields for 
+	// Efficient View Synthesis" by Zhang et al. (2022)
+	glm::vec3 pos = means[idx];
+	glm::vec3 dir = pos - campos;
+	dir = dir / glm::length(dir);
+
+	glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs;
+	
+	glm::vec3 result = SH_C0 * sh[0];
+	
+	if (deg > 0)
+	{
+		float x = dir.x;
+		float y = dir.y;
+		float z = dir.z;
+		result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3];
+
+		if (deg > 1)
+		{
+			float xx = x * x, yy = y * y, zz = z * z;
+			float xy = x * y, yz = y * z, xz = x * z;
+			result = result +
+				SH_C2[0] * xy * sh[4] +
+				SH_C2[1] * yz * sh[5] +
+				SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] +
+				SH_C2[3] * xz * sh[7] +
+				SH_C2[4] * (xx - yy) * sh[8];
+
+			if (deg > 2)
+			{
+				result = result +
+					SH_C3[0] * y * (3.0f * xx - yy) * sh[9] +
+					SH_C3[1] * xy * z * sh[10] +
+					SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] +
+					SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] +
+					SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] +
+					SH_C3[5] * z * (xx - yy) * sh[14] +
+					SH_C3[6] * x * (xx - 3.0f * yy) * sh[15];
+			}
+		}
+	}
+	result += 0.5f;
+
+	// RGB colors are clamped to positive values. If values are
+	// clamped, we need to keep track of this for the backward pass.
+	clamped[3 * view_idx + 0] = (result.x < 0);
+	clamped[3 * view_idx + 1] = (result.y < 0);
+	clamped[3 * view_idx + 2] = (result.z < 0);
+	return glm::max(result, 0.0f);
+}
 
 // Forward version of 2D covariance matrix computation
 __device__ float3 computeCov2D(const float3& mean, float focal_x, float focal_y, float tan_fovx, float tan_fovy, const float* cov3D, const float* viewmatrix)
@@ -197,6 +251,7 @@ __global__ void preprocessCUDA(int P, int D, int M,
 
 	// Transform point by projecting
 	float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
+
 	float4 p_hom = transformPoint4x4(p_orig, projmatrix);
 	float p_w = 1.0f / (p_hom.w + 0.0000001f);
 	float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
@@ -213,10 +268,9 @@ __global__ void preprocessCUDA(int P, int D, int M,
 		computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6);
 		cov3D = cov3Ds + idx * 6;
 	}
-
 	// Compute 2D screen-space covariance matrix
 	float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix);
-
+	
 	// Invert covariance (EWA algorithm)
 	float det = (cov.x * cov.z - cov.y * cov.y);
 	if (det == 0.0f)
@@ -241,7 +295,7 @@ __global__ void preprocessCUDA(int P, int D, int M,
 	// If colors have been precomputed, use them, otherwise convert
 	// spherical harmonics coefficients to RGB color.
 	if (colors_precomp == nullptr)
-	{
+	{	
 		glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped);
 		rgb[idx * C + 0] = result.x;
 		rgb[idx * C + 1] = result.y;
@@ -533,6 +587,7 @@ __global__ void preprocessCUDABatched(
 
     // Transform point by projecting
     float3 p_orig = { orig_points[3 * point_idx], orig_points[3 * point_idx + 1], orig_points[3 * point_idx + 2] };
+
     float4 p_hom = transformPoint4x4(p_orig, projmatrix);
     float p_w = 1.0f / (p_hom.w + 0.0000001f);
     float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
@@ -546,12 +601,14 @@ __global__ void preprocessCUDABatched(
         computeCov3D(scales[point_idx], scale_modifier, rotations[point_idx], cov3Ds + idx * 6);
         cov3D = cov3Ds + idx * 6;
     }
+	
 
     // Compute 2D screen-space covariance matrix
     const float focal_x = W / (2.0f * tan_fovx[viewpoint_idx]);
     const float focal_y = H / (2.0f * tan_fovy[viewpoint_idx]);
     float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx[viewpoint_idx], tan_fovy[viewpoint_idx], cov3D, viewmatrix);
 
+
     // Invert covariance (EWA algorithm)
     float det = (cov.x * cov.z - cov.y * cov.y);
     if (det == 0.0f) return;
@@ -573,8 +630,10 @@ __global__ void preprocessCUDABatched(
 
     // If colors have been precomputed, use them, otherwise convert
     // spherical harmonics coefficients to RGB color.
+	
     if (colors_precomp == nullptr) {
-        glm::vec3 result = computeColorFromSH(point_idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3);
+		
+        glm::vec3 result = computeColorFromSHBatched(point_idx,idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3);
         rgb[idx * C + 0] = result.x;
         rgb[idx * C + 1] = result.y;
         rgb[idx * C + 2] = result.z;
@@ -644,4 +703,4 @@ void FORWARD::preprocess_batch(int P, int D, int M,
 		prefiltered,
 		num_viewpoints
 		);
-}
\ No newline at end of file
+}

From edfea2eefc16fa890e0f92f4842b5b7e5ddb78cd Mon Sep 17 00:00:00 2001
From: Prapti Trivedi <prapti.trivedi19@gmail.com>
Date: Sat, 27 Apr 2024 00:54:29 -0400
Subject: [PATCH 41/53] Update rasterization_tests.py

---
 rasterization_tests.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index af252e6..fab689a 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -13,8 +13,8 @@
 num_batches=1
 means3D = torch.randn(num_gaussians, 3).cuda()
 scales = torch.randn(num_gaussians, 3).cuda()
-rotations = torch.randn(num_gaussians, 3, 3).cuda()
-shs = torch.randn(num_gaussians, 9).cuda()
+rotations = torch.randn(num_gaussians,4).cuda()
+shs = torch.randn(num_gaussians, 16,3).cuda()
 opacity = torch.randn(num_gaussians, 1).cuda()
 
 def get_cuda_args(strategy, mode="train"):

From 53a14e29740e331a537a3c66d5f63758ce503d15 Mon Sep 17 00:00:00 2001
From: Prapti Trivedi <prapti.trivedi19@gmail.com>
Date: Sat, 27 Apr 2024 00:55:05 -0400
Subject: [PATCH 42/53] Update forward.cu

---
 cuda_rasterizer/forward.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 8d384e6..567c7ae 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -633,7 +633,7 @@ __global__ void preprocessCUDABatched(
 	
     if (colors_precomp == nullptr) {
 		
-        glm::vec3 result = computeColorFromSHBatched(point_idx,idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3);
+        glm::vec3 result = computeColorFromSHBatched(point_idx,idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped);
         rgb[idx * C + 0] = result.x;
         rgb[idx * C + 1] = result.y;
         rgb[idx * C + 2] = result.z;

From 32a601f41467c1d27ccb80698d831a786b2dd39d Mon Sep 17 00:00:00 2001
From: Sandeep Menon <snm6477@nyu.edu>
Date: Sat, 27 Apr 2024 01:06:07 -0400
Subject: [PATCH 43/53] fixed sh_degree

---
 cuda_rasterizer/forward.cu |  2 +-
 rasterization_tests.py     | 17 +++++++----------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 8d384e6..567c7ae 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -633,7 +633,7 @@ __global__ void preprocessCUDABatched(
 	
     if (colors_precomp == nullptr) {
 		
-        glm::vec3 result = computeColorFromSHBatched(point_idx,idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped + idx * 3);
+        glm::vec3 result = computeColorFromSHBatched(point_idx,idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped);
         rgb[idx * C + 0] = result.x;
         rgb[idx * C + 1] = result.y;
         rgb[idx * C + 2] = result.z;
diff --git a/rasterization_tests.py b/rasterization_tests.py
index af252e6..afea6b9 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -9,12 +9,12 @@
     GaussianRasterizerBatches,
 )
 
-num_gaussians = 50
-num_batches=1
+num_gaussians = 10000
+num_batches=32
 means3D = torch.randn(num_gaussians, 3).cuda()
 scales = torch.randn(num_gaussians, 3).cuda()
-rotations = torch.randn(num_gaussians, 3, 3).cuda()
-shs = torch.randn(num_gaussians, 9).cuda()
+rotations = torch.randn(num_gaussians, 4).cuda()
+shs = torch.randn(num_gaussians, 16, 3).cuda()
 opacity = torch.randn(num_gaussians, 1).cuda()
 
 def get_cuda_args(strategy, mode="train"):
@@ -104,7 +104,7 @@ def test_batched_gaussian_rasterizer():
     bg_color = torch.ones(3).cuda()
     scaling_modifier = 1.0
     pc = type('PC', (), {})
-    pc.active_sh_degree = 2
+    pc.active_sh_degree = 3
     pipe = type('Pipe', (), {})
     pipe.debug = False
     mode = "train"
@@ -207,7 +207,7 @@ def test_batched_gaussian_rasterizer_batch_processing():
     bg_color = torch.ones(3).cuda()
     scaling_modifier = 1.0
     pc = type('PC', (), {})
-    pc.active_sh_degree = 2
+    pc.active_sh_degree = 3
     pipe = type('Pipe', (), {})
     pipe.debug = False
     mode = "train"
@@ -265,7 +265,6 @@ def test_batched_gaussian_rasterizer_batch_processing():
     assert batched_conic_opacity.shape == (num_batches, num_gaussians,4)
     assert batched_radii.shape == (num_batches, num_gaussians)
     assert batched_depths.shape == (num_batches, num_gaussians)
-    torch.cuda.empty_cache()
     
     batched_screenspace_params = []
     for i in range(num_batches):
@@ -307,11 +306,9 @@ def compare_tensors(tensor1, tensor2):
     assert compare_tensors(batched_means2D, batched_means2D_batch_processed)
     assert compare_tensors(batched_radii, batched_radii_batch_processed)
     assert compare_tensors(batched_conic_opacity, batched_conic_opacity_batch_processed)
-    print(batched_rgb)
-    print('*****')
-    print(batched_rgb_batch_processed)
 
     assert compare_tensors(batched_rgb, batched_rgb_batch_processed)
     assert compare_tensors(batched_depths, batched_depths_batch_processed)
     assert len(batched_screenspace_params) == len(batched_screenspace_params_batch_processed)
     
+

From 22eb043f05cf4d47256f12907873ca643d294408 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Sat, 27 Apr 2024 20:23:47 -0400
Subject: [PATCH 44/53] Refactor GaussianRasterizationSettings class to handle
 raster_settings as a batch

---
 diff_gaussian_rasterization/__init__.py |  23 +++++-
 rasterization_tests.py                  | 100 +++++++-----------------
 2 files changed, 47 insertions(+), 76 deletions(-)

diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py
index f7860b4..6be6778 100644
--- a/diff_gaussian_rasterization/__init__.py
+++ b/diff_gaussian_rasterization/__init__.py
@@ -59,6 +59,19 @@ def forward(
     ):
 
         # Restructure arguments the way that the C++ lib expects them
+        if isinstance(raster_settings, list):
+            rs = raster_settings[0]
+            rs.viewmatrix, rs.projmatrix, rs.campos = [
+                torch.stack(tensors) for tensors in zip(
+                    *[(rs.viewmatrix, rs.projmatrix, rs.campos) for rs in raster_settings]
+                )
+            ]
+            rs.tanfovx, rs.tanfovy = [
+                torch.tensor(vals, device=means3D.device)
+                for vals in zip(*[(rs.tanfovx, rs.tanfovy) for rs in raster_settings])
+            ]
+            raster_settings = rs
+            
         args = (
             means3D,
             scales,
@@ -314,15 +327,17 @@ class GaussianRasterizationSettings(NamedTuple):
     debug : bool
 
 class GaussianRasterizerBatches(nn.Module):
-    def __init__(self, raster_settings):
+    def __init__(self, raster_settings_batch):
         super().__init__()
-        self.raster_settings_list = raster_settings
+        self.raster_settings_batch = raster_settings_batch
 
     def markVisible(self, positions):
         # Mark visible points (based on frustum culling for camera) with a boolean 
         with torch.no_grad():
             visible = []
-            for viewmatrix, projmatrix in zip(self.raster_settings.viewmatrix, self.raster_settings.projmatrix):
+            for raster_settings in self.raster_settings_batch:
+                viewmatrix = raster_settings.viewmatrix
+                projmatrix = raster_settings.projmatrix
                 visible.append(_C.mark_visible(positions, viewmatrix, projmatrix))
         return visible
 
@@ -335,7 +350,7 @@ def preprocess_gaussians(self, means3D, scales, rotations, shs, opacities, batch
                 rotations,
                 shs,
                 opacities,
-                self.raster_settings_list,
+                self.raster_settings_batch,
                 batched_cuda_args)
 
 class GaussianRasterizer(nn.Module):
diff --git a/rasterization_tests.py b/rasterization_tests.py
index afea6b9..2452cbf 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -16,6 +16,7 @@
 rotations = torch.randn(num_gaussians, 4).cuda()
 shs = torch.randn(num_gaussians, 16, 3).cuda()
 opacity = torch.randn(num_gaussians, 1).cuda()
+SH_ACTIVE_DEGREE = 3
 
 def get_cuda_args(strategy, mode="train"):
     cuda_args = {
@@ -36,52 +37,6 @@ def get_cuda_args(strategy, mode="train"):
     }
     return cuda_args
 
-def test_gaussian_rasterizer_time():
-    # Set up the input data
-    num_gaussians = 10000
-    means3D = torch.randn(num_gaussians, 3).cuda()
-    scales = torch.randn(num_gaussians, 3).cuda()
-    rotations = torch.randn(num_gaussians, 3, 3).cuda()
-    shs = torch.randn(num_gaussians, 9).cuda()
-    opacities = torch.randn(num_gaussians, 1).cuda()
-
-    # Set up the rasterization settings
-    image_height = 512
-    image_width = 512
-    tanfovx = 1.0
-    tanfovy = 1.0
-    bg = torch.ones(3).cuda()
-    scale_modifier = 1.0
-    viewmatrix = torch.eye(4).cuda()
-    projmatrix = torch.eye(4).cuda()
-    sh_degree = 2
-    campos = torch.zeros(3).cuda()
-    prefiltered = False
-    debug = False
-    
-    # mode="train"
-    # strategy=None
-    # cuda_args = get_cuda_args(strategy, mode)
-
-    raster_settings = GaussianRasterizationSettings(
-        image_height, image_width, tanfovx, tanfovy, bg,
-        scale_modifier, viewmatrix, projmatrix, sh_degree,
-        campos, prefiltered, debug
-    )
-
-    # Create the GaussianRasterizer
-    rasterizer = GaussianRasterizer(raster_settings)
-
-    # Measure the time for preprocess_gaussians
-    start_time = time.time()
-    means2D, rgb, conic_opacity, radii, depths = rasterizer.preprocess_gaussians(
-        means3D, scales, rotations, shs, opacities
-    )
-    end_time = time.time()
-
-    preprocess_time = end_time - start_time
-    print(f"Time taken by preprocess_gaussians: {preprocess_time:.4f} seconds")
-    
 
 def test_batched_gaussian_rasterizer():       
     # Set up the viewpoint cameras
@@ -104,7 +59,7 @@ def test_batched_gaussian_rasterizer():
     bg_color = torch.ones(3).cuda()
     scaling_modifier = 1.0
     pc = type('PC', (), {})
-    pc.active_sh_degree = 3
+    pc.active_sh_degree = SH_ACTIVE_DEGREE
     pipe = type('Pipe', (), {})
     pipe.debug = False
     mode = "train"
@@ -207,39 +162,40 @@ def test_batched_gaussian_rasterizer_batch_processing():
     bg_color = torch.ones(3).cuda()
     scaling_modifier = 1.0
     pc = type('PC', (), {})
-    pc.active_sh_degree = 3
+    pc.active_sh_degree = SH_ACTIVE_DEGREE
     pipe = type('Pipe', (), {})
     pipe.debug = False
     mode = "train"
 
     # Set up rasterization configuration for the batch
-    batched_tanfovx = torch.tensor([math.tan(camera.FoVx * 0.5) for camera in batched_viewpoint_cameras]).cuda()
-    batched_tanfovy = torch.tensor([math.tan(camera.FoVy * 0.5) for camera in batched_viewpoint_cameras]).cuda()
-    batched_viewmatrix = torch.stack([camera.world_view_transform for camera in batched_viewpoint_cameras]).cuda()
-    batched_projmatrix = torch.stack([camera.full_proj_transform for camera in batched_viewpoint_cameras]).cuda()
-    batched_campos = torch.stack([camera.camera_center for camera in batched_viewpoint_cameras]).cuda()
-    
-    batched_raster_settings = GaussianRasterizationSettings(
-        image_height=int(batched_viewpoint_cameras[0].image_height),
-        image_width=int(batched_viewpoint_cameras[0].image_width),
-        tanfovx=batched_tanfovx,
-        tanfovy=batched_tanfovy,
-        bg=bg_color,
-        scale_modifier=scaling_modifier,
-        viewmatrix=batched_viewmatrix,
-        projmatrix=batched_projmatrix,
-        sh_degree=pc.active_sh_degree,
-        campos=batched_campos,
-        prefiltered=False,
-        debug=pipe.debug
-    )
+    batched_raster_settings = []
+    batched_cuda_args = []
+    for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)):
+        ########## [START] Prepare CUDA Rasterization Settings ##########
+        cuda_args = get_cuda_args(strategy, mode)
+        batched_cuda_args.append(cuda_args)
+        tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
+        tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)
+
+        raster_settings = GaussianRasterizationSettings(
+            image_height=int(batched_viewpoint_cameras[0].image_height),
+            image_width=int(batched_viewpoint_cameras[0].image_width),
+            tanfovx=tanfovx,
+            tanfovy=tanfovy,
+            bg=bg_color,
+            scale_modifier=scaling_modifier,
+            viewmatrix=viewpoint_camera.world_view_transform,
+            projmatrix=viewpoint_camera.full_proj_transform,
+            sh_degree=pc.active_sh_degree,
+            campos=viewpoint_camera.camera_center,
+            prefiltered=False,
+            debug=pipe.debug
+        )
+        batched_raster_settings.append(raster_settings)
 
     # Create the GaussianRasterizer for the batch
     rasterizer = GaussianRasterizerBatches(raster_settings=batched_raster_settings)
 
-    # Set up CUDA arguments for the batch
-    cuda_args = get_cuda_args(batched_strategies[0], mode)  # TODO: Check if this is correct for the batch
-
     # Preprocess the Gaussians for the entire batch
     batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians(
         means3D=means3D,
@@ -247,7 +203,7 @@ def test_batched_gaussian_rasterizer_batch_processing():
         rotations=rotations,
         shs=shs,
         opacities=opacity,
-        batched_cuda_args=cuda_args
+        batched_cuda_args=batched_cuda_args
     )
     end_time = time.time()
     preprocess_time = end_time - start_time

From 7ff2fd31b038cafd942e680018793004fc6815ba Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Sat, 27 Apr 2024 20:33:22 -0400
Subject: [PATCH 45/53] Refactor rasterization_tests.py to use
 raster_settings_batch instead of batched_raster_settings

---
 rasterization_tests.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 2452cbf..f69cc6e 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -168,7 +168,7 @@ def test_batched_gaussian_rasterizer_batch_processing():
     mode = "train"
 
     # Set up rasterization configuration for the batch
-    batched_raster_settings = []
+    raster_settings_batch = []
     batched_cuda_args = []
     for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)):
         ########## [START] Prepare CUDA Rasterization Settings ##########
@@ -191,10 +191,10 @@ def test_batched_gaussian_rasterizer_batch_processing():
             prefiltered=False,
             debug=pipe.debug
         )
-        batched_raster_settings.append(raster_settings)
+        raster_settings_batch.append(raster_settings)
 
     # Create the GaussianRasterizer for the batch
-    rasterizer = GaussianRasterizerBatches(raster_settings=batched_raster_settings)
+    rasterizer = GaussianRasterizerBatches(raster_settings_batch=raster_settings_batch)
 
     # Preprocess the Gaussians for the entire batch
     batched_means2D, batched_rgb, batched_conic_opacity, batched_radii, batched_depths = rasterizer.preprocess_gaussians(

From fc48eece6f933afda42ab7ca19690b667f71b186 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Sat, 27 Apr 2024 20:45:00 -0400
Subject: [PATCH 46/53] fixed namedtuple setting bug

---
 diff_gaussian_rasterization/__init__.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py
index 6be6778..fe4a910 100644
--- a/diff_gaussian_rasterization/__init__.py
+++ b/diff_gaussian_rasterization/__init__.py
@@ -60,18 +60,23 @@ def forward(
 
         # Restructure arguments the way that the C++ lib expects them
         if isinstance(raster_settings, list):
-            rs = raster_settings[0]
-            rs.viewmatrix, rs.projmatrix, rs.campos = [
+            viewmatrix, projmatrix, campos = [
                 torch.stack(tensors) for tensors in zip(
                     *[(rs.viewmatrix, rs.projmatrix, rs.campos) for rs in raster_settings]
                 )
             ]
-            rs.tanfovx, rs.tanfovy = [
+            tanfovx, tanfovy = [
                 torch.tensor(vals, device=means3D.device)
                 for vals in zip(*[(rs.tanfovx, rs.tanfovy) for rs in raster_settings])
             ]
-            raster_settings = rs
-            
+            raster_settings = raster_settings[0]._replace(
+                tanfovx=tanfovx,
+                tanfovy=tanfovy,
+                viewmatrix=viewmatrix,
+                projmatrix=projmatrix,
+                campos=campos
+            )  
+                      
         args = (
             means3D,
             scales,

From 49c517978304a5c7d183bbd714fb616de46a7f85 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Sat, 27 Apr 2024 20:53:21 -0400
Subject: [PATCH 47/53] Remove debug prints from forward() and pass a single
 cuda_args dict to the batched preprocess test

---
 diff_gaussian_rasterization/__init__.py | 2 --
 rasterization_tests.py                  | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py
index fe4a910..41b6a43 100644
--- a/diff_gaussian_rasterization/__init__.py
+++ b/diff_gaussian_rasterization/__init__.py
@@ -99,10 +99,8 @@ def forward(
 
         # TODO: update this. 
         if not torch.is_tensor(raster_settings.tanfovx):
-            print("normal batch calling")
             num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians(*args)
         else:
-            print("improved batch calling")
             num_rendered, means2D, depths, radii, cov3D, conic_opacity, rgb, clamped = _C.preprocess_gaussians_batched(*args)
 
         # Keep relevant tensors for backward
diff --git a/rasterization_tests.py b/rasterization_tests.py
index f69cc6e..7501854 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -203,7 +203,7 @@ def test_batched_gaussian_rasterizer_batch_processing():
         rotations=rotations,
         shs=shs,
         opacities=opacity,
-        batched_cuda_args=batched_cuda_args
+        batched_cuda_args=batched_cuda_args[0]  #TODO: look into sending list of cuda_args/strategies
     )
     end_time = time.time()
     preprocess_time = end_time - start_time

From a0d7127c4d80844d4566a779c520c1c284f6a543 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Sat, 27 Apr 2024 21:01:03 -0400
Subject: [PATCH 48/53] Update setup.py to remove debug flag from
 extra_compile_args

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 3d16c4d..03b2df8 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@
             headers=[
                 "config.h"
             ],
-            extra_compile_args={"nvcc": ["-g", "-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/")]})
+            extra_compile_args={"nvcc": ["-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/")]})
         ],
     cmdclass={
         'build_ext': BuildExtension

From a21c4b9303e56a901ea77555f8e767f1a0940a17 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Sat, 27 Apr 2024 21:01:27 -0400
Subject: [PATCH 49/53] Fix formatting issues in forward.cu and __init__.py

---
 cuda_rasterizer/forward.cu              | 8 +++-----
 diff_gaussian_rasterization/__init__.py | 2 --
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 567c7ae..fd1732a 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -28,9 +28,8 @@ __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const
 	dir = dir / glm::length(dir);
 
 	glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs;
-	
 	glm::vec3 result = SH_C0 * sh[0];
-	
+
 	if (deg > 0)
 	{
 		float x = dir.x;
@@ -251,7 +250,6 @@ __global__ void preprocessCUDA(int P, int D, int M,
 
 	// Transform point by projecting
 	float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
-
 	float4 p_hom = transformPoint4x4(p_orig, projmatrix);
 	float p_w = 1.0f / (p_hom.w + 0.0000001f);
 	float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
@@ -270,7 +268,7 @@ __global__ void preprocessCUDA(int P, int D, int M,
 	}
 	// Compute 2D screen-space covariance matrix
 	float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix);
-	
+
 	// Invert covariance (EWA algorithm)
 	float det = (cov.x * cov.z - cov.y * cov.y);
 	if (det == 0.0f)
@@ -295,7 +293,7 @@ __global__ void preprocessCUDA(int P, int D, int M,
 	// If colors have been precomputed, use them, otherwise convert
 	// spherical harmonics coefficients to RGB color.
 	if (colors_precomp == nullptr)
-	{	
+	{
 		glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped);
 		rgb[idx * C + 0] = result.x;
 		rgb[idx * C + 1] = result.y;
diff --git a/diff_gaussian_rasterization/__init__.py b/diff_gaussian_rasterization/__init__.py
index 41b6a43..7667063 100644
--- a/diff_gaussian_rasterization/__init__.py
+++ b/diff_gaussian_rasterization/__init__.py
@@ -115,8 +115,6 @@ def forward(
         # means2D = torch.cat((means2D, means2D_pad), dim = 1).contiguous()
         return means2D, rgb, conic_opacity, radii, depths
 
-
-
     @staticmethod # TODO: gradient for conic_opacity is tricky. because cuda render backward generate dL_dconic and dL_dopacity sperately. 
     def backward(ctx, grad_means2D, grad_rgb, grad_conic_opacity, grad_radii, grad_depths):
         # grad_radii, grad_depths should be all None. 

From 25c6812f1eb4b654588df3f6d1df664528c256bd Mon Sep 17 00:00:00 2001
From: Sandeep Menon <menonsandu@gmail.com>
Date: Sat, 27 Apr 2024 21:11:10 -0400
Subject: [PATCH 50/53] Refactor computeColorFromSH function in forward.cu to
 use point_idx and result_idx instead of only idx.

---
 cuda_rasterizer/forward.cu | 69 +++++---------------------------------
 1 file changed, 8 insertions(+), 61 deletions(-)

diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index fd1732a..5f83847 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -18,16 +18,16 @@ namespace cg = cooperative_groups;
 
 // Forward method for converting the input spherical harmonics
 // coefficients of each Gaussian to a simple RGB color.
-__device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped)
+__device__ glm::vec3 computeColorFromSH(int point_idx, int result_idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped)
 {
 	// The implementation is loosely based on code for 
 	// "Differentiable Point-Based Radiance Fields for 
 	// Efficient View Synthesis" by Zhang et al. (2022)
-	glm::vec3 pos = means[idx];
+	glm::vec3 pos = means[point_idx];
 	glm::vec3 dir = pos - campos;
 	dir = dir / glm::length(dir);
 
-	glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs;
+	glm::vec3* sh = ((glm::vec3*)shs) + point_idx * max_coeffs;
 	glm::vec3 result = SH_C0 * sh[0];
 
 	if (deg > 0)
@@ -65,62 +65,9 @@ __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const
 
 	// RGB colors are clamped to positive values. If values are
 	// clamped, we need to keep track of this for the backward pass.
-	clamped[3 * idx + 0] = (result.x < 0);
-	clamped[3 * idx + 1] = (result.y < 0);
-	clamped[3 * idx + 2] = (result.z < 0);
-	return glm::max(result, 0.0f);
-}
-__device__ glm::vec3 computeColorFromSHBatched(int idx, int view_idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped)
-{
-	// The implementation is loosely based on code for 
-	// "Differentiable Point-Based Radiance Fields for 
-	// Efficient View Synthesis" by Zhang et al. (2022)
-	glm::vec3 pos = means[idx];
-	glm::vec3 dir = pos - campos;
-	dir = dir / glm::length(dir);
-
-	glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs;
-	
-	glm::vec3 result = SH_C0 * sh[0];
-	
-	if (deg > 0)
-	{
-		float x = dir.x;
-		float y = dir.y;
-		float z = dir.z;
-		result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3];
-
-		if (deg > 1)
-		{
-			float xx = x * x, yy = y * y, zz = z * z;
-			float xy = x * y, yz = y * z, xz = x * z;
-			result = result +
-				SH_C2[0] * xy * sh[4] +
-				SH_C2[1] * yz * sh[5] +
-				SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] +
-				SH_C2[3] * xz * sh[7] +
-				SH_C2[4] * (xx - yy) * sh[8];
-
-			if (deg > 2)
-			{
-				result = result +
-					SH_C3[0] * y * (3.0f * xx - yy) * sh[9] +
-					SH_C3[1] * xy * z * sh[10] +
-					SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] +
-					SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] +
-					SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] +
-					SH_C3[5] * z * (xx - yy) * sh[14] +
-					SH_C3[6] * x * (xx - 3.0f * yy) * sh[15];
-			}
-		}
-	}
-	result += 0.5f;
-
-	// RGB colors are clamped to positive values. If values are
-	// clamped, we need to keep track of this for the backward pass.
-	clamped[3 * view_idx + 0] = (result.x < 0);
-	clamped[3 * view_idx + 1] = (result.y < 0);
-	clamped[3 * view_idx + 2] = (result.z < 0);
+	clamped[3 * result_idx + 0] = (result.x < 0);
+	clamped[3 * result_idx + 1] = (result.y < 0);
+	clamped[3 * result_idx + 2] = (result.z < 0);
 	return glm::max(result, 0.0f);
 }
 
@@ -294,7 +241,7 @@ __global__ void preprocessCUDA(int P, int D, int M,
 	// spherical harmonics coefficients to RGB color.
 	if (colors_precomp == nullptr)
 	{
-		glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped);
+		glm::vec3 result = computeColorFromSH(idx, idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped);
 		rgb[idx * C + 0] = result.x;
 		rgb[idx * C + 1] = result.y;
 		rgb[idx * C + 2] = result.z;
@@ -631,7 +578,7 @@ __global__ void preprocessCUDABatched(
 	
     if (colors_precomp == nullptr) {
 		
-        glm::vec3 result = computeColorFromSHBatched(point_idx,idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped);
+        glm::vec3 result = computeColorFromSH(point_idx, idx, D, M, (glm::vec3*)orig_points, cam_pos[viewpoint_idx], shs, clamped);
         rgb[idx * C + 0] = result.x;
         rgb[idx * C + 1] = result.y;
         rgb[idx * C + 2] = result.z;

From 1b7fdc4bdc4358db42b89f2bc4f1c9979f2438f6 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <snm6477@nyu.edu>
Date: Tue, 7 May 2024 23:33:04 -0400
Subject: [PATCH 51/53] replaced python time with torch event records

---
 rasterization_tests.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/rasterization_tests.py b/rasterization_tests.py
index 7501854..b140aab 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -9,7 +9,7 @@
     GaussianRasterizerBatches,
 )
 
-num_gaussians = 10000
+num_gaussians = 20000
 num_batches=32
 means3D = torch.randn(num_gaussians, 3).cuda()
 scales = torch.randn(num_gaussians, 3).cuda()
@@ -73,8 +73,10 @@ def test_batched_gaussian_rasterizer():
     batched_depths=[]
     batched_rgb=[]
 
-    start_time = time.time()
-    
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    torch.cuda.synchronize()
+    start_event.record()
     for i, (viewpoint_camera, strategy) in enumerate(zip(batched_viewpoint_cameras, batched_strategies)):
         ########## [START] Prepare CUDA Rasterization Settings ##########
         cuda_args = get_cuda_args(strategy, mode)
@@ -124,9 +126,10 @@ def test_batched_gaussian_rasterizer():
         batched_depths.append(depths)
 
 
-    end_time = time.time()
-    preprocess_time = end_time - start_time
-    print(f"Time taken by test_batched_gaussian_rasterizer: {preprocess_time:.4f} seconds")
+    end_event.record()
+    torch.cuda.synchronize()
+    elapsed_time_ms = start_event.elapsed_time(end_event)
+    print(f"Time taken by test_batched_gaussian_rasterizer: {elapsed_time_ms:.4f} ms")
     # Perform further operations with the batched results
     # Test results and performance
    
@@ -141,7 +144,10 @@ def test_batched_gaussian_rasterizer():
     
 def test_batched_gaussian_rasterizer_batch_processing():
     # Set up the input data
-    start_time = time.time()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    torch.cuda.synchronize()  # Drain pending GPU work so it is not counted in the timing
+    start_event.record()
     # Set up the viewpoint cameras
     batched_viewpoint_cameras = []
     for _ in range(num_batches):
@@ -205,9 +211,10 @@ def test_batched_gaussian_rasterizer_batch_processing():
         opacities=opacity,
         batched_cuda_args=batched_cuda_args[0]  #TODO: look into sending list of cuda_args/strategies
     )
-    end_time = time.time()
-    preprocess_time = end_time - start_time
-    print(f"Time taken by test_batched_gaussian_rasterizer_batch_processing: {preprocess_time:.4f} seconds")
+    end_event.record()
+    torch.cuda.synchronize()  # Wait for the events to be recorded!
+    elapsed_time_ms = start_event.elapsed_time(end_event)
+    print(f"Time taken by test_batched_gaussian_rasterizer_batch_processing: {elapsed_time_ms:.4f} ms")
 
     # TODO: make the below work
     # if mode == "train":

From 3c4c66775ed187e19c462d75e30598d70a0d9f94 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <snm6477@nyu.edu>
Date: Wed, 8 May 2024 00:13:11 -0400
Subject: [PATCH 52/53] fixed cuda illegal memory bug and can run for 1M
 gaussians

---
 cuda_rasterizer/rasterizer_impl.cu | 4 ++--
 rasterization_tests.py             | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu
index 06e95db..44d09f4 100644
--- a/cuda_rasterizer/rasterizer_impl.cu
+++ b/cuda_rasterizer/rasterizer_impl.cu
@@ -472,8 +472,8 @@ int CudaRasterizer::Rasterizer::preprocessForwardBatches(
 	// In sep_rendering==True case, we will compute tiles_touched in the renderForward. 
 	// TODO: remove it later by modifying FORWARD::preprocess when we deprecate sep_rendering==False case
 	uint32_t* tiles_touched_temp_buffer;
-	CHECK_CUDA(cudaMalloc(&tiles_touched_temp_buffer, P * sizeof(uint32_t)), debug);
-	CHECK_CUDA(cudaMemset(tiles_touched_temp_buffer, 0, P * sizeof(uint32_t)), debug);
+	CHECK_CUDA(cudaMalloc(&tiles_touched_temp_buffer, num_viewpoints * P * sizeof(uint32_t)), debug);
+	CHECK_CUDA(cudaMemset(tiles_touched_temp_buffer, 0, num_viewpoints * P * sizeof(uint32_t)), debug);
 
 	timer.start("10 preprocess");
 	// Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB)
diff --git a/rasterization_tests.py b/rasterization_tests.py
index b140aab..90afdcd 100644
--- a/rasterization_tests.py
+++ b/rasterization_tests.py
@@ -1,5 +1,4 @@
 import math
-import time
 
 import torch
 
@@ -9,8 +8,8 @@
     GaussianRasterizerBatches,
 )
 
-num_gaussians = 20000
-num_batches=32
+num_gaussians = 1000000
+num_batches=64
 means3D = torch.randn(num_gaussians, 3).cuda()
 scales = torch.randn(num_gaussians, 3).cuda()
 rotations = torch.randn(num_gaussians, 4).cuda()

From 1e4cbc9845807daef10358252322d9314d56c4c2 Mon Sep 17 00:00:00 2001
From: Sandeep Menon <snm6477@nyu.edu>
Date: Wed, 8 May 2024 00:15:59 -0400
Subject: [PATCH 53/53] chore: Update .gitignore to ignore *.pyc files

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index aa1ae78..77db517 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@ diff_gaussian_rasterization.egg-info/
 dist/
 diff_gaussian_rasterization/__pycache__/
 *so
+*.pyc
\ No newline at end of file