Skip to content
This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Commit 923d05a

Browse files
tlrmchlsmth authored and robertgshaw2-redhat committed
[Kernel] Fix CUTLASS 3.x custom broadcast load epilogue (vllm-project#5516)
1 parent cab4a5d commit 923d05a

File tree

2 files changed

+2
-4
lines changed

2 files changed

+2
-4
lines changed

csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ struct Sm90RowOrScalarBroadcast {
153153

154154
CUTLASS_DEVICE void
155155
begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) {
156-
if (params.ptr_row == nullptr) {
156+
if (!params.row_broadcast) {
157157
return;
158158
}
159159

vllm/model_executor/layers/quantization/fp8.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -257,9 +257,7 @@ def apply(self,
257257
# If dynamic, layer.input_scale is None and x_scale computed from x.
258258
# If static, layer.input_scale is scalar and x_scale is input_scale.
259259

260-
# Temporarily disable CUTLASS kernels due to an illegal memory access
261-
#if bias is None and self.cutlass_fp8_supported:
262-
if False:
260+
if bias is None and self.cutlass_fp8_supported:
263261
qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
264262

265263
# Fused GEMM_DQ

0 commit comments

Comments (0)