Skip to content
This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Commit 923d05a

Browse files
tlrmchlsmth authored and robertgshaw2-redhat committed
[Kernel] Fix CUTLASS 3.x custom broadcast load epilogue (vllm-project#5516)
1 parent cab4a5d commit 923d05a

File tree

2 files changed

+2
-4
lines changed

2 files changed

+2
-4
lines changed

csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ struct Sm90RowOrScalarBroadcast {
153153

154154
CUTLASS_DEVICE void
155155
begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) {
156-
if (params.ptr_row == nullptr) {
156+
if (!params.row_broadcast) {
157157
return;
158158
}
159159

vllm/model_executor/layers/quantization/fp8.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -257,9 +257,7 @@ def apply(self,
257257
# If dynamic, layer.input_scale is None and x_scale computed from x.
258258
# If static, layer.input_scale is scalar and x_scale is input_scale.
259259

260-
# Temporarily disable CUTLASS kernels due to an illegal memory access
261-
#if bias is None and self.cutlass_fp8_supported:
262-
if False:
260+
if bias is None and self.cutlass_fp8_supported:
263261
qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
264262

265263
# Fused GEMM_DQ

0 commit comments

Comments (0)