@@ -206,7 +206,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
206
206
SET (CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library" )
207
207
208
208
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
209
- set (CUTLASS_REVISION "v3.5.1 " CACHE STRING "CUTLASS revision to use" )
209
+ set (CUTLASS_REVISION "v3.6.0 " CACHE STRING "CUTLASS revision to use" )
210
210
211
211
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
212
212
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -223,13 +223,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
223
223
FetchContent_Declare(
224
224
cutlass
225
225
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
226
- GIT_TAG v3.5.1
226
+ GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227
227
227
GIT_PROGRESS TRUE
228
228
229
229
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
230
230
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
231
231
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
232
- GIT_SHALLOW TRUE
232
+ GIT_SHALLOW FALSE
233
233
)
234
234
endif ()
235
235
FetchContent_MakeAvailable(cutlass)
@@ -241,7 +241,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
241
241
"csrc/quantization/awq/gemm_kernels.cu"
242
242
"csrc/custom_all_reduce.cu"
243
243
"csrc/permute_cols.cu"
244
- "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" )
244
+ "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
245
+ "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
246
+ "csrc/sparse/cutlass/sparse_compressor_entry.cu"
247
+ "csrc/cutlass_extensions/common.cpp" )
245
248
246
249
set_gencode_flags_for_srcs(
247
250
SRCS "${VLLM_EXT_SRC} "
@@ -271,11 +274,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
271
274
endif ()
272
275
273
276
#
274
- # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
277
+ # The cutlass_scaled_mm, cutlass_scaled_sparse_mm, and cutlass_compressor kernels
278
+ # for Hopper (c3x, i.e. CUTLASS 3.x) require
275
279
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
276
280
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS} " )
277
281
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
278
- set (SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" )
282
+ set (SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
283
+ "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
284
+ "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu" )
279
285
set_gencode_flags_for_srcs(
280
286
SRCS "${SRCS} "
281
287
CUDA_ARCHS "${SCALED_MM_3X_ARCHS} " )
@@ -284,12 +290,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
284
290
message (STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS} " )
285
291
else ()
286
292
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
287
- message (STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
293
+ message (STATUS "Not building cutlass_c3x kernels as CUDA Compiler version is "
288
294
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
289
- "later if you intend on running FP8 quantized models on "
295
+ "later if you intend on running FP8 sparse or quantized models on "
290
296
"Hopper." )
291
297
else ()
292
- message (STATUS "Not building scaled_mm_c3x as no compatible archs found "
298
+ message (STATUS "Not building cutlass_c3x as no compatible archs found "
293
299
"in CUDA target architectures" )
294
300
endif ()
295
301
@@ -404,7 +410,7 @@ define_gpu_extension_target(
404
410
SOURCES ${VLLM_EXT_SRC}
405
411
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
406
412
ARCHITECTURES ${VLLM_GPU_ARCHES}
407
- INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
413
+ INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} ; ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
408
414
USE_SABI 3
409
415
WITH_SOABI)
410
416
0 commit comments