
Commit 5eb17f0

YellowRoseCx, ardfork, funnbot, Engininja2, KerfuffleV2 committed

ROCm Port update

* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (ggml-org#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-Authored-By: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com>
Co-Authored-By: ardfork <134447697+ardfork@users.noreply.github.com>
Co-Authored-By: funnbot <22226942+funnbot@users.noreply.github.com>
Co-Authored-By: Engininja2 <139037756+Engininja2@users.noreply.github.com>
Co-Authored-By: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Co-Authored-By: jammm <2500920+jammm@users.noreply.github.com>
Co-Authored-By: jdecourval <7315817+jdecourval@users.noreply.github.com>
1 parent b34f4bd commit 5eb17f0
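
The core of the port is that the existing CUDA sources are compiled as HIP rather than rewritten: with GGML_USE_HIPBLAS (alongside GGML_USE_CUBLAS) defined, CUDA runtime and cuBLAS names are mapped onto their HIP/hipBLAS counterparts. The macro list itself lives in ggml-cuda.cu and is not part of the hunks shown below; the following is only a sketch of the pattern, with a representative, assumed subset of names.

// Illustrative only -- not the commit's exact macro list. Shows how "use
// hipblas based on cublas" can work: when building as HIP, alias the CUDA
// runtime and cuBLAS identifiers used throughout ggml-cuda.cu to their HIP
// equivalents, so the same source compiles for both backends.
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#define cudaError_t            hipError_t
#define cudaSuccess            hipSuccess
#define cudaGetErrorString     hipGetErrorString
#define cudaMalloc             hipMalloc
#define cudaFree               hipFree
#define cudaMemcpyAsync        hipMemcpyAsync
#define cudaDeviceSynchronize  hipDeviceSynchronize
#define cudaStream_t           hipStream_t
#define cublasHandle_t         hipblasHandle_t
#define cublasCreate           hipblasCreate
#define cublasSgemm            hipblasSgemm
#else
#include <cuda_runtime.h>
#include <cublas_v2.h>
#endif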

File tree

7 files changed: +31 −25 lines changed

.gitignore (+1 −14)

@@ -12,20 +12,7 @@
 .vs/
 .vscode/
 
-build/
-build-em/
-build-debug/
-build-release/
-build-ci-debug/
-build-ci-release/
-build-static/
-build-cublas/
-build-opencl/
-build-metal/
-build-mpi/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build*/
 out/
 tmp/

CMakeLists.txt (+7 −2)

@@ -137,16 +137,18 @@ if (LLAMA_HIPBLAS)
 
 find_package(hip)
 find_package(hipblas)
+find_package(rocblas)
 
 if (${hipblas_FOUND} AND ${hip_FOUND})
 message(STATUS "HIP and hipBLAS found")
 add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
 add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
 target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
-target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
 target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
 set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
-target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas)
+target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
 
 if (LLAMA_STATIC)
 message(FATAL_ERROR "Static linking not supported for HIP/ROCm")

@@ -175,6 +177,9 @@ if (LLAMA_HIPBLAS)
 message(STATUS "HIP and hipBLAS found")
 add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
 add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
+if (LLAMA_CUDA_FORCE_DMMV)
+    target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
+endif()
 target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
 target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
 target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
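
The CC_TURING=1000000000 definition above is the "Allow overriding CC_TURING" item from the commit message: CC_TURING is the compute-capability threshold the CUDA code compares against before taking Turing-specific kernel paths, and forcing it to an unreachably large value keeps ROCm builds on the generic paths. The check itself is in ggml-cuda.cu and is not shown in this diff; the snippet below is a hedged sketch of the gating pattern, and the default value of 700 is an assumption.

// Hedged sketch of how CC_TURING gates kernel selection; the real check lives
// in ggml-cuda.cu and its default value may differ.
#ifndef CC_TURING
#define CC_TURING 700   // assumed default for NVIDIA builds
#endif

// With CC_TURING forced to 1000000000 on ROCm, this is always false and the
// generic (pre-Turing) kernels are chosen.
static bool mul_mat_use_turing_path(const int compute_capability) {
    return compute_capability >= CC_TURING;
}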

Makefile (+6 −2)

@@ -194,14 +194,17 @@ ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-l
 endif # LLAMA_CUBLAS
 
 ifdef LLAMA_HIPBLAS
-ROCM_PATH ?= /opt/rocm
+ROCM_PATH ?= /opt/rocm
 CC := $(ROCM_PATH)/llvm/bin/clang
 CXX := $(ROCM_PATH)/llvm/bin/clang++
-GPU_TARGETS = gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100
+GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100
 LLAMA_CUDA_DMMV_X ?= 128
 LLAMA_CUDA_MMV_Y ?= 2
 LLAMA_CUDA_KQUANTS_ITER ?= 1
 HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
+ifdef LLAMA_CUDA_FORCE_DMMV
+HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
+endif # LLAMA_CUDA_FORCE_DMMV
 HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
 HIP_OBJS += ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 ggml-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \

@@ -228,6 +231,7 @@ ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-l
 endif # LLAMA_HIPBLAS
 
 
+
 ifdef LLAMA_METAL
 CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 CXXFLAGS += -DGGML_USE_METAL

examples/common.cpp (+7 −5)

@@ -597,11 +597,13 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 fprintf(stdout, " number of layers to store in VRAM\n");
 fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
 fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
-fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
-fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
-fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
-fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
+fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
+fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
+#ifdef GGML_USE_CUBLAS
+fprintf(stdout, " -nommq, --no-mul-mat-q\n");
+fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
+fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
+#endif // GGML_USE_CUBLAS
 #endif
 fprintf(stdout, " --mtest compute maximum memory usage\n");
 fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
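
The new help text relies on C's adjacent string-literal concatenation: GGML_CUBLAS_NAME and GGML_CUDA_NAME are plain string literals (defined in the ggml-cuda.h hunk further below), so the whole message is folded into a single literal at compile time. A minimal, self-contained sketch of the effect, with the macro definitions copied from that hunk:

// Minimal sketch of the compile-time name selection used above.
#include <cstdio>

// Copied from the ggml-cuda.h hunk below; on a HIP build the names resolve to
// "hipBLAS" / "ROCm", otherwise to "cuBLAS" / "CUDA".
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME   "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#else
#define GGML_CUDA_NAME   "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
#endif

int main() {
    // Adjacent literals concatenate, so this is one format string at compile
    // time, e.g. "use hipBLAS instead of custom mul_mat_q ROCm kernels." on ROCm.
    std::fprintf(stdout, "use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
    return 0;
}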

ggml-cuda.cu (+1 −1)

@@ -4746,7 +4746,7 @@ void ggml_init_cublas() {
 CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
 GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
 int64_t total_vram = 0;
-fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
 for (int id = 0; id < g_device_count; ++id) {
 cudaDeviceProp prop;
 CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
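
Only the log-string change in ggml_init_cublas() is visible here, but ggml-cuda.cu is also where the commit's "intrinsics polyfills for AMD" and "AMD assembly optimized __dp4a" items land. A rough sketch of what such a polyfill can look like follows; the architecture guards, the builtin usage, and the scalar fallback are assumptions for illustration, not the commit's exact code, and it assumes <cstdint> is available for int8_t.

// Rough sketch (not the commit's exact code) of a __dp4a polyfill for HIP:
// CUDA's __dp4a(a, b, c) computes a 4-way int8 dot product accumulated into c.
// On GPUs with a packed-dot instruction the amdgcn builtin can be used;
// elsewhere fall back to scalar math.
#if defined(GGML_USE_HIPBLAS)
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#else
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    c += a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif
    return c;
}
#endif // GGML_USE_HIPBLAS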

ggml-cuda.h (+8 −0)

@@ -2,6 +2,14 @@
 
 #include "ggml.h"
 
+#ifdef GGML_USE_HIPBLAS
+#define GGML_CUDA_NAME "ROCm"
+#define GGML_CUBLAS_NAME "hipBLAS"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#define GGML_CUBLAS_NAME "cuBLAS"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif

llama.cpp (+1 −1)

@@ -1187,7 +1187,7 @@ static void llama_model_load_internal(
 (void) main_gpu;
 (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
-LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
+LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
 ggml_cuda_set_main_device(main_gpu);
 ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
