mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 19:21:46 +00:00
cuda : add LLAMA_CUDA_NO_PEER_COPY to workaround broken ROCm p2p copy (#6208)
* cuda : add LLAMA_CUDA_NO_PEER_COPY to workaround broken ROCm p2p copy * add LLAMA_CUDA_NO_PEER_COPY to HIP build
This commit is contained in:
parent
29ab270e65
commit
2f0e81e053
@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
|
|||||||
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
||||||
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||||
"llama: max. batch size for using peer access")
|
"llama: max. batch size for using peer access")
|
||||||
|
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
|
||||||
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
|
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
|
||||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||||
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
|
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
|
||||||
@ -387,6 +388,9 @@ if (LLAMA_CUBLAS)
|
|||||||
endif()
|
endif()
|
||||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
|
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
|
||||||
|
if (LLAMA_CUDA_NO_PEER_COPY)
|
||||||
|
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
|
||||||
|
endif()
|
||||||
|
|
||||||
if (LLAMA_STATIC)
|
if (LLAMA_STATIC)
|
||||||
if (WIN32)
|
if (WIN32)
|
||||||
@ -531,6 +535,10 @@ if (LLAMA_HIPBLAS)
|
|||||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (LLAMA_CUDA_NO_PEER_COPY)
|
||||||
|
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
|
||||||
|
endif()
|
||||||
|
|
||||||
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||||
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||||
|
9
Makefile
9
Makefile
@ -452,9 +452,9 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
|||||||
else
|
else
|
||||||
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
|
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
|
||||||
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
||||||
#ifdef LLAMA_CUDA_CUBLAS
|
ifdef LLAMA_CUDA_NO_PEER_COPY
|
||||||
# MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
|
MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
|
||||||
#endif # LLAMA_CUDA_CUBLAS
|
endif # LLAMA_CUDA_NO_PEER_COPY
|
||||||
ifdef LLAMA_CUDA_CCBIN
|
ifdef LLAMA_CUDA_CCBIN
|
||||||
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
||||||
endif
|
endif
|
||||||
@ -535,6 +535,9 @@ endif # LLAMA_HIP_UMA
|
|||||||
ifdef LLAMA_CUDA_FORCE_DMMV
|
ifdef LLAMA_CUDA_FORCE_DMMV
|
||||||
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
|
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||||
endif # LLAMA_CUDA_FORCE_DMMV
|
endif # LLAMA_CUDA_FORCE_DMMV
|
||||||
|
ifdef LLAMA_CUDA_NO_PEER_COPY
|
||||||
|
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
|
||||||
|
endif # LLAMA_CUDA_NO_PEER_COPY
|
||||||
OBJS += ggml-cuda.o
|
OBJS += ggml-cuda.o
|
||||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||||
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
||||||
|
28
ggml-cuda.cu
28
ggml-cuda.cu
@ -771,7 +771,11 @@ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t
|
|||||||
if (src_ctx->device == dst_ctx->device) {
|
if (src_ctx->device == dst_ctx->device) {
|
||||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
|
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
|
||||||
} else {
|
} else {
|
||||||
|
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||||
|
return false;
|
||||||
|
#else
|
||||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
|
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||||
return true;
|
return true;
|
||||||
@ -11322,19 +11326,23 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
|||||||
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
|
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
|
||||||
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
|
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
|
||||||
|
|
||||||
|
// copy on src stream
|
||||||
|
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
|
||||||
|
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
|
||||||
|
} else {
|
||||||
|
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||||
|
return false;
|
||||||
|
#else
|
||||||
|
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// record event on src stream
|
||||||
if (!cuda_ctx_src->copy_event) {
|
if (!cuda_ctx_src->copy_event) {
|
||||||
ggml_cuda_set_device(cuda_ctx_src->device);
|
ggml_cuda_set_device(cuda_ctx_src->device);
|
||||||
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
|
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy on src stream
|
|
||||||
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
|
|
||||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
|
|
||||||
} else {
|
|
||||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
|
|
||||||
}
|
|
||||||
|
|
||||||
// record event on src stream
|
|
||||||
CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
|
CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
|
||||||
|
|
||||||
// wait on dst stream for the copy to complete
|
// wait on dst stream for the copy to complete
|
||||||
@ -11530,6 +11538,9 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const
|
|||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
|
static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
|
||||||
|
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||||
|
return nullptr;
|
||||||
|
#else
|
||||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
ggml_cuda_set_device(cuda_ctx->device);
|
ggml_cuda_set_device(cuda_ctx->device);
|
||||||
@ -11541,6 +11552,7 @@ static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend)
|
|||||||
/* .backend = */ backend,
|
/* .backend = */ backend,
|
||||||
/* .context = */ event,
|
/* .context = */ event,
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
|
static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
|
||||||
|
Loading…
Reference in New Issue
Block a user