mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
cann: Add host buffer type for Ascend NPU (#9406)
Some checks failed
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Waiting to run
Python check requirements.txt / check-requirements (push) Has been cancelled
Some checks failed
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Waiting to run
Python check requirements.txt / check-requirements (push) Has been cancelled
* feat: Add host buffer type for Ascend NPU(CANN backend) * fix some checking errors * Add a few comments
This commit is contained in:
parent
e665744317
commit
e6b7801bd1
@ -80,6 +80,13 @@ ggml_backend_cann_buffer_type(int32_t device);
|
|||||||
*/
|
*/
|
||||||
GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
|
GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Returns the pinned host buffer type, used with the CPU backend for faster copies between CPU and NPU.
|
||||||
|
*
|
||||||
|
* @return A pointer to the host buffer type interface.
|
||||||
|
*/
|
||||||
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Retrieves the description of a specific CANN device.
|
* @brief Retrieves the description of a specific CANN device.
|
||||||
*
|
*
|
||||||
|
@ -1221,6 +1221,116 @@ ggml_backend_cann_buffer_type(int32_t device) {
|
|||||||
return &ggml_backend_cann_buffer_types[device];
|
return &ggml_backend_cann_buffer_types[device];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Retrieves the name associated with a CANN host buffer type.
|
||||||
|
*
|
||||||
|
* This function returns the descriptive name associated with the specified
|
||||||
|
* CANN host buffer type context.
|
||||||
|
*
|
||||||
|
* @param buft Pointer to the host buffer type context.
|
||||||
|
* @return Const pointer to the C-style string containing the name.
|
||||||
|
*/
|
||||||
|
GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||||
|
return "CANN_Host";
|
||||||
|
|
||||||
|
GGML_UNUSED(buft);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Retrieves the name associated with a CANN host buffer.
|
||||||
|
*
|
||||||
|
* This function returns the descriptive name associated with the specified
|
||||||
|
* CANN host buffer context.
|
||||||
|
*
|
||||||
|
* @param buft Pointer to the host buffer context.
|
||||||
|
* @return Const pointer to the C-style string containing the name.
|
||||||
|
*/
|
||||||
|
GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||||
|
return "CANN_Host";
|
||||||
|
|
||||||
|
GGML_UNUSED(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Free resources associated with a CANN host buffer.
|
||||||
|
*
|
||||||
|
* This function frees the resources associated with a CANN host buffer, including
|
||||||
|
* its context.
|
||||||
|
*
|
||||||
|
* @param buffer The CANN host buffer to free.
|
||||||
|
*/
|
||||||
|
GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
|
||||||
|
ACL_CHECK(aclrtFreeHost(buffer->context));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Allocates a new CANN host buffer of the specified size.
|
||||||
|
*
|
||||||
|
* This function allocates a new CANN host buffer with the given size.
|
||||||
|
* @param size Size in bytes of the host buffer to allocate.
|
||||||
|
* @return Pointer to the allocated host buffer, or nullptr if allocation fails.
|
||||||
|
*/
|
||||||
|
static void * ggml_cann_host_malloc(size_t size) {
|
||||||
|
if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
void * hostPtr = nullptr;
|
||||||
|
aclError err = aclrtMallocHost((void **) &hostPtr, size);
|
||||||
|
if (err != ACL_SUCCESS) {
|
||||||
|
|
||||||
|
GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
||||||
|
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
return hostPtr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Allocates a new CANN host buffer of the specified type and size.
|
||||||
|
*
|
||||||
|
* @param buft Pointer to the host buffer type context.
|
||||||
|
* @param size Size in bytes of the host buffer to allocate.
|
||||||
|
* @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
|
||||||
|
*/
|
||||||
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
|
void * hostPtr = ggml_cann_host_malloc(size);
|
||||||
|
|
||||||
|
if (hostPtr == nullptr) {
|
||||||
|
// fallback to cpu buffer
|
||||||
|
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
|
||||||
|
buffer->buft = buft;
|
||||||
|
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
|
||||||
|
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
|
||||||
|
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Interface for managing CANN host buffer types in the GGML backend.
|
||||||
|
*
|
||||||
|
* Provides function pointers for allocating, querying properties, and managing
|
||||||
|
* memory for CANN buffer types in the GGML backend.
|
||||||
|
*/
|
||||||
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
|
||||||
|
static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
|
||||||
|
/* .iface = */ {
|
||||||
|
/* .get_name = */ ggml_backend_cann_host_buffer_type_name,
|
||||||
|
/* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
|
||||||
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||||
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||||
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||||
|
},
|
||||||
|
/* .context = */ nullptr,
|
||||||
|
};
|
||||||
|
|
||||||
|
return &ggml_backend_cann_buffer_type_host;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Computes the forward operation for a given tensor using CANN
|
* @brief Computes the forward operation for a given tensor using CANN
|
||||||
* operations.
|
* operations.
|
||||||
|
@ -2156,6 +2156,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
|
|||||||
if (host_buffer) {
|
if (host_buffer) {
|
||||||
buft = ggml_backend_sycl_host_buffer_type();
|
buft = ggml_backend_sycl_host_buffer_type();
|
||||||
}
|
}
|
||||||
|
#elif defined(GGML_USE_CANN)
|
||||||
|
if (host_buffer) {
|
||||||
|
buft = ggml_backend_cann_host_buffer_type();
|
||||||
|
}
|
||||||
#elif defined(GGML_USE_CPU_HBM)
|
#elif defined(GGML_USE_CPU_HBM)
|
||||||
buft = ggml_backend_cpu_hbm_buffer_type();
|
buft = ggml_backend_cpu_hbm_buffer_type();
|
||||||
#elif defined(GGML_USE_VULKAN)
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
Loading…
Reference in New Issue
Block a user