mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 19:21:46 +00:00
ggml: Add POOL2D OP for GPU acceleration to the Vulkan backend in the MobileVLM model. (#9763)
Some checks are pending
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-musa.Dockerfile platforms:linux/amd64 tag:full-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-musa.Dockerfile platforms:linux/amd64 tag:light-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-musa.Dockerfile platforms:linux/amd64 tag:server-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Waiting to run
Some checks are pending
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-musa.Dockerfile platforms:linux/amd64 tag:full-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-musa.Dockerfile platforms:linux/amd64 tag:light-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-musa.Dockerfile platforms:linux/amd64 tag:server-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Waiting to run
* ggml: Add the POOL2D op for GPU acceleration to the Vulkan backend. - The MobileVLM model now supports GPU-accelerated inference through the Vulkan backend. - A GGML_OP_POOL_2D (pooling) shader has been added. - The encoding time of the CLIP model improved from 2.8s on the CPU to 0.7s on the GPU. Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com> * [fix] Correct the order of the parameters and fix the cast to int. Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com> --------- Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com>
This commit is contained in:
parent
8d8ff71536
commit
8f275a7c45
@ -213,6 +213,7 @@ struct vk_device_struct {
|
|||||||
vk_pipeline pipeline_sum_rows_f32;
|
vk_pipeline pipeline_sum_rows_f32;
|
||||||
vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
|
vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
|
||||||
vk_pipeline pipeline_timestep_embedding_f32;
|
vk_pipeline pipeline_timestep_embedding_f32;
|
||||||
|
vk_pipeline pipeline_pool2d_f32;
|
||||||
|
|
||||||
std::unordered_map<std::string, vk_pipeline_ref> pipelines;
|
std::unordered_map<std::string, vk_pipeline_ref> pipelines;
|
||||||
std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
|
std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
|
||||||
@ -403,6 +404,17 @@ struct vk_op_timestep_embedding_push_constants {
|
|||||||
uint32_t max_period;
|
uint32_t max_period;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Push constants for the pool2d_f32 compute pipeline.
//
// The member order and widths mirror the `parameter` push_constant block in
// vulkan-shaders/pool2d.comp, and ggml_vk_pool_2d fills this struct
// positionally, so members must not be reordered or resized.
struct vk_op_pool2d_push_constants {
    uint32_t IW;         // input width  (src0->ne[0])
    uint32_t IH;         // input height (src0->ne[1])
    uint32_t OW;         // output width  (dst->ne[0])
    uint32_t OH;         // output height (dst->ne[1])
    uint32_t OC;         // dst->ne[2]
    uint32_t pelements;  // total number of output elements to produce
    uint32_t op;         // pooling mode, taken from dst->op_params[0]
    int32_t  k0;         // window size the shader applies along IH
    int32_t  k1;         // window size the shader applies along IW
    int32_t  s0;         // stride the shader applies along IH
    int32_t  s1;         // stride the shader applies along IW
    int32_t  p0;         // padding the shader applies along IH
    int32_t  p1;         // padding the shader applies along IW
};

// The shader reads thirteen tightly packed 32-bit scalars; catch accidental
// layout drift at compile time instead of at dispatch time.
static_assert(sizeof(vk_op_pool2d_push_constants) == 13 * sizeof(uint32_t),
              "vk_op_pool2d_push_constants must match the push_constant block in pool2d.comp");
|
||||||
|
|
||||||
// Allow pre-recording command buffers
|
// Allow pre-recording command buffers
|
||||||
struct vk_staging_memcpy {
|
struct vk_staging_memcpy {
|
||||||
vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
|
vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
|
||||||
@ -1803,6 +1815,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
||||||
|
|
||||||
|
ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
|
||||||
for (auto &c : compiles) {
|
for (auto &c : compiles) {
|
||||||
c.wait();
|
c.wait();
|
||||||
}
|
}
|
||||||
@ -4234,6 +4248,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|||||||
return ctx->device->pipeline_timestep_embedding_f32;
|
return ctx->device->pipeline_timestep_embedding_f32;
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
case GGML_OP_POOL_2D:
|
||||||
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||||
|
return ctx->device->pipeline_pool2d_f32;
|
||||||
|
}
|
||||||
|
return nullptr;
|
||||||
case GGML_OP_LEAKY_RELU:
|
case GGML_OP_LEAKY_RELU:
|
||||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||||
return ctx->device->pipeline_leaky_relu_f32;
|
return ctx->device->pipeline_leaky_relu_f32;
|
||||||
@ -4464,6 +4483,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||||||
uint32_t half_ceil = (dim + 1) / 2;
|
uint32_t half_ceil = (dim + 1) / 2;
|
||||||
elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
|
elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_POOL_2D:
|
||||||
|
{
|
||||||
|
const uint32_t N = dst->ne[3];
|
||||||
|
const uint32_t OC = dst->ne[2];
|
||||||
|
const uint32_t OH = dst->ne[1];
|
||||||
|
const uint32_t OW = dst->ne[0];
|
||||||
|
elements = { N * OC * OH * OW, 1, 1};
|
||||||
|
} break;
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
case GGML_OP_DIV:
|
case GGML_OP_DIV:
|
||||||
case GGML_OP_MUL:
|
case GGML_OP_MUL:
|
||||||
@ -4914,6 +4941,34 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
|
|||||||
}, dryrun);
|
}, dryrun);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Issues the pool2d_f32 dispatch for a GGML_OP_POOL_2D node via ggml_vk_op_f32.
//
// dst->op_params is read as: [0] pooling mode, [1]/[2] kernel sizes,
// [3]/[4] strides, [5]/[6] paddings. Within each pair the entries are handed to
// the push constants in swapped order (the "0" slot receives the second entry,
// the "1" slot the first), because pool2d.comp applies k0/s0/p0 along IH and
// k1/s1/p1 along IW.
static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
    const auto & params = dst->op_params;

    const uint32_t pool_op = static_cast<uint32_t>(params[0]);

    const int32_t kernel_1 = params[1];
    const int32_t kernel_0 = params[2];
    const int32_t stride_1 = params[3];
    const int32_t stride_0 = params[4];
    const int32_t pad_1    = params[5];
    const int32_t pad_0    = params[6];

    // Input plane extents.
    const uint32_t in_w = src0->ne[0];
    const uint32_t in_h = src0->ne[1];

    // Output extents.
    const uint32_t out_w    = dst->ne[0];
    const uint32_t out_h    = dst->ne[1];
    const uint32_t channels = dst->ne[2];
    const uint32_t batch    = dst->ne[3];

    // One shader invocation per output element.
    const uint32_t total_outputs = batch * channels * out_h * out_w;

    ggml_vk_op_f32<vk_op_pool2d_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_POOL_2D, {
        in_w, in_h, out_w, out_h, channels,
        total_outputs,
        pool_op,
        kernel_0, kernel_1, stride_0, stride_1, pad_0, pad_1,
    }, dryrun);
}
|
||||||
|
|
||||||
static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||||
const float * op_params = (const float *)dst->op_params;
|
const float * op_params = (const float *)dst->op_params;
|
||||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
|
||||||
@ -5792,6 +5847,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|||||||
case GGML_OP_SUM_ROWS:
|
case GGML_OP_SUM_ROWS:
|
||||||
case GGML_OP_IM2COL:
|
case GGML_OP_IM2COL:
|
||||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
|
case GGML_OP_POOL_2D:
|
||||||
case GGML_OP_LEAKY_RELU:
|
case GGML_OP_LEAKY_RELU:
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
@ -5927,6 +5983,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|||||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
|
ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
|
||||||
|
|
||||||
|
break;
|
||||||
|
case GGML_OP_POOL_2D:
|
||||||
|
ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case GGML_OP_LEAKY_RELU:
|
case GGML_OP_LEAKY_RELU:
|
||||||
ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);
|
ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);
|
||||||
@ -6018,6 +6078,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
|
|||||||
case GGML_OP_SUM_ROWS:
|
case GGML_OP_SUM_ROWS:
|
||||||
case GGML_OP_IM2COL:
|
case GGML_OP_IM2COL:
|
||||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
|
case GGML_OP_POOL_2D:
|
||||||
case GGML_OP_LEAKY_RELU:
|
case GGML_OP_LEAKY_RELU:
|
||||||
case GGML_OP_REPEAT:
|
case GGML_OP_REPEAT:
|
||||||
buf = tensor->buffer;
|
buf = tensor->buffer;
|
||||||
@ -6821,6 +6882,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|||||||
case GGML_OP_SUM_ROWS:
|
case GGML_OP_SUM_ROWS:
|
||||||
case GGML_OP_IM2COL:
|
case GGML_OP_IM2COL:
|
||||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
|
case GGML_OP_POOL_2D:
|
||||||
case GGML_OP_LEAKY_RELU:
|
case GGML_OP_LEAKY_RELU:
|
||||||
return true;
|
return true;
|
||||||
default:
|
default:
|
||||||
@ -7334,6 +7396,16 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
|
|||||||
const int32_t dim = tensor->op_params[0];
|
const int32_t dim = tensor->op_params[0];
|
||||||
const int32_t max_period = tensor->op_params[1];
|
const int32_t max_period = tensor->op_params[1];
|
||||||
tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
|
tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
|
||||||
|
} else if (tensor->op == GGML_OP_POOL_2D) {
    // Re-create the POOL_2D node on the cloned source from the parameters
    // stored in tensor->op_params: [0] pooling mode, then k0, k1, s0, s1, p0, p1.
    // The pooling mode is read from `tensor`, like every other parameter here;
    // there is no `dst` in this function.
    const enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
    const int32_t k0 = tensor->op_params[1];
    const int32_t k1 = tensor->op_params[2];
    const int32_t s0 = tensor->op_params[3];
    const int32_t s1 = tensor->op_params[4];
    const int32_t p0 = tensor->op_params[5];
    const int32_t p1 = tensor->op_params[6];

    tensor_clone = ggml_pool_2d(ggml_ctx, src0_clone, op, k0, k1, s0, s1, p0, p1);
|
||||||
} else if (tensor->op == GGML_OP_LEAKY_RELU) {
|
} else if (tensor->op == GGML_OP_LEAKY_RELU) {
|
||||||
const float * op_params = (const float *)tensor->op_params;
|
const float * op_params = (const float *)tensor->op_params;
|
||||||
tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
|
tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
|
||||||
|
74
ggml/src/vulkan-shaders/pool2d.comp
Normal file
74
ggml/src/vulkan-shaders/pool2d.comp
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
#version 450
|
||||||
|
|
||||||
|
#include "types.comp"
|
||||||
|
|
||||||
|
#extension GL_EXT_shader_16bit_storage : require
|
||||||
|
|
||||||
|
// Per-dispatch parameters pushed by the host. Every member is a 32-bit scalar;
// the order must stay in sync with the host-side push-constant struct.
layout(push_constant) uniform parameter {
    uint IW;         // input row length; column bound and row stride in main()
    uint IH;         // input row count; row bound in main()
    uint OW;         // output row length
    uint OH;         // output row count
    uint OC;         // not read by main() in this shader
    uint pelements;  // invocations with an index >= pelements exit immediately
    uint op;         // compared against OP_POOL_MAX / OP_POOL_AVG
    int k0;          // window extent along the IH axis
    int k1;          // window extent along the IW axis
    int s0;          // stride along the IH axis
    int s1;          // stride along the IW axis
    int p0;          // padding subtracted from the window start along IH
    int p1;          // padding subtracted from the window start along IW
} p;
|
||||||
|
|
||||||
|
#define BLOCK_SIZE 512
|
||||||
|
#define FLT_MAX 3.402823466e+38F
|
||||||
|
#define OP_POOL_MAX 0u
|
||||||
|
#define OP_POOL_AVG 1u
|
||||||
|
|
||||||
|
layout (local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
layout(binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||||
|
layout(binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
|
// Each invocation produces one output element.
//
// The flat invocation index is split into (plane, output row, output column).
// The pooling window is then clipped to the input bounds and reduced with
// either a running maximum or a scaled sum. For averaging, every sample is
// weighted by 1 / (k0 * k1), i.e. the full window size, even when the window
// has been clipped at an edge.
void main() {
    const uint out_idx = gl_GlobalInvocationID.x;
    if (out_idx >= p.pelements) {
        return;
    }

    // Unknown pooling modes write nothing.
    const bool is_avg = (p.op == OP_POOL_AVG);
    const bool is_max = (p.op == OP_POOL_MAX);
    if (!is_avg && !is_max) {
        return;
    }

    const uint out_plane_size = p.OW * p.OH;

    const uint plane        = out_idx / out_plane_size;
    const uint pos_in_plane = out_idx % out_plane_size;
    const uint out_row      = pos_in_plane / p.OW;
    const uint out_col      = pos_in_plane % p.OW;

    // Window along the IH axis, clipped to [0, IH).
    const int  row_origin = int(out_row) * p.s0 - p.p0;
    const uint row_begin  = max(row_origin, 0);
    const uint row_end    = min(row_origin + p.k0, p.IH);

    // Window along the IW axis, clipped to [0, IW).
    const int  col_origin = int(out_col) * p.s1 - p.p1;
    const uint col_begin  = max(col_origin, 0);
    const uint col_end    = min(col_origin + p.k1, p.IW);

    const float inv_window = 1.0 / float(p.k0 * p.k1);

    // Identity element of the chosen reduction.
    float acc = is_avg ? 0.0 : -FLT_MAX;

    const uint src_plane_base = plane * p.IH * p.IW;

    #pragma unroll
    for (uint row = row_begin; row < row_end; row++) {
        const uint src_row_base = src_plane_base + row * p.IW;

        #pragma unroll
        for (uint col = col_begin; col < col_end; col++) {
            const float sample_val = D_TYPE(data_a[src_row_base + col]);

            if (is_avg) {
                acc += sample_val * inv_window;
            } else {
                acc = max(acc, sample_val);
            }
        }
    }

    data_d[plane * out_plane_size + out_row * p.OW + out_col] = acc;
}
|
@ -493,6 +493,10 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
|
|||||||
tasks.push_back(std::async(std::launch::async, [=] {
|
tasks.push_back(std::async(std::launch::async, [=] {
|
||||||
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
tasks.push_back(std::async(std::launch::async, [=] {
|
||||||
|
string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||||
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
void write_output_files() {
|
void write_output_files() {
|
||||||
|
Loading…
Reference in New Issue
Block a user