Vulkan: Implement accumulator switch for specific mul mat mat shaders
commit a0deeeed28
parent 6f2c49cc25
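This commit gives the quantized mul_mat_mat pipelines a switch between fp16 and fp32 accumulators, chosen at dispatch time from device->fp16, and threads the source tensor type into matmul pipeline selection so F32 matmuls can pick tile sizes by divisibility. The .f16acc/.f32acc accesses in the hunks below imply a pipeline pair shaped roughly like this sketch; the field names are taken from the diff, but the exact struct layout is an assumption, since the definition is outside the hunks shown here:

#include <memory>

struct vk_pipeline_struct;                                // opaque here; defined in ggml-vulkan.cpp
typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;  // matches the file's handle style

// Assumed layout, inferred from the l/m/s and a_l/a_m/a_s accesses in the diff.
struct vk_matmul_pipeline_struct {
    vk_pipeline l, m, s;        // large/medium/small tile variants
    vk_pipeline a_l, a_m, a_s;  // aligned variants (k padded to the tile's alignment)
};
typedef std::shared_ptr<vk_matmul_pipeline_struct> vk_matmul_pipeline;

struct vk_matmul_pipeline2 {
    vk_matmul_pipeline f16acc;  // accumulate in fp16: faster, lower precision
    vk_matmul_pipeline f32acc;  // accumulate in fp32: safer for long dot products
};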
@@ -1779,6 +1779,11 @@ static vk_device ggml_vk_get_device(size_t idx) {

     device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;

+    if (device->vendor_id == VK_VENDOR_ID_INTEL) {
+        // Intel drivers don't support coopmat properly yet
+        device->coopmat_support = false;
+    }
+
     std::vector<vk::QueueFamilyProperties> queue_family_props = device->physical_device.getQueueFamilyProperties();

     // Try to find a non-graphics compute queue and transfer-focused queues
@@ -1945,9 +1950,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
     ggml_vk_create_queue(device, device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }, false);

     // Shaders
-    // Disable matmul tile sizes early if not supported
+    // Disable matmul tile sizes early if performance low or not supported
     switch (device->vendor_id) {
     case VK_VENDOR_ID_AMD:
+    case VK_VENDOR_ID_INTEL:
         device->mul_mat_l = false;
         device->mul_mat_m = true;
         device->mul_mat_s = true;
@@ -1963,14 +1969,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device->mul_mat_id_m = true;
         device->mul_mat_id_s = false;
         break;
-    case VK_VENDOR_ID_INTEL:
-        device->mul_mat_l = false;
-        device->mul_mat_m = false;
-        device->mul_mat_s = true;
-        device->mul_mat_id_l = false;
-        device->mul_mat_id_m = false;
-        device->mul_mat_id_s = true;
-        break;
     default:
         device->mul_mat_l = true;
         device->mul_mat_m = true;
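With its dedicated case deleted above, Intel now falls into the AMD branch, which enables the medium tile path Intel previously had disabled. A condensed restatement of the switch; the vendor ID values are redeclared to keep the sketch standalone, and the default branch's small-tile value is an assumption since it is not visible in this hunk:

#include <cstdint>

// PCI vendor IDs behind the file's VK_VENDOR_ID_* macros (0x1002 = AMD, 0x8086 = Intel).
constexpr uint32_t VENDOR_AMD   = 0x1002;
constexpr uint32_t VENDOR_INTEL = 0x8086;

struct tile_enables { bool l, m, s; };

static tile_enables tiles_for_vendor(uint32_t vendor_id) {
    switch (vendor_id) {
    case VENDOR_AMD:
    case VENDOR_INTEL:
        return { false, true, true };  // skip the large tile, keep medium and small
    default:
        return { true, true, true };   // small-tile value assumed; not shown in the hunk
    }
}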
@@ -2050,6 +2048,11 @@ static void ggml_vk_print_gpu_info(size_t idx) {
         }
     }

+    if (props2.properties.vendorID == VK_VENDOR_ID_INTEL) {
+        // Intel drivers don't support coopmat properly yet
+        coopmat_support = false;
+    }
+
     const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16");
     bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr;

@@ -3025,20 +3028,33 @@ static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int
     return split_k;
 }

-static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
+static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type type_a) {
     VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");

+    // On F32 matmuls, selecting this way increases performance significantly. On quants or fp16, it reduces performance.
+    // Maybe because it reduces checks and uses more vector loads, but why is fp16 worse?
+    if (type_a == GGML_TYPE_F32) {
+        if ((ctx->device->mul_mat_l && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_m && !ctx->device->mul_mat_s)) {
+            return aligned ? mmp->a_l : mmp->l;
+        }
+        if ((ctx->device->mul_mat_m && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_s) {
+            return aligned ? mmp->a_m : mmp->m;
+        }
+        return aligned ? mmp->a_s : mmp->s;
+    }
+
     if ((ctx->device->mul_mat_s && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_m && !ctx->device->mul_mat_l)) {
         return aligned ? mmp->a_s : mmp->s;
     }
-    if ((ctx->device->mul_mat_m && (m <= 64 || n <= 64 || ctx->device->coopmat_support)) || !ctx->device->mul_mat_l) {
+    if ((ctx->device->mul_mat_m && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_l) {
         return aligned ? mmp->a_m : mmp->m;
     }
     return aligned ? mmp->a_l : mmp->l;
 }

-static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
+static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type type_a) {
     VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
-    return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
+    return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true, type_a)->align;
 }

 static void ggml_vk_matmul(
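The new F32 path prefers the largest tile whose workgroup output size evenly divides the problem shape, so no workgroup has to run the bounds-checked edge path. A standalone sketch of the rule, with assumed tile sides standing in for the l/m variants' wg_denoms:

#include <cstdio>

// Hypothetical tile sides (128 = L, 64 = M); the real values come from the
// compiled shader variants, this is illustrative only.
static const char * pick_tile_f32(int m, int n) {
    if (m % 128 == 0 && n % 128 == 0) return "L";  // whole L-tiles, no edge handling
    if (m %  64 == 0 && n %  64 == 0) return "M";
    return "S";                                    // fallback, always available
}

int main() {
    printf("4096x512 -> %s\n", pick_tile_f32(4096, 512)); // L: both divisible by 128
    printf("4096x49  -> %s\n", pick_tile_f32(4096, 49));  // S: 49 divides neither
    printf("623x128  -> %s\n", pick_tile_f32(623, 128));  // S: 623 divides neither
}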
@@ -3227,10 +3243,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;

-    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11));
+    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, src0->type));
     const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;

-    vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned);
+    vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, src0->type);

     const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);

@@ -5521,13 +5537,13 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
     vk_pipeline p;
     std::string shname;
     if (shader_size == 0) {
-        p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_s;
+        p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_s : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_s;
         shname = std::string(ggml_type_name(quant)) + "_ALIGNED_S";
     } else if (shader_size == 1) {
-        p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_m;
+        p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_m : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_m;
         shname = std::string(ggml_type_name(quant)) + "_ALIGNED_M";
     } else if (shader_size == 2) {
-        p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->a_l;
+        p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_l : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_l;
         shname = std::string(ggml_type_name(quant)) + "_ALIGNED_L";
     } else {
         GGML_ASSERT(0);
@@ -5537,13 +5553,13 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,

     if (k != kpad) {
         if (shader_size == 0) {
-            p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->s;
+            p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->s : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->s;
             shname = std::string(ggml_type_name(quant)) + "_S";
         } else if (shader_size == 1) {
-            p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->m;
+            p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->m : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->m;
             shname = std::string(ggml_type_name(quant)) + "_M";
         } else if (shader_size == 2) {
-            p = ctx->device->pipeline_dequant_mul_mat_mat[quant]->l;
+            p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->l : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->l;
             shname = std::string(ggml_type_name(quant)) + "_L";
         } else {
             GGML_ASSERT(0);
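Both hunks repeat the same ternary: pick the fp16- or fp32-accumulator pipeline set from device->fp16, then index the tile variant. A hypothetical helper, not part of the commit, that expresses the pattern once (it assumes the file's vk_device, ggml_type, and device-struct types as used in the diff):

// Hypothetical condensation of the repeated ternary above.
static vk_matmul_pipeline acc_pipeline(vk_device& device, ggml_type quant) {
    return device->fp16 ? device->pipeline_dequant_mul_mat_mat[quant].f16acc
                        : device->pipeline_dequant_mul_mat_mat[quant].f32acc;
}

// Equivalent to the shader_size == 0 branch above:
//     p = acc_pipeline(ctx->device, quant)->a_s;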
@@ -5593,16 +5609,16 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
     ggml_vk_buffer_write(y_buf, 0, y, y_sz);

     vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+    ggml_vk_ctx_begin(ctx->device, subctx);
     for (size_t i = 0; i < num_it; i++) {
-        ggml_vk_ctx_begin(ctx->device, subctx);
         ggml_vk_matmul(
             ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
             m, n, k,
             k, k, m, k*m, k*n, m*n,
             split_k, batch, batch, batch, 1, 1
         );
-        ggml_vk_ctx_end(subctx);
     }
+    ggml_vk_ctx_end(subctx);

     auto begin = std::chrono::high_resolution_clock::now();

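Moving ggml_vk_ctx_begin/ggml_vk_ctx_end out of the loop records all num_it matmuls into one command context, so the timed region measures shader throughput rather than per-iteration command-buffer setup. The rate such a benchmark reports follows the usual matmul FLOP count; a self-contained sketch of that arithmetic, with made-up numbers rather than harness output:

#include <cstdio>

int main() {
    // Example shape from the test list below; a matmul does ~2*m*n*k FLOPs
    // per batch element. The wall time here is invented for illustration.
    const double m = 4096, n = 512, k = 4096, batch = 2, num_it = 100;
    const double seconds = 0.85;  // made-up measured time for the whole submission
    const double tflops = 2.0 * m * n * k * batch * num_it / seconds * 1e-12;
    printf("%.2f TFLOPS\n", tflops);
}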
@@ -5702,109 +5718,13 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,

 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 #if defined(GGML_VULKAN_RUN_TESTS)
-    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
-    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
-    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
-    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_0);
-    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_1);
-    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q8_0);
-    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q2_K);
-    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q3_K);
-    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K);
-    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
-    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);
-    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_IQ4_NL);
-
-    ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, 4, 4, 4, 1, 1, 1, 0);
-    ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, 16, 16, 16, 1, 1, 1, 0);
-    ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, 32, 32, 16, 1, 1, 1, 0);
-
-    ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, 512, 512, 100, 32, 100, 1, 2);
-
-    ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
-    ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
-    ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
-    // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
-    // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
-    // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);
-
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);
-
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);
-
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);
-
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);
-
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
-
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
-
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
-
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
-
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
-
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
-    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
-
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_IQ4_NL);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_IQ4_NL);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_IQ4_NL);
-
-    std::cerr << std::endl;
-
     const std::vector<size_t> vals {
+        512, 512, 128,
+        128, 512, 512,
+        4096, 512, 4096,
+        11008, 512, 4096,
+        4096, 512, 11008,
+        32000, 512, 4096,
         8, 8, 8,
         100, 46, 576,
         623, 111, 128,
@@ -5817,25 +5737,51 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         49, 49, 128,
         128, 49, 49,
         4096, 49, 4096,
-        11008, 49, 4096,
-        4096, 49, 11008,
-        32000, 49, 4096,
-        512, 512, 128,
-        128, 512, 512,
-        4096, 512, 4096,
-        11008, 512, 4096,
-        4096, 512, 11008,
-        32000, 512, 4096,
     };
-    const size_t num_it = 1;
+    const size_t num_it = 100;
     for (size_t i = 0; i < vals.size(); i += 3) {
         ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
         ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1);
         ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2);
-        // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
-        // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
-        // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
-        std::cerr << std::endl;
+        std::cerr << '\n';
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0);
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1);
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2);
+        std::cerr << '\n';
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
+        std::cerr << '\n' << std::endl;
+
+        if (vals[i + 2] % 32 == 0) {
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_0);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_0);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_0);
+            std::cerr << '\n';
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_0);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_0);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_0);
+            std::cerr << '\n';
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_0);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_0);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_0);
+            std::cerr << '\n' << std::endl;
+        }
+
+        if (vals[i + 2] % 256 == 0) {
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_K);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_K);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_K);
+            std::cerr << '\n';
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_K);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_K);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_K);
+            std::cerr << '\n';
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_K);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_K);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_K);
+            std::cerr << '\n' << std::endl;
+        }
     }

     GGML_ABORT("fatal error");
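The new divisibility guards exist because ggml quant formats pack the k dimension in fixed-size blocks: Q4_0 stores 32 weights per block and the K-quants store 256 per super-block, so a dequant matmul test only makes sense when k is a whole number of blocks. A self-contained restatement (QK4_0 and QK_K are ggml's constants, redeclared here to keep the sketch standalone):

#include <cstddef>

constexpr int QK4_0 = 32;   // weights per Q4_0 block
constexpr int QK_K  = 256;  // weights per K-quant super-block (Q4_K, ...)

constexpr bool can_test_q4_0(size_t k) { return k % QK4_0 == 0; }
constexpr bool can_test_q4_k(size_t k) { return k % QK_K  == 0; }

static_assert(can_test_q4_0(576),  "576 = 18 blocks of 32");
static_assert(!can_test_q4_k(576), "576 is not a multiple of 256");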