tuning: add f16, todo: f32 failed with CL
parent 7c05049f8b
commit 21e9379707
ggml-tune.c (14 changed lines)
@@ -103,8 +103,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
             names[i] = ggml_mulmat_tune_task_backend_name(
                 prof->stages[i].backend);
         }
-        printf(
-            "\n[tune] M: %3d, N: %5d, K: %5d, backends of the "
+        printf("\n[tune] M: %3d, N: %5d, K: %5d, backends of the "
             "fastest profile: %s %s %s\n",
             M, N, K, names[0], names[1], names[2]);
 #endif
@@ -707,8 +706,7 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
     void *buf = malloc(sz);

     if (!buf) {
-        fprintf(stderr,
-                "[tune] error: failed to allocate %zu MiB memory",
+        fprintf(stderr, "[tune] error: failed to allocate %zu MiB memory",
                 sz / 1024 / 1024);
         return 0;
     }
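The allocation helper touched above follows a simple convention: report the requested size in MiB on failure and return 0 so the caller can bail out. A minimal standalone sketch of that convention (the fixed `sz`, the `main` driver, and the stub name are illustrative, not the library's API):

    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative stand-in for ggml_mulmat_allocate_wdata(): the real
     * helper derives `sz` from N and K; only the error-reporting
     * convention matters here. */
    static size_t allocate_wdata(size_t sz, char **wdata) {
        void *buf = malloc(sz);
        if (!buf) {
            fprintf(stderr,
                    "[tune] error: failed to allocate %zu MiB memory\n",
                    sz / 1024 / 1024);
            return 0; /* callers treat 0 as allocation failure */
        }
        *wdata = buf;
        return sz;
    }

    int main(void) {
        char *wdata = NULL;
        size_t got = allocate_wdata(16u * 1024 * 1024, &wdata); /* 16 MiB */
        printf("allocated: %zu bytes\n", got);
        free(wdata);
        return got == 0;
    }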
@@ -835,8 +833,9 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
                 stages_time[j] = 0;
             }

-            /*enum ggml_compute_error err = */
-            ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
+            enum ggml_compute_error err = ggml_threading_compute_tensor(
+                thrd_ctx, node, wdata, wsize);
+            GGML_ASSERT(err == GGML_COMPUTE_OK);

             for (int i = 0; i < 3; i++) {
                 int v = (int)stages_time[i];
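The hunk above stops discarding the compute status: the benchmark now asserts `GGML_COMPUTE_OK`, so a failing backend aborts the run instead of producing bogus timings. A minimal sketch of that fail-fast pattern, with an illustrative enum and stub standing in for `ggml_compute_error` and `ggml_threading_compute_tensor()`:

    #include <assert.h>

    /* Illustrative stand-ins; not the ggml definitions. */
    enum compute_error { COMPUTE_OK = 0, COMPUTE_FALLBACK = 1 };

    static enum compute_error compute_tensor_stub(void) {
        return COMPUTE_OK; /* the real call runs one mul-mat on the pool */
    }

    static void bench_one(void) {
        /* Before: the status was commented out and silently dropped.
         * After: assert so a broken backend cannot skew the tuning data. */
        enum compute_error err = compute_tensor_stub();
        assert(err == COMPUTE_OK);
    }

    int main(void) {
        bench_one();
        return 0;
    }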
@@ -892,8 +891,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
         fprintf(stdout, "[tune] data was written to `%s`\n",
                 params->fname);
     } else {
-        fprintf(
-            stderr,
+        fprintf(stderr,
             "[tune] warn: failed to write file `%s`, print to "
             "console instead\n\n",
             params->fname);
test-ggml-tune.c
@@ -8,12 +8,13 @@
 static int bench(void);
 static int estimate_time_non_zero_NK(void);

-static void init_params(struct ggml_mulmat_tune_params *params, int m_num) {
+static void init_params(struct ggml_mulmat_tune_params *params,
+                        enum ggml_ftype ftype, int m_num, int n_threads) {
     *params = (struct ggml_mulmat_tune_params){
         .model =
             (struct ggml_mulmat_tune_model){
-                .name = "3B", // fake
-                .ftype = GGML_FTYPE_MOSTLY_Q4_0,
+                .name = "xB", // fake model name
+                .ftype = ftype,
                 .n_vocab = 4096,
                 .n_embd = 1024,
                 .n_ff = 2048,
@@ -21,7 +22,7 @@ static void init_params(struct ggml_mulmat_tune_params *params, int m_num) {
             },
         .m_num = m_num,
         .n_pass = 1,
-        .n_threads = 1,
+        .n_threads = n_threads,
         .progress = false,
         .output_console = true,
         .fname = NULL};
@@ -45,13 +46,11 @@ int main(void) {
 }

 static int bench(void) {
-    printf("test: %s\n", __func__);
-
     {
         enum ggml_task_backend backends[16];
         int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
         if (n_backends < 2) {
-            printf("test: %s, skipped because no BLAS\n", __func__);
+            printf("[test-ggml-tune] skipped because no BLAS\n");
             return 0;
         }
     }
@@ -67,16 +66,48 @@ static int bench(void) {
         ggml_free(ctx);
     }

+    // F32: ggml_opencl: ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02,
+    // NULL) error -30 at /Users/mqy/tools/AI/llama.cpp/ggml-opencl.cpp:838
+    enum ggml_ftype ftypes[] = {
+        // GGML_FTYPE_ALL_F32,
+        GGML_FTYPE_MOSTLY_F16,
+        GGML_FTYPE_MOSTLY_Q4_0,
+    };
+
+    int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]);
+
+    const int m_num = 4;
+
+    // Don't use n_threads larger than 2 because GitHub build hosts have a
+    // limited resource quota.
+    int threads_arr[] = {1, 2};
+    int thread_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
+
+    int n_passed = 0;
+    int n_tests = 0;
+
+    for (int i = 0; i < n_ftypes; i++) {
+        for (int j = 0; j < thread_arr_len; j++) {
+            printf("\n");
+
+            int n_threads = threads_arr[j];
             struct ggml_mulmat_tune tune;

             struct ggml_mulmat_tune_params params;
+            memset(&params, 0, sizeof(struct ggml_mulmat_tune_params));
+            init_params(&params, ftypes[i], m_num, n_threads);

-    init_params(&params, /*m_num*/ 4);
+            ++n_tests;

             bool ok = ggml_mulmat_tune_bench(&tune, &params);
+            if (ok) {
+                ++n_passed;
+            }
             ggml_mulmat_tune_free(&tune);
+        }
+    }

-    return ok ? 0 : 1;
+    printf("[test-ggml-tune] %d / %d passed\n", n_passed, n_tests);
+    return (n_passed == n_tests) ? 0 : 1;
 }

 // implement `ggml_task_profiles_provider`
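The rewritten `bench()` sweeps a small test matrix (ftype crossed with thread count) and only succeeds if every combination passes. A standalone sketch of the same driver shape, with a stub in place of `ggml_mulmat_tune_bench()` (the stub name and ftype values are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for one tune/bench run; a real driver would call into ggml. */
    static bool run_one(int ftype, int n_threads) {
        (void)ftype;
        (void)n_threads;
        return true;
    }

    int main(void) {
        int ftypes[] = {1 /* F16 */, 2 /* Q4_0 */}; /* values illustrative */
        int threads_arr[] = {1, 2};                 /* keep CI load small  */
        int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]);
        int n_threads_len = sizeof(threads_arr) / sizeof(threads_arr[0]);

        int n_passed = 0, n_tests = 0;
        for (int i = 0; i < n_ftypes; i++) {
            for (int j = 0; j < n_threads_len; j++) {
                ++n_tests;
                if (run_one(ftypes[i], threads_arr[j])) {
                    ++n_passed;
                }
            }
        }
        printf("%d / %d passed\n", n_passed, n_tests);
        return (n_passed == n_tests) ? 0 : 1; /* fail unless all combos pass */
    }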
@@ -93,7 +124,7 @@ ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
 }

 int estimate_time_non_zero_NK(void) {
-    printf("test: %s\n", __func__);
+    printf("test-ggml-tune: %s\n", __func__);

     struct test_data_t {
         int M;
@@ -106,9 +137,10 @@ int estimate_time_non_zero_NK(void) {
     };

     const int m_num = 2;
+    const int n_threads = 1; // useless.

     struct ggml_mulmat_tune_params params;
-    init_params(&params, m_num);
+    init_params(&params, tune.ftype, m_num, n_threads);

     ggml_mulmat_tune_init(&tune, &params, ggml_task_profiles_mock_qxx_provider);
@@ -123,8 +155,8 @@ int estimate_time_non_zero_NK(void) {
     GGML_ASSERT(shape->n_profiles == 2);
     GGML_ASSERT(ggml_is_quantized(shape->src0_type));

-    printf("shape: N: %d, K: %d, n_profiles: %d\n", shape->N, shape->K,
-           shape->n_profiles);
+    printf("[test-ggml-tune] %s, shape: N: %d, K: %d, n_profiles: %d\n",
+           __func__, shape->N, shape->K, shape->n_profiles);

     {
         shape->items[0] =