#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "ggml-threading.h"
#include "ggml-tune.h"
#include "ggml.h"

#ifdef GGML_USE_K_QUANTS
#include "k_quants.h"
#endif

// MUL_MAT fine-tuning for non-GPU-offloading cases.

#define FNV_OFFSET 14695981039346656037UL
#define FNV_PRIME 1099511628211UL

// FNV-1a hash over the decimal digits of M, N and K.
static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) {
    char buf[30];
    snprintf(buf, sizeof(buf), "%d%d%d", M, N, K);

    uint64_t hash = FNV_OFFSET;
    for (const char *p = buf; *p; p++) {
        hash ^= (uint64_t)(unsigned char)(*p);
        hash *= FNV_PRIME;
    }
    return hash;
}
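
// The tune cache used by ggml_mulmat_tune_select_task_profile below is a
// small direct-mapped table indexed by this hash modulo GGML_MULMAT_CACHE_LEN;
// on a slot collision the entry is simply overwritten and the next lookup for
// the evicted (M, N, K) falls back to a fresh time estimation.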

// Returns the profile id, or -1 on failure (e.g. when the shape cannot be
// matched).
// NOTE: we cannot return the profile pointer from `tune` directly because its
// profiles do not contain fields such as runner and get_size.
int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
                                         int N, int K, enum ggml_type src0_t,
                                         enum ggml_type src1_t,
                                         int stages_time[3]) {
    GGML_ASSERT(tune);

    // TODO: default_mm_cache is thread-unsafe.
    int slot = ggml_mulmat_tune_cache_hash(M, N, K) % GGML_MULMAT_CACHE_LEN;
    struct ggml_mulmat_tune_cache_ele *e = &tune->cache[slot];

    struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0};

    const struct ggml_task_profile *prof = NULL;

    if (e->M == M && e->N == N && e->K == K) {
        prof = e->profile;
        if (stages_time) {
            for (int i = 0; i < 3; i++) {
                stages_time[i] = e->stages_time[i];
            }
        }
    } else {
        const struct ggml_mulmat_tune_shape *shape = NULL;
        shape = ggml_mulmat_tune_get_shape(tune, N, K, src0_t, src1_t);
        if (shape) {
            ggml_mulmat_tune_estimate_time(shape, M, profiles_time);

            int min = INT32_MAX;
            int index = -1;
            for (int i = 0; i < shape->n_profiles; i++) {
                int total = profiles_time[i].total_time;
                if (total < min) {
                    min = total;
                    index = i;
                }
            }

            if (index >= 0) {
                prof = profiles_time[index].profile;
                for (int i = 0; i < 3; i++) {
                    int t = profiles_time[index].stage_time[i];
                    if (stages_time) {
                        stages_time[i] = t;
                    }
                    e->stages_time[i] = t;
                }

                GGML_ASSERT(prof);

                e->profile = prof;
                e->M = M;
                e->N = N;
                e->K = K;

#ifndef GGML_TUNE_NDEBUG
                printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, "
                       "profile name: %s\n",
                       M, N, K, prof->id, prof->name);
#endif
            }
        }
    }

    return prof ? prof->id : -1;
}
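
// Illustrative usage sketch (not part of the build; `tune` and the matmul
// dimensions are assumed to come from the caller, e.g. the graph compute
// code):
//
//     int stages_time[3];
//     int id = ggml_mulmat_tune_select_task_profile(tune, /*M=*/32,
//                                                   /*N=*/4096, /*K=*/4096,
//                                                   GGML_TYPE_Q4_0,
//                                                   GGML_TYPE_F32,
//                                                   stages_time);
//     if (id >= 0) {
//         // A profile matched: stages_time[] now holds the estimated
//         // per-stage times for the selected profile.
//     }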

void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
                                 const char *name, enum ggml_ftype ftype) {
    const int n_vocab = 32000;
    int n_embd;
    // n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult
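    // For example, for the 7B model below (n_embd = 4096), assuming the usual
    // LLaMA default n_mult = 256:
    //   2*(4*4096)/3 = 10922, (10922 + 255)/256 = 43, 43*256 = 11008.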
    int n_ff;
    // n_rot = n_embd/n_head;
    int n_rot;

    if (strcmp(name, "3B") == 0) {
        // n_head=32, n_mult=216, n_layer=26
        // https://github.com/ggerganov/llama.cpp/pull/1588
        n_embd = 3200;
        n_ff = 8640;
        n_rot = 100;
    } else if (strcmp(name, "7B") == 0) {
        n_embd = 4096;
        n_ff = 11008;
        n_rot = 128;
    } else if (strcmp(name, "13B") == 0) {
        n_embd = 5120;
        n_ff = 13824;
        n_rot = 128;
    } else if (strcmp(name, "30B") == 0) {
        n_embd = 6656;
        n_ff = 17920;
        n_rot = 128;
    } else if (strcmp(name, "65B") == 0) {
        n_embd = 8192;
        n_ff = 22016;
        n_rot = 128;
    } else {
        GGML_ASSERT(false);
    }

    model->name = name;
    model->ftype = ftype;
    model->n_vocab = n_vocab;
    model->n_embd = n_embd;
    model->n_ff = n_ff;
    model->n_rot = n_rot;
}

bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
                           struct ggml_mulmat_tune_params *params,
                           ggml_task_profiles_provider *profiles_provider) {
    GGML_ASSERT(profiles_provider);
    struct ggml_mulmat_tune_model *model = &params->model;

    memset(tune, 0, sizeof(struct ggml_mulmat_tune));

    tune->version = GGML_MULMAT_TUNE_VERSION;
    tune->n_threads = params->n_threads;
    tune->ftype = model->ftype;

    size_t name_len = strlen(model->name);
    GGML_ASSERT(name_len > 0);
    strncpy(tune->model, model->name, sizeof(tune->model) - 1);

    const enum ggml_type rot_src0_type = GGML_TYPE_F16;
    const enum ggml_type src1_type = GGML_TYPE_F32;

    int n_vocab = model->n_vocab;
    int n_embd = model->n_embd;
    int n_ff = model->n_ff;
    int n_rot = model->n_rot;

    enum ggml_type type = ggml_ftype_to_ggml_type(model->ftype);

    GGML_ASSERT(GGML_MULMAT_N_SHAPES == 4 || GGML_MULMAT_N_SHAPES == 6);
    tune->n_shapes = GGML_MULMAT_N_SHAPES;

    // Attention layers
    tune->shapes[0] = (struct ggml_mulmat_tune_shape){
        .N = n_embd, .K = n_embd, .src0_type = type, .src1_type = src1_type};
    // Feed forward layers
    tune->shapes[1] = (struct ggml_mulmat_tune_shape){
        .N = n_embd, .K = n_ff, .src0_type = type, .src1_type = src1_type};
    tune->shapes[2] = (struct ggml_mulmat_tune_shape){
        .N = n_ff, .K = n_embd, .src0_type = type, .src1_type = src1_type};
    tune->shapes[3] = (struct ggml_mulmat_tune_shape){
        .N = n_vocab, .K = n_embd, .src0_type = type, .src1_type = src1_type};
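
    // As a concrete illustration: for the "7B" model above (n_embd = 4096,
    // n_ff = 11008, n_vocab = 32000) the four benchmarked (N, K) shapes are
    // (4096, 4096), (4096, 11008), (11008, 4096) and (32000, 4096).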

    if (GGML_MULMAT_N_SHAPES == 6) {
        // RoPE.
        // - Very small compared to the previous shapes, almost no need to
        //   bench.
        // - Causes an illegal instruction exception on GitHub CI
        //   (mac-latest-cmake).
        // - CL sometimes throws an error on localhost.
        // So these two shapes are temporarily disabled as a workaround.
        tune->shapes[4] =
            (struct ggml_mulmat_tune_shape){.N = n_rot,
                                            .K = 0,
                                            .src0_type = rot_src0_type,
                                            .src1_type = src1_type};
        tune->shapes[5] =
            (struct ggml_mulmat_tune_shape){.N = 0,
                                            .K = n_rot,
                                            .src0_type = rot_src0_type,
                                            .src1_type = src1_type};
    }

    for (int i = 0; i < tune->n_shapes; i++) {
        struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];

        struct ggml_tensor src0 = {
            .type = shape->src0_type,
        };
        struct ggml_tensor src1 = {
            .type = shape->src1_type,
        };
        struct ggml_tensor node = {
            .op = GGML_OP_MUL_MAT,
            .src0 = &src0,
            .src1 = &src1,
        };

        shape->n_profiles = profiles_provider(&node, shape->profiles);
        if (shape->n_profiles == 0) {
            // Allowed for testing.
            continue;
        }

        shape->m_num = params->m_num;
        shape->arr_m = malloc(shape->m_num * sizeof(int));
        GGML_ASSERT(shape->arr_m);
        // Benchmarked M values: 1, 2, 4, ... (powers of two).
        for (int j = 0; j < shape->m_num; j++) {
            shape->arr_m[j] = 1 << j;
        }

        size_t sz = sizeof(struct ggml_mulmat_tune_m) *
                    (shape->n_profiles * shape->m_num);
        shape->items = malloc(sz);
        GGML_ASSERT(shape->items);
        memset(shape->items, 0, sz);
    }

    return true;
}

void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) {
    for (int i = 0; i < tune->n_shapes; i++) {
        struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
        GGML_ASSERT(shape);

        // arr_m and items can be NULL only when testing.
        if (shape->m_num > 0) {
            if (shape->arr_m) {
                free(shape->arr_m);
                shape->arr_m = NULL;
            }
            if (shape->items) {
                free(shape->items);
                shape->items = NULL;
            }
            shape->m_num = 0;
        }
    }
}

static bool ggml_mulmat_tune_write_profiles(
    FILE *fp, const struct ggml_task_profile *profiles, int n_profiles) {
    int rc;
    for (int i = 0; i < n_profiles; i++) {
        const struct ggml_task_profile *profile = &profiles[i];
        // Each stage is encoded as three digits: valid, parallel, wait.
        for (int j = 0; j < 3; j++) {
            const struct ggml_task_stage *ts = &profile->stages[j];
            rc = fprintf(fp, "%1d%1d%1d", ts->valid ? 1 : 0,
                         ts->parallel ? 1 : 0, ts->wait ? 1 : 0);
            if (rc <= 0) {
                return false;
            }
            if (j < 2) {
                rc = fprintf(fp, " ");
                if (rc <= 0) {
                    return false;
                }
            }
        }
        rc = fprintf(fp, " %d %s", profile->id, profile->name);
        if (rc <= 0) {
            return false;
        }
        rc = fprintf(fp, "\n");
        if (rc <= 0) {
            return false;
        }
    }

    return true;
}

static bool
ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
                                   const char *model, int ftype, int n_threads,
                                   char *errbuf, int errbuf_len) {
    if (tune->version != GGML_MULMAT_TUNE_VERSION) {
        snprintf(errbuf, errbuf_len - 1,
                 "version mismatch, built-in: %d, "
                 "yours: %d",
                 GGML_MULMAT_TUNE_VERSION, tune->version);
        return false;
    } else if (strcmp(model, tune->model) != 0) {
        snprintf(errbuf, errbuf_len - 1,
                 "model mismatch. built-in: %s, yours: %s", model, tune->model);
        return false;
    } else if (ftype != tune->ftype) {
        snprintf(errbuf, errbuf_len - 1,
                 "ftype mismatch. built-in: %d, yours: %d\n", ftype,
                 tune->ftype);
        return false;
    } else if (n_threads != tune->n_threads) {
        snprintf(errbuf, errbuf_len - 1,
                 "n_threads mismatch. run-time: %d, yours: %d\n", n_threads,
                 tune->n_threads);
        return false;
    }

    for (int i = 0; i < tune->n_shapes; i++) {
        const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];

        struct ggml_tensor src0 = {
            .type = shape->src0_type,
        };
        struct ggml_tensor src1 = {
            .type = shape->src1_type,
        };
        struct ggml_tensor node = {
            .op = GGML_OP_MUL_MAT,
            .src0 = &src0,
            .src1 = &src1,
        };

        struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES];
        memset(builtin_profiles, 0, sizeof(builtin_profiles));

        int n_profiles = ggml_get_task_profiles(&node, builtin_profiles);

        if (n_profiles != shape->n_profiles) {
            snprintf(errbuf, errbuf_len - 1,
                     "task profiles mismatch (n_profiles)");
            return false;
        }

        // TODO: requiring identical profile order is stricter than necessary.
        // Only ids and stages are validated.
        size_t sz = sizeof(struct ggml_task_stage) * 3;
        bool matched = true;
        for (int j = 0; j < n_profiles; j++) {
            if (builtin_profiles[j].id != shape->profiles[j].id) {
                matched = false;
                break;
            }
            if (memcmp(builtin_profiles[j].stages, shape->profiles[j].stages,
                       sz) != 0) {
                matched = false;
                break;
            }
        }
        if (!matched) {
            snprintf(errbuf, errbuf_len - 1,
                     "task profiles mismatch (profiles)");

            fprintf(stderr, "=== built-in profiles:\n");
            ggml_mulmat_tune_write_profiles(stderr, builtin_profiles,
                                            n_profiles);

            fprintf(stderr, "=== incoming profiles:\n");
            ggml_mulmat_tune_write_profiles(stderr, shape->profiles,
                                            shape->n_profiles);
            return false;
        }
    }

    return true;
}

bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
                               const char *model, int ftype, int n_threads) {
    char errbuf[128];
    bool ok = ggml_mulmat_tune_validate_internal(tune, model, ftype, n_threads,
                                                 errbuf, sizeof(errbuf));
    if (!ok) {
        fprintf(stderr, "[tune] error: %s. run bench again.\n", errbuf);
    }

    return ok;
}
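
// Reads tune data previously produced by ggml_mulmat_tune_write_data.
// Returns 0 on success; a non-zero value identifies the parsing step that
// failed (version, header, shape, profile, or timing rows).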
int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
    GGML_ASSERT(tune);
    memset(tune, 0, sizeof(struct ggml_mulmat_tune));

    int rc = fscanf(fp, "%d", &tune->version);
    if (rc <= 0) {
        return 1;
    }

    if (tune->version != GGML_MULMAT_TUNE_VERSION) {
        fprintf(stderr, "[tune] version mismatch, run bench again\n");
        return 2;
    }

    rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype,
                &tune->n_shapes, &tune->n_threads);
    if (rc <= 0) {
        return 3;
    }

    for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
        struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];

        rc = fscanf(fp, "%d %d %d %d %d %d", &shape->N, &shape->K,
                    (int *)&shape->src0_type, (int *)&shape->src1_type,
                    &shape->n_profiles, &shape->m_num);
        if (rc <= 0) {
            return 4;
        }

        {
            size_t item_size = sizeof(struct ggml_mulmat_tune_m) *
                               (shape->n_profiles * shape->m_num);
            shape->items = malloc(item_size);
            if (shape->items == NULL) {
                fprintf(stderr, "[tune] failed to allocate memory\n");
                // Must not return 0 here: 0 means success.
                return -1;
            }
            memset(shape->items, 0, item_size);
        }

        for (int ip = 0; ip < shape->n_profiles; ip++) {
            struct ggml_task_profile *profile = &shape->profiles[ip];

            for (int j = 0; j < 3; j++) {
                struct ggml_task_stage *ts = &profile->stages[j];
                int valid;
                int parallel;
                int wait;
                rc = fscanf(fp, " %1d%1d%1d", &valid, &parallel, &wait);
                if (rc <= 0) {
                    return 5;
                }
                ts->valid = valid ? true : false;
                ts->parallel = parallel ? true : false;
                ts->wait = wait ? true : false;
            }

            rc = fscanf(fp, "%d %s", &profile->id, profile->name);
            if (rc <= 0) {
                return 6;
            }
        }

        for (int i_m = 0; i_m < shape->m_num; i_m++) {
            int M;
            for (int ip = 0; ip < shape->n_profiles; ip++) {
                if (ip == 0) {
                    rc = fscanf(fp, "%d", &M);
                    if (rc <= 0) {
                        return 7;
                    }
                }
                struct ggml_mulmat_tune_m *item =
                    &shape->items[ip * shape->m_num + i_m];
                item->M = M;
                rc = fscanf(fp, "%d %d %d", &item->stages_time[0],
                            &item->stages_time[1], &item->stages_time[2]);
                if (rc <= 0) {
                    return 8;
                }
            }
        }
    }

    return 0;
}
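
// The data written below has this shape (schematic only; the concrete ids,
// names and timings depend on the build and the machine):
//
//   <version> <model> <ftype> <n_shapes> <n_threads>
//
//   <N> <K> <src0_type> <src1_type> <n_profiles> <m_num>
//   <vpw> <vpw> <vpw> <profile_id> <profile_name>   (one line per profile,
//                                                    vpw = valid/parallel/wait
//                                                    digits for each stage)
//   <M> <t0> <t1> <t2> ...                          (one line per M value,
//                                                    three stage times per
//                                                    profile, 0 for invalid
//                                                    stages)
//   ...                                             (repeated per shape)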
bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,
                                 FILE *fp) {
    int rc;
    rc = fprintf(fp, "%d %s %d %d %d\n\n", tune->version, tune->model,
                 tune->ftype, tune->n_shapes, tune->n_threads);
    if (rc <= 0) {
        return false;
    }

    for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
        if (i_shape > 0) {
            fprintf(fp, "\n");
        }
        const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
        rc = fprintf(fp, "%d %d %d %d %d %d\n", shape->N, shape->K,
                     shape->src0_type, shape->src1_type, shape->n_profiles,
                     shape->m_num);
        if (rc <= 0) {
            return false;
        }

        rc = ggml_mulmat_tune_write_profiles(fp, shape->profiles,
                                             shape->n_profiles);
        if (rc <= 0) {
            return false;
        }

        for (int i_m = 0; i_m < shape->m_num; i_m++) {
            for (int ip = 0; ip < shape->n_profiles; ip++) {
                struct ggml_mulmat_tune_m *item =
                    &shape->items[ip * shape->m_num + i_m];
                if (ip == 0) {
                    rc = fprintf(fp, "%4d", item->M);
                    if (rc <= 0) {
                        return false;
                    }
                }

                const struct ggml_task_profile *profile = &shape->profiles[ip];
                for (int k = 0; k < 3; k++) {
                    if (profile->stages[k].valid) {
                        rc = fprintf(fp, "%9d", item->stages_time[k]);
                        if (rc <= 0) {
                            return false;
                        }
                    } else {
                        rc = fprintf(fp, "        0");
                        if (rc <= 0) {
                            return false;
                        }
                    }
                }
            }
            rc = fprintf(fp, "\n");
            if (rc <= 0) {
                return false;
            }
        }
    }

    return true;
}

const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N,
                           const int K, enum ggml_type src0_type,
                           enum ggml_type src1_type) {
    for (int i = 0; i < tune->n_shapes; i++) {
        const struct ggml_mulmat_tune_shape *s = &tune->shapes[i];
        if (s->src0_type != src0_type || s->src1_type != src1_type) {
            continue;
        }

        if (s->N > 0 && s->K > 0) {
            if (s->N == N && s->K == K) {
                return s;
            }
        }

        if (GGML_MULMAT_N_SHAPES == 6) {
            if (s->N > 0 && s->K == 0) {
                if (s->N == N) {
                    return s;
                }
            } else if (s->N == 0 && s->K > 0) {
                if (s->K == K) {
                    return s;
                }
            }
        }
    }

    return NULL;
}

// This is the experimental reference implementation.
// It requires that n_threads is the same at bench time and at run time.
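//
// The per-stage time for an arbitrary M is estimated by linear interpolation
// (t = a*M + b) between the two nearest benchmarked M values. For example,
// with hypothetical bench points (M=16, t=100) and (M=32, t=180):
// a = (180 - 100) / (32 - 16) = 5, b = 180 - 5*32 = 20, so the estimate for
// M = 24 is 5*24 + 20 = 140.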
void ggml_mulmat_tune_estimate_time(
    const struct ggml_mulmat_tune_shape *shape, int M,
    struct ggml_mulmat_tune_time *profile_time) {

    GGML_ASSERT(shape);
    GGML_ASSERT(profile_time);

    const int m_num = shape->m_num;
    const int min_m = shape->items[0].M;
    const int max_m = shape->items[m_num - 1].M;

    for (int ip = 0; ip < shape->n_profiles; ip++) {
        const struct ggml_task_profile *profile = &shape->profiles[ip];
        profile_time[ip].total_time = 0;
        profile_time[ip].profile = profile;

        const int items_offset = ip * m_num;

        struct ggml_mulmat_tune_m *p0 = NULL;
        struct ggml_mulmat_tune_m *p1 = NULL;
        if (M < min_m) {
            // first two.
            p0 = &shape->items[items_offset];
            p1 = &shape->items[items_offset + 1];
        } else if (M > max_m) {
            // last two
            p0 = &shape->items[items_offset + m_num - 2];
            p1 = &shape->items[items_offset + m_num - 1];
        } else {
            for (int i = 0; i < m_num; i++) {
                p1 = &shape->items[items_offset + i];
                if (p1->M == M) {
                    p0 = p1;
                    break;
                }

                if (i > 0) {
                    p0 = (struct ggml_mulmat_tune_m *)(p1 - 1);
                    if (M > p0->M && M < p1->M) {
                        break;
                    }
                }
            }
        }

        GGML_ASSERT(p0 && p1);

        for (int i_stage = 0; i_stage < 3; i_stage++) {
            const struct ggml_task_stage *stage = &profile->stages[i_stage];
            if (!stage->valid) {
                continue;
            }

            int p0_v = p0->stages_time[i_stage];
            int p1_v = p1->stages_time[i_stage];

            GGML_ASSERT(p0_v >= 0);
            GGML_ASSERT(p1_v >= 0);

            // t = aM + b
            double a;
            double b;

            if (p0 == p1) {
                a = 0.0;
                b = p1_v;
            } else {
                a = 1.0 * (p1_v - p0_v) / (p1->M - p0->M);
                b = p1_v - a * p1->M;
            }
            int t = (int)(a * M + b);

            profile_time[ip].stage_time[i_stage] = t;
            profile_time[ip].total_time += t;
        }
    }
}

// Experimental: create a mul_mat tensor.
static struct ggml_tensor *ggml_mulmat_new_tensor(int M, int N, int K,
                                                  enum ggml_type src0_type,
                                                  struct ggml_context **ctx) {
    // At most 256 entries are needed, because in `ggml_quantize_qx_x` the
    // histogram index type is int8_t or uint8_t. Use 1024 to leave headroom
    // against sudden breakage.
    int64_t hist[1024];

    bool src0_is_quantized = ggml_is_quantized(src0_type);

    size_t ctx_size = 0;
    ctx_size += (size_t)(M * N * ggml_type_sizef(GGML_TYPE_F32)); // src1
    ctx_size += (size_t)(N * K * ggml_type_sizef(src0_type));     // src0
    ctx_size += (size_t)(1024 * 1024 * 64);                       // experimental

    if (src0_is_quantized) {
        // quantize F32 to Qx_x
        ctx_size += (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32));
    }

    struct ggml_init_params init_params = {
        .mem_size = ctx_size,
        .mem_buffer = NULL,
        .no_alloc = 0,
    };

    *ctx = ggml_init(init_params);
    GGML_ASSERT(*ctx);

    // src0: N x K
    struct ggml_tensor *src0 =
        ggml_new_tensor_2d(*ctx, src0_type, (int64_t)K, (int64_t)N);

    // src1: M x K
    struct ggml_tensor *src1 =
        ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)M);
    ggml_set_f32(src1, 0.5f);

    if (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16) {
        ggml_set_f32(src0, 0.1f);
    } else if (src0_is_quantized) {
        struct ggml_tensor *src0_f32 =
            ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)N);
        ggml_set_f32(src0_f32, 0.1f);

        const float *src_data = (const float *)src0_f32->data;
        int nxk = N * K;

        switch (src0_type) {
        case GGML_TYPE_Q4_0:
            ggml_quantize_q4_0(src_data, src0->data, nxk, K, hist);
            break;
        case GGML_TYPE_Q4_1:
            ggml_quantize_q4_1(src_data, src0->data, nxk, K, hist);
            break;
        case GGML_TYPE_Q5_0:
            ggml_quantize_q5_0(src_data, src0->data, nxk, K, hist);
            break;
        case GGML_TYPE_Q5_1:
            ggml_quantize_q5_1(src_data, src0->data, nxk, K, hist);
            break;
        case GGML_TYPE_Q8_0:
            ggml_quantize_q8_0(src_data, src0->data, nxk, K, hist);
            break;
#ifdef GGML_USE_K_QUANTS
        case GGML_TYPE_Q2_K:
            ggml_quantize_q2_K(src_data, src0->data, nxk, K, hist);
            break;
        case GGML_TYPE_Q3_K:
            ggml_quantize_q3_K(src_data, src0->data, nxk, K, hist);
            break;
        case GGML_TYPE_Q4_K:
            ggml_quantize_q4_K(src_data, src0->data, nxk, K, hist);
            break;
        case GGML_TYPE_Q5_K:
            ggml_quantize_q5_K(src_data, src0->data, nxk, K, hist);
            break;
        case GGML_TYPE_Q6_K:
            ggml_quantize_q6_K(src_data, src0->data, nxk, K, hist);
            break;
#endif
        default:
            GGML_ASSERT(false);
        }
    } else {
        GGML_ASSERT(false);
    }

    // node: M x N
    // Will compute z = y * xT, z: node, y: src1, x: src0
    return ggml_mul_mat(*ctx, src0, src1);
}

// Experimental: allocate memory for wdata with the max possible size.
// This code actually belongs to the ggml compute graph.
static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
    // The size is actually determined by the cgraph before computing.
    // Apart from src0_type, wsize is affected by the backend, cache line
    // size, n_threads, etc.

    const size_t extra = 1024 * 1024;
    size_t sz = (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32)) + extra;
    void *buf = malloc(sz);

    if (!buf) {
        fprintf(stderr, "[tune] error: failed to allocate %zu MiB memory",
                sz / 1024 / 1024);
        return 0;
    }

    memset(buf, 0, sz);
    *wdata = buf;
    return sz;
}
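
// Benchmarks every (shape, profile, M) combination params->n_pass times,
// keeps the minimum per-stage time across the passes, and finally writes the
// collected data to params->fname (or to stdout as a fallback).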
bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
                            struct ggml_mulmat_tune_params *params) {
    GGML_ASSERT(tune);
    GGML_ASSERT(params);
    GGML_ASSERT(params->model.name);

    memset(tune, 0, sizeof(struct ggml_mulmat_tune));

    bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
    if (!ok) {
        return false;
    }

    {
        char buf[128] = {0};
        int offset = 0;

        for (int i = 0; i < tune->shapes[0].n_profiles; i++) {
            if (i > 0) {
                buf[offset++] = ',';
                buf[offset++] = ' ';
            }
            const char *name = tune->shapes[0].profiles[i].name;
            GGML_ASSERT(name != NULL && strcmp(name, "") != 0);
            size_t len = strlen(name);
            memcpy(&buf[offset], name, len);
            offset += (int)len;
        }

        fprintf(stdout,
                "[tune] model: %s, ggml ftype: %d, "
                "n_pass: %d, n_shapes: %d, n_threads: %d, profiles: %s\n",
                params->model.name, params->model.ftype, params->n_pass,
                tune->n_shapes, params->n_threads, buf);
    }

    int64_t stages_time[3];
    int64_t t0 = ggml_time_ms();

    struct ggml_threading_context *thrd_ctx =
        ggml_threading_start(tune->n_threads, NULL, NULL,
                             GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);

    for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
        const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
        int M;
        int N = shape->N;
        int K = shape->K;

        char buf[20] = {0};
        int buf_len = sizeof(buf) - 1;
        int line_len = 0;

        for (int i_m = 0; i_m < shape->m_num; i_m++) {
            M = shape->arr_m[i_m];
            if (shape->N == 0) {
                N = M;
            } else if (shape->K == 0) {
                K = M;
            }

            if (params->progress) {
                line_len = snprintf(buf, buf_len, "%d %d %d ", N, K, M);
                fprintf(stdout, "%s", buf);
                fflush(stdout);
            }

            char *wdata = NULL;
            size_t wsize = ggml_mulmat_allocate_wdata(N, K, &wdata);
            if (wsize == 0) {
                return false;
            }

            struct ggml_context *ctx = NULL;
            struct ggml_tensor *node =
                ggml_mulmat_new_tensor(M, N, K, shape->src0_type, &ctx);

            for (int ip = 0; ip < shape->n_profiles; ip++) {
                const struct ggml_task_profile *profile = &shape->profiles[ip];
                // GGML_ASSERT(profile->runner);

                memcpy(&node->task_profile, profile,
                       sizeof(struct ggml_task_profile));

                struct ggml_mulmat_tune_m *item =
                    &shape->items[ip * shape->m_num + i_m];
                item->M = M;

                int min[3] = {INT32_MAX, INT32_MAX, INT32_MAX};

                for (int k = 0; k < params->n_pass; k++) {
                    for (int j = 0; j < 3; j++) {
                        stages_time[j] = 0;
                    }

                    ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);

                    if (memcmp(profile, &node->task_profile,
                               sizeof(struct ggml_task_profile)) != 0) {
                        printf("[tune] error: task profile changed, tensor op: "
                               "%d, original id: %d, current id: %d\n",
                               node->op, profile->id, node->task_profile.id);
                        exit(1);
                    }

                    for (int i = 0; i < 3; i++) {
                        int v = (int)stages_time[i];
                        if (v < min[i]) {
                            min[i] = v;
                        }
                    }

                    if (params->progress) {
                        fprintf(stdout, ".");
                        fflush(stdout);
                        line_len++;
                    }
                }
                for (int i = 0; i < 3; i++) {
                    item->stages_time[i] = min[i];
                }
            }

            ggml_free(ctx);
            free(wdata);

            if (params->progress) {
                line_len += 10;
                for (int j = 0; j < line_len; j++) {
                    fprintf(stdout, "\b \b");
                }
                fflush(stdout);
            }
        }
    }

    ggml_threading_stop(thrd_ctx);

    fprintf(stdout, "[tune] done, elapsed time: %d seconds.\n",
            (int)(ggml_time_ms() - t0) / 1000);

    // output

    if (params->fname && strcmp(params->fname, "") != 0) {
        FILE *fp = fopen(params->fname, "w");
        if (!fp) {
            fprintf(stderr,
                    "[tune] warn: failed to open file `%s`, print to "
                    "console instead\n\n",
                    params->fname);
            params->output_console = 1;
        } else {
            ok = ggml_mulmat_tune_write_data(tune, fp);
            fclose(fp);

            if (ok) {
                fprintf(stdout, "[tune] data was written to `%s`\n",
                        params->fname);
            } else {
                fprintf(stderr,
                        "[tune] warn: failed to write file `%s`, print to "
                        "console instead\n\n",
                        params->fname);
                params->output_console = 1;
            }
        }
    }

    if (params->output_console) {
        return ggml_mulmat_tune_write_data(tune, stdout);
    }

    return true;
}