// llama.cpp/ggml-tune.h

#pragma once

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

// Version of the tune data format written/read by this module.
#define GGML_MULMAT_TUNE_VERSION 8
// Number of bench shapes kept per model (see ggml_mulmat_tune.shapes).
#define GGML_MULMAT_N_SHAPES 6
// Maximum number of bench passes per data point (cf. ggml_mulmat_tune_params.n_pass).
#define GGML_MULMAT_MAX_PASS 3

// Bench record for a single M value of a shape.
struct ggml_mulmat_tune_m {
    int M;
    int stages_time[3];
};

// Model hyper-parameters the bench shapes are derived from.
struct ggml_mulmat_tune_model {
    const char *name;
    enum ggml_ftype ftype;

    int n_vocab;
    int n_embd;

    // n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult
    // (see the worked example after this struct)
    int n_ff;

    // n_rot = n_embd/n_head;
    int n_rot;
};
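
// Worked example of the n_ff formula above (a sketch only; n_embd = 4096 and
// n_mult = 256 are the LLaMA 7B defaults from upstream llama.cpp, not values
// defined in this header):
//
//     4*n_embd             = 16384
//     2*(4*n_embd)/3       = 10922   (integer division)
//     + n_mult - 1         = 11177
//     / n_mult             = 43      (integer division)
//     * n_mult             = 11008   => n_ff = 11008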

// Bench data for one (N, K, src0_type, src1_type) shape.
struct ggml_mulmat_tune_shape {
    // For RoPE, one of N / K is 0.
    int N;
    int K;

    enum ggml_type src0_type;
    enum ggml_type src1_type;

    int n_profiles;
    struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];

    int m_num;
    int *arr_m;
    struct ggml_mulmat_tune_m *items;
};

struct ggml_mulmat_tune {
    int version;

    char model[16];

    enum ggml_ftype ftype;

    int n_shapes;
    // Given N/K, we bench for mul_mat [M,K] x [K,N].
    struct ggml_mulmat_tune_shape shapes[GGML_MULMAT_N_SHAPES];

    int n_threads;
};

// Per-profile timing (per-stage and total), as produced by
// ggml_mulmat_tune_estimate_time().
struct ggml_mulmat_tune_time {
    const struct ggml_task_profile *profile;
    int stage_time[3];
    int total_time;
};

// Cached task profile selection for a given (M, N, K).
struct mm_cache_element {
    int M;
    int N;
    int K;
    const struct ggml_task_profile *profile;
    int stages_time[3];
};

// Params for tune/bench.
struct ggml_mulmat_tune_params {
    struct ggml_mulmat_tune_model model;
    int m_num;
    int n_pass;
    int n_threads;
    bool progress;       // print and clear progress dots ('.')
    bool output_console; // also print result to console
    const char *fname;
};
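
// Example (a sketch, not part of the API): filling bench params before calling
// ggml_mulmat_tune_bench(). The model name "7B", the ftype, and every numeric
// value below are illustrative assumptions, not values mandated by this header.
//
//     struct ggml_mulmat_tune_params params = {
//         /* .model          = */ {0},   // filled by ggml_mulmat_tune_model_init() below
//         /* .m_num          = */ 8,     // number of M data points per shape (assumed meaning)
//         /* .n_pass         = */ GGML_MULMAT_MAX_PASS,
//         /* .n_threads      = */ 4,
//         /* .progress       = */ true,
//         /* .output_console = */ false,
//         /* .fname          = */ "mulmat-tune.txt",
//     };
//     ggml_mulmat_tune_model_init(&params.model, "7B", GGML_FTYPE_MOSTLY_Q4_0);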

// Select a task profile for mul_mat of shape [M,K] x [K,N] with the given src
// types. NOTE: stages_time is filled if not NULL.
const struct ggml_task_profile *
ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
                                     int N, int K, enum ggml_type src0_t,
                                     enum ggml_type src1_t, int stages_time[3]);

bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
                               const char *model_name, int ftype,
                               int n_threads);

void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
                                 const char *name, enum ggml_ftype ftype);

bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
                           struct ggml_mulmat_tune_params *params,
                           ggml_task_profiles_provider *profiles_provider);

void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune);

bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp);

bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp);

const struct ggml_mulmat_tune_shape *
ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K,
                           enum ggml_type src0_type, enum ggml_type src1_type);

void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape,
                                    int M,
                                    struct ggml_mulmat_tune_time *profile_time);
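
// Example (a sketch): looking up a benched shape and estimating per-profile
// times for a given M. It assumes profile_time has room for at least
// shape->n_profiles entries and that get_shape returns NULL for shapes that
// were not benched; neither is guaranteed by this header alone.
//
//     const struct ggml_mulmat_tune_shape *shape =
//         ggml_mulmat_tune_get_shape(&tune, N, K, GGML_TYPE_Q4_0, GGML_TYPE_F32);
//     if (shape != NULL) {
//         struct ggml_mulmat_tune_time times[GGML_MAX_TASK_PROFILES];
//         ggml_mulmat_tune_estimate_time(shape, M, times);
//         // times[0 .. shape->n_profiles-1]: per-stage and total time estimates.
//     }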

const char *ggml_task_backend_name(enum ggml_task_backend backend);

int ggml_mulmat_tune_get_builtin_task_backends(
    enum ggml_task_backend *backends);

bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
                            struct ggml_mulmat_tune_params *params);
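
// Example (a sketch of one plausible end-to-end flow, reusing `params` from the
// sketch above; error handling is elided, and whether ggml_mulmat_tune_bench()
// also initializes or writes the tune itself is not visible from this header):
//
//     // 1) Bench once and save the results.
//     struct ggml_mulmat_tune tune;
//     if (ggml_mulmat_tune_bench(&tune, &params)) {
//         FILE *fp = fopen(params.fname, "w");
//         ggml_mulmat_tune_write_data(&tune, fp);
//         fclose(fp);
//     }
//     ggml_mulmat_tune_free(&tune);
//
//     // 2) At inference time: load, validate against the model, then query.
//     struct ggml_mulmat_tune loaded;
//     FILE *fp = fopen(params.fname, "r");
//     if (ggml_mulmat_tune_read_data(&loaded, fp) &&
//         ggml_mulmat_tune_validate(&loaded, "7B", GGML_FTYPE_MOSTLY_Q4_0, 4)) {
//         int stages_time[3];
//         const struct ggml_task_profile *profile =
//             ggml_mulmat_tune_select_task_profile(&loaded, M, N, K,
//                                                  GGML_TYPE_Q4_0, GGML_TYPE_F32,
//                                                  stages_time);
//         (void) profile;
//     }
//     fclose(fp);
//     ggml_mulmat_tune_free(&loaded);
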
#ifdef __cplusplus
}
#endif