mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 11:40:17 +00:00
YaRN : store rope scaling type as int32_t in memory (#5285)
* YaRN : store rope scaling type as int32_t in memory
* llama : store mapped names as const char *
This commit is contained in:
parent
6a66c5071a
commit
1ec3332ade
@ -75,8 +75,7 @@ struct gpt_params {
|
|||||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||||
int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
|
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||||
// pinging @cebtenzzre
|
|
||||||
|
|
||||||
// // sampling parameters
|
// // sampling parameters
|
||||||
struct llama_sampling_params sparams;
|
struct llama_sampling_params sparams;
|
||||||
|
24
llama.cpp
24
llama.cpp
@ -208,7 +208,7 @@ enum llm_arch {
|
|||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_LLAMA, "llama" },
|
{ LLM_ARCH_LLAMA, "llama" },
|
||||||
{ LLM_ARCH_FALCON, "falcon" },
|
{ LLM_ARCH_FALCON, "falcon" },
|
||||||
{ LLM_ARCH_GPT2, "gpt2" },
|
{ LLM_ARCH_GPT2, "gpt2" },
|
||||||
@ -285,7 +285,7 @@ enum llm_kv {
|
|||||||
LLM_KV_TOKENIZER_RWKV,
|
LLM_KV_TOKENIZER_RWKV,
|
||||||
};
|
};
|
||||||
|
|
||||||
static std::map<llm_kv, std::string> LLM_KV_NAMES = {
|
static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||||
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
||||||
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
||||||
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
||||||
@ -346,7 +346,7 @@ struct LLM_KV {
|
|||||||
llm_arch arch;
|
llm_arch arch;
|
||||||
|
|
||||||
std::string operator()(llm_kv kv) const {
|
std::string operator()(llm_kv kv) const {
|
||||||
return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
|
return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -747,13 +747,13 @@ struct LLM_TN {
|
|||||||
// gguf helpers
|
// gguf helpers
|
||||||
//
|
//
|
||||||
|
|
||||||
static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
|
static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
|
||||||
{ LLAMA_ROPE_SCALING_NONE, "none" },
|
{ LLAMA_ROPE_SCALING_NONE, "none" },
|
||||||
{ LLAMA_ROPE_SCALING_LINEAR, "linear" },
|
{ LLAMA_ROPE_SCALING_LINEAR, "linear" },
|
||||||
{ LLAMA_ROPE_SCALING_YARN, "yarn" },
|
{ LLAMA_ROPE_SCALING_YARN, "yarn" },
|
||||||
};
|
};
|
||||||
|
|
||||||
static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
|
static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
|
||||||
for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
|
for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
|
||||||
if (kv.second == name) {
|
if (kv.second == name) {
|
||||||
return kv.first;
|
return kv.first;
|
||||||
@ -1415,6 +1415,7 @@ static const size_t GiB = 1024*MiB;
|
|||||||
|
|
||||||
struct llama_hparams {
|
struct llama_hparams {
|
||||||
bool vocab_only;
|
bool vocab_only;
|
||||||
|
bool rope_finetuned;
|
||||||
uint32_t n_vocab;
|
uint32_t n_vocab;
|
||||||
uint32_t n_ctx_train; // context size the model was trained on
|
uint32_t n_ctx_train; // context size the model was trained on
|
||||||
uint32_t n_embd;
|
uint32_t n_embd;
|
||||||
@ -1434,8 +1435,7 @@ struct llama_hparams {
|
|||||||
float rope_freq_base_train;
|
float rope_freq_base_train;
|
||||||
float rope_freq_scale_train;
|
float rope_freq_scale_train;
|
||||||
uint32_t n_yarn_orig_ctx;
|
uint32_t n_yarn_orig_ctx;
|
||||||
int8_t rope_scaling_type_train : 3;
|
int32_t rope_scaling_type_train;
|
||||||
bool rope_finetuned : 1;
|
|
||||||
|
|
||||||
float f_clamp_kqv;
|
float f_clamp_kqv;
|
||||||
float f_max_alibi_bias;
|
float f_max_alibi_bias;
|
||||||
@ -2701,7 +2701,7 @@ struct llama_model_loader {
|
|||||||
// load LLaMA models
|
// load LLaMA models
|
||||||
//
|
//
|
||||||
|
|
||||||
static std::string llama_model_arch_name(llm_arch arch) {
|
static const char * llama_model_arch_name(llm_arch arch) {
|
||||||
auto it = LLM_ARCH_NAMES.find(arch);
|
auto it = LLM_ARCH_NAMES.find(arch);
|
||||||
if (it == LLM_ARCH_NAMES.end()) {
|
if (it == LLM_ARCH_NAMES.end()) {
|
||||||
return "unknown";
|
return "unknown";
|
||||||
@ -3310,11 +3310,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|||||||
const auto & hparams = model.hparams;
|
const auto & hparams = model.hparams;
|
||||||
const auto & vocab = model.vocab;
|
const auto & vocab = model.vocab;
|
||||||
|
|
||||||
const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
|
const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
|
||||||
|
|
||||||
// hparams
|
// hparams
|
||||||
LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
|
LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
|
||||||
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
|
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
|
||||||
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
|
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
|
||||||
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
||||||
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
|
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
|
||||||
@ -3336,7 +3336,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|||||||
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
||||||
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
||||||
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
||||||
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
|
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
||||||
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
||||||
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
||||||
LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
|
LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
|
||||||
@ -10735,7 +10735,7 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
|
|||||||
|
|
||||||
int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
||||||
return snprintf(buf, buf_size, "%s %s %s",
|
return snprintf(buf, buf_size, "%s %s %s",
|
||||||
llama_model_arch_name(model->arch).c_str(),
|
llama_model_arch_name(model->arch),
|
||||||
llama_model_type_name(model->type),
|
llama_model_type_name(model->type),
|
||||||
llama_model_ftype_name(model->ftype).c_str());
|
llama_model_ftype_name(model->ftype).c_str());
|
||||||
}
|
}
|
||||||
|
2
llama.h
2
llama.h
@ -213,7 +213,7 @@ extern "C" {
|
|||||||
uint32_t n_batch; // prompt processing maximum batch size
|
uint32_t n_batch; // prompt processing maximum batch size
|
||||||
uint32_t n_threads; // number of threads to use for generation
|
uint32_t n_threads; // number of threads to use for generation
|
||||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||||
int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||||
|
|
||||||
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
||||||
float rope_freq_base; // RoPE base frequency, 0 = from model
|
float rope_freq_base; // RoPE base frequency, 0 = from model
|
||||||
|
Loading…
Reference in New Issue
Block a user