Merge branch 'master' into gg/per-layer-kv

ggml-ci
Georgi Gerganov 2023-12-07 12:33:11 +02:00
commit 680a99e792
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
15 changed files with 569 additions and 176 deletions


@ -2,33 +2,14 @@
import PackageDescription import PackageDescription
#if arch(arm) || arch(arm64)
let platforms: [SupportedPlatform]? = [
.macOS(.v12),
.iOS(.v14),
.watchOS(.v4),
.tvOS(.v14)
]
let exclude: [String] = []
let resources: [Resource] = [
.process("ggml-metal.metal")
]
let additionalSources: [String] = ["ggml-metal.m"]
let additionalSettings: [CSetting] = [
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_USE_METAL")
]
#else
let platforms: [SupportedPlatform]? = nil
let exclude: [String] = ["ggml-metal.metal"]
let resources: [Resource] = []
let additionalSources: [String] = []
let additionalSettings: [CSetting] = []
#endif
let package = Package( let package = Package(
name: "llama", name: "llama",
platforms: platforms, platforms: [
.macOS(.v12),
.iOS(.v14),
.watchOS(.v4),
.tvOS(.v14)
],
products: [ products: [
.library(name: "llama", targets: ["llama"]), .library(name: "llama", targets: ["llama"]),
], ],
@ -36,25 +17,30 @@ let package = Package(
.target( .target(
name: "llama", name: "llama",
path: ".", path: ".",
exclude: exclude, exclude: [],
sources: [ sources: [
"ggml.c", "ggml.c",
"llama.cpp", "llama.cpp",
"ggml-alloc.c", "ggml-alloc.c",
"ggml-backend.c", "ggml-backend.c",
"ggml-quants.c", "ggml-quants.c",
] + additionalSources, "ggml-metal.m",
resources: resources, ],
resources: [
.process("ggml-metal.metal")
],
publicHeadersPath: "spm-headers", publicHeadersPath: "spm-headers",
cSettings: [ cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.define("GGML_USE_ACCELERATE") .define("GGML_USE_ACCELERATE"),
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_USE_METAL"),
// NOTE: NEW_LAPACK will required iOS version 16.4+ // NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14 // We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"), // .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64") // .define("ACCELERATE_LAPACK_ILP64")
] + additionalSettings, ],
linkerSettings: [ linkerSettings: [
.linkedFramework("Accelerate") .linkedFramework("Accelerate")
] ]


@ -278,6 +278,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.yarn_beta_slow = std::stof(argv[i]); params.yarn_beta_slow = std::stof(argv[i]);
} else if (arg == "--samplers") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.samplers_sequence = parse_samplers_input(argv[i]);
} else if (arg == "--sampling-seq") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.samplers_sequence = argv[i];
} else if (arg == "--top-p") { } else if (arg == "--top-p") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -682,6 +694,47 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
std::istreambuf_iterator<char>(), std::istreambuf_iterator<char>(),
std::back_inserter(sparams.grammar) std::back_inserter(sparams.grammar)
); );
} else if (arg == "--override-kv") {
if (++i >= argc) {
invalid_param = true;
break;
}
char * sep = strchr(argv[i], '=');
if (sep == nullptr || sep - argv[i] >= 128) {
fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
struct llama_model_kv_override kvo;
std::strncpy(kvo.key, argv[i], sep - argv[i]);
kvo.key[sep - argv[i]] = 0;
sep++;
if (strncmp(sep, "int:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_INT;
kvo.int_value = std::atol(sep);
} else if (strncmp(sep, "float:", 6) == 0) {
sep += 6;
kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
kvo.float_value = std::atof(sep);
} else if (strncmp(sep, "bool:", 5) == 0) {
sep += 5;
kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
if (std::strcmp(sep, "true") == 0) {
kvo.bool_value = true;
} else if (std::strcmp(sep, "false") == 0) {
kvo.bool_value = false;
} else {
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
} else {
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
invalid_param = true;
break;
}
params.kv_overrides.push_back(kvo);
#ifndef LOG_DISABLE_LOGS #ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters // Parse args for logging parameters
} else if ( log_param_single_parse( argv[i] ) ) { } else if ( log_param_single_parse( argv[i] ) ) {
@ -725,6 +778,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
} }
} }
if (!params.kv_overrides.empty()) {
params.kv_overrides.emplace_back(llama_model_kv_override());
params.kv_overrides.back().key[0] = 0;
}
return true; return true;
} }
@ -765,6 +823,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
@ -858,6 +918,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str()); printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
printf(" -ld LOGDIR, --logdir LOGDIR\n"); printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n"); printf(" path under which to save YAML logs (no logging if unset)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf("\n"); printf("\n");
#ifndef LOG_DISABLE_LOGS #ifndef LOG_DISABLE_LOGS
log_print_usage(); log_print_usage();
@ -894,6 +957,48 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
GGML_UNREACHABLE(); GGML_UNREACHABLE();
} }
//
// String parsing
//
std::string parse_samplers_input(std::string input) {
std::string output = "";
// since sampler names are written in multiple ways
// make it ready for both system names and input names
std::unordered_map<std::string, char> samplers_symbols {
{"top_k", 'k'},
{"top-k", 'k'},
{"top_p", 'p'},
{"top-p", 'p'},
{"nucleus", 'p'},
{"typical_p", 'y'},
{"typical-p", 'y'},
{"typical", 'y'},
{"min_p", 'm'},
{"min-p", 'm'},
{"tfs_z", 'f'},
{"tfs-z", 'f'},
{"tfs", 'f'},
{"temp", 't'},
{"temperature",'t'}
};
// expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
size_t separator = input.find(';');
while (separator != input.npos) {
std::string name = input.substr(0,separator);
input = input.substr(separator+1);
separator = input.find(';');
if (samplers_symbols.find(name) != samplers_symbols.end()) {
output += samplers_symbols[name];
}
}
if (samplers_symbols.find(input) != samplers_symbols.end()) {
output += samplers_symbols[input];
}
return output;
}
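
A minimal usage sketch (illustrative only, not part of this commit) of the helper above; unknown names are silently dropped, and the long-form default maps onto the default samplers_sequence:

// hypothetical caller
std::string seq = parse_samplers_input("top_k;tfs_z;typical_p;top_p;min_p;temp");
// seq == "kfypmt" (the default llama_sampling_params::samplers_sequence)
std::string seq2 = parse_samplers_input("top_k;unknown;temp");
// seq2 == "kt" -- the unrecognized name is skipped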
// //
// Model utils // Model utils
// //
@ -908,6 +1013,12 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.tensor_split = params.tensor_split; mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap; mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock; mparams.use_mlock = params.use_mlock;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
} else {
GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
mparams.kv_overrides = params.kv_overrides.data();
}
return mparams; return mparams;
} }


@ -86,6 +86,8 @@ struct gpt_params {
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files std::string logdir = ""; // directory in which to save YAML log files
std::vector<llama_model_kv_override> kv_overrides;
// TODO: avoid tuple, use struct // TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter std::string lora_base = ""; // base model path for the lora adapter
@ -144,6 +146,12 @@ std::string gpt_random_prompt(std::mt19937 & rng);
void process_escapes(std::string& input); void process_escapes(std::string& input);
//
// String parsing
//
std::string parse_samplers_input(std::string input);
// //
// Model utils // Model utils
// //


@ -190,7 +190,7 @@ namespace grammar_parser {
pos = parse_space(pos + 1, is_nested); pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
if (last_sym_start == out_elements.size()) { if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos); throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
} }
// apply transformation to previous symbol (last_sym_start to end) according to // apply transformation to previous symbol (last_sym_start to end) according to


@ -99,6 +99,56 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
return std::string(result); return std::string(result);
} }
std::string llama_sampling_order_print(const llama_sampling_params & params) {
std::string result = "CFG -> Penalties ";
if (params.mirostat == 0) {
for (auto s : params.samplers_sequence) {
switch (s) {
case 'k': result += "-> top_k "; break;
case 'f': result += "-> tfs_z "; break;
case 'y': result += "-> typical_p "; break;
case 'p': result += "-> top_p "; break;
case 'm': result += "-> min_p "; break;
case 't': result += "-> temp "; break;
default : break;
}
}
} else {
result += "-> mirostat ";
}
return result;
}
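
For illustration (not part of the diff): with mirostat disabled and the default samplers_sequence "kfypmt", the function above produces the string

CFG -> Penalties -> top_k -> tfs_z -> typical_p -> top_p -> min_p -> temp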
// no reason to expose this function in the header
static void sampler_queue(
struct llama_context * ctx_main,
const llama_sampling_params & params,
llama_token_data_array & cur_p,
size_t & min_keep) {
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p;
const float min_p = params.min_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const std::string & samplers_sequence = params.samplers_sequence;
for (auto s : samplers_sequence) {
switch (s){
case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
case 't': llama_sample_temp (ctx_main, &cur_p, temp); break;
default : break;
}
}
}
llama_token llama_sampling_sample( llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling, struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main, struct llama_context * ctx_main,
@ -109,11 +159,6 @@ llama_token llama_sampling_sample(
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const float temp = params.temp; const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p;
const float min_p = params.min_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat; const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq; const float penalty_freq = params.penalty_freq;
@ -188,12 +233,7 @@ llama_token llama_sampling_sample(
// temperature sampling // temperature sampling
size_t min_keep = std::max(1, params.n_probs); size_t min_keep = std::max(1, params.n_probs);
llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); sampler_queue(ctx_main, params, cur_p, min_keep);
llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep);
llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep);
llama_sample_temp (ctx_main, &cur_p, temp);
id = llama_sample_token(ctx_main, &cur_p); id = llama_sample_token(ctx_main, &cur_p);


@ -10,22 +10,23 @@
// sampling parameters // sampling parameters
typedef struct llama_sampling_params { typedef struct llama_sampling_params {
int32_t n_prev = 64; // number of previous tokens to remember int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t top_k = 40; // <= 0 to use vocab size int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // 1.0 = disabled float temp = 0.80f; // 1.0 = disabled
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.10f; // 1.0 = disabled float penalty_repeat = 1.10f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = true; // consider newlines as a repeatable token bool penalize_nl = true; // consider newlines as a repeatable token
std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
std::string grammar; // optional BNF-like grammar to constrain sampling std::string grammar; // optional BNF-like grammar to constrain sampling
@ -80,6 +81,9 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
// Print sampling parameters into a string // Print sampling parameters into a string
std::string llama_sampling_print(const llama_sampling_params & params); std::string llama_sampling_print(const llama_sampling_params & params);
// Print sampling order into a string
std::string llama_sampling_order_print(const llama_sampling_params & params);
// this is a common sampling function used across the examples for convenience // this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function // it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call // Note: When using multiple sequences, it is the caller's responsibility to call


@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
llama_print_timings(context) llama_print_timings(context)
private func tokenize(text: String, add_bos: Bool) -> [llama_token] { private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let n_tokens = text.count + (add_bos ? 1 : 0) let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens) let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
var swiftTokens: [llama_token] = [] var swiftTokens: [llama_token] = []
for i in 0 ..< tokenCount { for i in 0 ..< tokenCount {
swiftTokens.append(tokens[Int(i)]) swiftTokens.append(tokens[Int(i)])


@ -11,6 +11,8 @@ actor LlamaContext {
private var context: OpaquePointer private var context: OpaquePointer
private var batch: llama_batch private var batch: llama_batch
private var tokens_list: [llama_token] private var tokens_list: [llama_token]
/// This variable is used to store temporarily invalid cchars
private var temporary_invalid_cchars: [CChar]
var n_len: Int32 = 512 var n_len: Int32 = 512
var n_cur: Int32 = 0 var n_cur: Int32 = 0
@ -21,6 +23,7 @@ actor LlamaContext {
self.context = context self.context = context
self.tokens_list = [] self.tokens_list = []
self.batch = llama_batch_init(512, 0, 1) self.batch = llama_batch_init(512, 0, 1)
self.temporary_invalid_cchars = []
} }
deinit { deinit {
@ -61,6 +64,7 @@ actor LlamaContext {
print("attempting to complete \"\(text)\"") print("attempting to complete \"\(text)\"")
tokens_list = tokenize(text: text, add_bos: true) tokens_list = tokenize(text: text, add_bos: true)
temporary_invalid_cchars = []
let n_ctx = llama_n_ctx(context) let n_ctx = llama_n_ctx(context)
let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count) let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
@ -72,7 +76,7 @@ actor LlamaContext {
} }
for id in tokens_list { for id in tokens_list {
print(token_to_piece(token: id)) print(String(cString: token_to_piece(token: id) + [0]))
} }
// batch = llama_batch_init(512, 0) // done in init() // batch = llama_batch_init(512, 0) // done in init()
@ -115,10 +119,25 @@ actor LlamaContext {
if new_token_id == llama_token_eos(context) || n_cur == n_len { if new_token_id == llama_token_eos(context) || n_cur == n_len {
print("\n") print("\n")
return "" let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
return new_token_str
} }
let new_token_str = token_to_piece(token: new_token_id) let new_token_cchars = token_to_piece(token: new_token_id)
temporary_invalid_cchars.append(contentsOf: new_token_cchars)
let new_token_str: String
if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
temporary_invalid_cchars.removeAll()
new_token_str = string
} else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
// in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
let string = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
new_token_str = string
} else {
new_token_str = ""
}
print(new_token_str) print(new_token_str)
// tokens_list.append(new_token_id) // tokens_list.append(new_token_id)
@ -144,12 +163,14 @@ actor LlamaContext {
func clear() { func clear() {
tokens_list.removeAll() tokens_list.removeAll()
temporary_invalid_cchars.removeAll()
} }
private func tokenize(text: String, add_bos: Bool) -> [llama_token] { private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let n_tokens = text.count + (add_bos ? 1 : 0) let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens) let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false) let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
var swiftTokens: [llama_token] = [] var swiftTokens: [llama_token] = []
for i in 0..<tokenCount { for i in 0..<tokenCount {
@ -161,7 +182,8 @@ actor LlamaContext {
return swiftTokens return swiftTokens
} }
private func token_to_piece(token: llama_token) -> String { /// - note: The result does not contain null-terminator
private func token_to_piece(token: llama_token) -> [CChar] {
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8) let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
result.initialize(repeating: Int8(0), count: 8) result.initialize(repeating: Int8(0), count: 8)
defer { defer {
@ -175,10 +197,12 @@ actor LlamaContext {
defer { defer {
newResult.deallocate() newResult.deallocate()
} }
_ = llama_token_to_piece(model, token, newResult, -nTokens) let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
return String(cString: newResult) let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer)
} else { } else {
return String(cString: result) let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
return Array(bufferPointer)
} }
} }
} }


@ -437,6 +437,7 @@ int main(int argc, char ** argv) {
} }
} }
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n"); LOG_TEE("\n\n");


@ -2383,6 +2383,7 @@ json oaicompat_completion_params_parse(
// Map OpenAI parameters to llama.cpp parameters // Map OpenAI parameters to llama.cpp parameters
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
llama_params["temperature"] = json_value(body, "temperature", 0.8); llama_params["temperature"] = json_value(body, "temperature", 0.8);
llama_params["top_k"] = json_value(body, "top_k", 40); llama_params["top_k"] = json_value(body, "top_k", 40);
llama_params["top_p"] = json_value(body, "top_p", 0.95); llama_params["top_p"] = json_value(body, "top_p", 0.95);


@ -75,7 +75,7 @@ int main(int argc, char ** argv) {
// make sure the KV cache is big enough to hold all the prompt and generated tokens // make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) { if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
return 1; return 1;
} }


@ -203,8 +203,9 @@ int main(int argc, char ** argv) {
const std::string token_str = llama_token_to_piece(ctx_tgt, id); const std::string token_str = llama_token_to_piece(ctx_tgt, id);
printf("%s", token_str.c_str()); if (!params.use_color) {
fflush(stdout); printf("%s", token_str.c_str());
}
if (id == llama_token_eos(model_tgt)) { if (id == llama_token_eos(model_tgt)) {
has_eos = true; has_eos = true;
@ -236,10 +237,18 @@ int main(int argc, char ** argv) {
++n_past_tgt; ++n_past_tgt;
++n_past_dft; ++n_past_dft;
++i_dft; ++i_dft;
if (params.use_color) {
// Color token according to its origin sequence
printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
fflush(stdout);
}
continue; continue;
} }
} }
if (params.use_color) {
printf("%s", token_str.c_str());
}
fflush(stdout);
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());


@ -1295,10 +1295,6 @@ int main(int argc, char ** argv) {
opt_cb_data.last_save_iter = opt->iter; opt_cb_data.last_save_iter = opt->iter;
} }
if (alloc) {
ggml_allocr_free(alloc);
}
ggml_free(opt->ctx); ggml_free(opt->ctx);
free_train_state(train); free_train_state(train);
ggml_free(model.ctx); ggml_free(model.ctx);

llama.cpp

@ -74,6 +74,7 @@
#include <set> #include <set>
#include <sstream> #include <sstream>
#include <thread> #include <thread>
#include <type_traits>
#include <unordered_map> #include <unordered_map>
#if defined(_MSC_VER) #if defined(_MSC_VER)
@ -590,21 +591,6 @@ struct LLM_TN {
// gguf helpers // gguf helpers
// //
#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
do { \
const std::string skey(key); \
const int kid = gguf_find_key(ctx, skey.c_str()); \
if (kid >= 0) { \
enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
if (ktype != (type)) { \
throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
} \
(dst) = func(ctx, kid); \
} else if (req) { \
throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
} \
} while (0)
static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = { static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
{ LLAMA_ROPE_SCALING_NONE, "none" }, { LLAMA_ROPE_SCALING_NONE, "none" },
{ LLAMA_ROPE_SCALING_LINEAR, "linear" }, { LLAMA_ROPE_SCALING_LINEAR, "linear" },
@ -638,7 +624,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
} }
} }
static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) { static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
switch (type) { switch (type) {
@ -1809,6 +1795,169 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
return buf; return buf;
} }
namespace GGUFMeta {
template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
struct GKV_Base_Type {
static constexpr gguf_type gt = gt_;
static T getter(const gguf_context * ctx, const int kid) {
return gfun(ctx, kid);
}
};
template<typename T> struct GKV_Base;
template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {};
template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {};
template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {};
template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {};
template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {};
template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {};
template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {};
template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {};
template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {};
template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {};
template<> struct GKV_Base<std::string> {
static constexpr gguf_type gt = GGUF_TYPE_STRING;
static std::string getter(const gguf_context * ctx, const int kid) {
return gguf_get_val_str(ctx, kid);
}
};
struct ArrayInfo{
const gguf_type gt;
const size_t length;
const void * data;
};
template<> struct GKV_Base<ArrayInfo> {
public:
static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
static ArrayInfo getter(const gguf_context *ctx, const int k) {
return ArrayInfo {
gguf_get_arr_type(ctx, k),
size_t(gguf_get_arr_n(ctx, k)),
gguf_get_arr_data(ctx, k),
};
}
};
template<typename T>
class GKV: public GKV_Base<T> {
GKV() = delete;
public:
static T get_kv(const gguf_context * ctx, const int k) {
const enum gguf_type kt = gguf_get_kv_type(ctx, k);
if (kt != GKV::gt) {
throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
}
return GKV::getter(ctx, k);
}
static const char * override_type_to_str(const llama_model_kv_override_type ty) {
switch (ty) {
case LLAMA_KV_OVERRIDE_BOOL: return "bool";
case LLAMA_KV_OVERRIDE_INT: return "int";
case LLAMA_KV_OVERRIDE_FLOAT: return "float";
}
return "unknown";
}
static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
if (!override) { return false; }
if (override->tag == expected_type) {
LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
__func__, override_type_to_str(override->tag), override->key);
switch (override->tag) {
case LLAMA_KV_OVERRIDE_BOOL: {
printf("%s\n", override->bool_value ? "true" : "false");
} break;
case LLAMA_KV_OVERRIDE_INT: {
printf("%" PRId64 "\n", override->int_value);
} break;
case LLAMA_KV_OVERRIDE_FLOAT: {
printf("%.6f\n", override->float_value);
} break;
default:
// Shouldn't be possible to end up here, but just in case...
throw std::runtime_error(
format("Unsupported attempt to override %s type for metadata key %s\n",
override_type_to_str(override->tag), override->key));
}
return true;
}
LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
__func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
return false;
}
template<typename OT>
static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
try_override(OT & target, const struct llama_model_kv_override *override) {
if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
target = override->bool_value;
return true;
}
return false;
}
template<typename OT>
static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
try_override(OT & target, const struct llama_model_kv_override *override) {
if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
target = override->int_value;
return true;
}
return false;
}
template<typename OT>
static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
try_override(T & target, const struct llama_model_kv_override *override) {
if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
target = override->float_value;
return true;
}
return false;
}
template<typename OT>
static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
try_override(T & target, const struct llama_model_kv_override *override) {
(void)target;
(void)override;
if (!override) { return false; }
// Currently, we should never end up here so it would be a bug if we do.
throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
override ? override->key : "NULL"));
}
static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
if (try_override<T>(target, override)) {
return true;
}
if (k < 0) { return false; }
target = get_kv(ctx, k);
return true;
}
static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
return set(ctx, gguf_find_key(ctx, key), target, override);
}
static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
return set(ctx, key.c_str(), target, override);
}
};
}
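
A minimal sketch (illustrative, not part of this commit) of how the GKV helper replaces the old GGUF_GET_KEY macro: the template parameter selects the matching gguf getter at compile time, and a non-null override short-circuits the file lookup. The key names below are examples and assume the usual llama GGUF layout; ctx_gguf is a loaded const gguf_context *:

uint32_t n_ctx_train = 0;
// reads "llama.context_length" as a u32, or applies a matching int override; throws on a type mismatch
GGUFMeta::GKV<uint32_t>::set(ctx_gguf, "llama.context_length", n_ctx_train, /*override =*/ nullptr);

std::string arch;
// strings cannot be overridden; the value always comes from the file
GGUFMeta::GKV<std::string>::set(ctx_gguf, "general.architecture", arch);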
struct llama_model_loader { struct llama_model_loader {
int n_kv = 0; int n_kv = 0;
int n_tensors = 0; int n_tensors = 0;
@ -1824,21 +1973,34 @@ struct llama_model_loader {
llama_fver fver; llama_fver fver;
std::unique_ptr<llama_mmap> mapping; std::unique_ptr<llama_mmap> mapping;
std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
struct gguf_context * ctx_gguf = NULL; struct gguf_context * ctx_gguf = NULL;
struct ggml_context * ctx_meta = NULL; struct ggml_context * ctx_meta = NULL;
llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") { std::string arch_name;
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
struct gguf_init_params params = { struct gguf_init_params params = {
/*.no_alloc = */ true, /*.no_alloc = */ true,
/*.ctx = */ &ctx_meta, /*.ctx = */ &ctx_meta,
}; };
if (param_overrides_p != nullptr) {
for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
kv_overrides.insert({std::string(p->key), *p});
}
}
ctx_gguf = gguf_init_from_file(fname.c_str(), params); ctx_gguf = gguf_init_from_file(fname.c_str(), params);
if (!ctx_gguf) { if (!ctx_gguf) {
throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
} }
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
n_kv = gguf_get_n_kv(ctx_gguf); n_kv = gguf_get_n_kv(ctx_gguf);
n_tensors = gguf_get_n_tensors(ctx_gguf); n_tensors = gguf_get_n_tensors(ctx_gguf);
@ -1906,6 +2068,7 @@ struct llama_model_loader {
} }
} }
LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) { for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx_gguf, i); const char * name = gguf_get_key(ctx_gguf, i);
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
@ -1951,19 +2114,59 @@ struct llama_model_loader {
} }
} }
template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
get_arr_n(const std::string & key, T & result, const bool required = true) {
const int kid = gguf_find_key(ctx_gguf, key.c_str());
if (kid < 0) {
if (required) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
}
return false;
}
struct GGUFMeta::ArrayInfo arr_info =
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
result = arr_info.length;
return true;
}
template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
return get_arr_n(llm_kv(kid), result, required);
}
template<typename T>
bool get_key(const std::string & key, T & result, const bool required = true) {
auto it = kv_overrides.find(key);
const struct llama_model_kv_override * override =
it != kv_overrides.end() ? &it->second : nullptr;
const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
if (required && !found) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
}
return found;
}
template<typename T>
bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
return get_key(llm_kv(kid), result, required);
}
std::string get_arch_name() const { std::string get_arch_name() const {
const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
std::string arch_name;
GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
return arch_name; return arch_name;
} }
enum llm_arch get_arch() const { enum llm_arch get_arch() const {
const std::string arch_name = get_arch_name(); return llm_kv.arch;
return llm_arch_from_string(arch_name);
} }
const char * get_tensor_name(int i) const { const char * get_tensor_name(int i) const {
@ -2213,11 +2416,8 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
static void llm_load_hparams( static void llm_load_hparams(
llama_model_loader & ml, llama_model_loader & ml,
llama_model & model) { llama_model & model) {
struct gguf_context * ctx = ml.ctx_gguf;
const auto kv = LLM_KV(model.arch);
auto & hparams = model.hparams; auto & hparams = model.hparams;
const gguf_context * ctx = ml.ctx_gguf;
// get metadata as string // get metadata as string
for (int i = 0; i < gguf_get_n_kv(ctx); i++) { for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@ -2231,42 +2431,41 @@ static void llm_load_hparams(
} }
// get general kv // get general kv
GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME)); ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
// get hparams kv // get hparams kv
GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST)); ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH)); ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
// n_head_kv is optional, default to n_head // n_head_kv is optional, default to n_head
hparams.n_head_kv = hparams.n_head; hparams.n_head_kv = hparams.n_head;
GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);
hparams.rope_finetuned = false; bool rope_finetuned = false;
GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false, ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
kv(LLM_KV_ROPE_SCALING_FINETUNED)); hparams.rope_finetuned = rope_finetuned;
hparams.n_yarn_orig_ctx = hparams.n_ctx_train; hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
// rope_freq_base (optional) // rope_freq_base (optional)
hparams.rope_freq_base_train = 10000.0f; hparams.rope_freq_base_train = 10000.0f;
GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
std::string rope_scaling("linear"); std::string rope_scaling("linear");
GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE)); ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
// rope_freq_scale (inverse of the kv) is optional // rope_freq_scale (inverse of the kv) is optional
float ropescale = 0.0f; float ropescale = 0.0f;
GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
if (ropescale == 0.0f) { // try the old key name // try the old key name
GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
} }
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
@ -2274,7 +2473,7 @@ static void llm_load_hparams(
{ {
hparams.n_rot = hparams.n_embd / hparams.n_head; hparams.n_rot = hparams.n_embd / hparams.n_head;
GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) { if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd / hparams.n_head) { if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
@ -2289,7 +2488,7 @@ static void llm_load_hparams(
switch (model.arch) { switch (model.arch) {
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
{ {
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 26: model.type = e_model::MODEL_3B; break; case 26: model.type = e_model::MODEL_3B; break;
@ -2303,7 +2502,7 @@ static void llm_load_hparams(
} break; } break;
case LLM_ARCH_FALCON: case LLM_ARCH_FALCON:
{ {
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 32: model.type = e_model::MODEL_7B; break; case 32: model.type = e_model::MODEL_7B; break;
@ -2313,7 +2512,7 @@ static void llm_load_hparams(
} break; } break;
case LLM_ARCH_BAICHUAN: case LLM_ARCH_BAICHUAN:
{ {
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 32: model.type = e_model::MODEL_7B; break; case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_13B; break; case 40: model.type = e_model::MODEL_13B; break;
@ -2322,7 +2521,7 @@ static void llm_load_hparams(
} break; } break;
case LLM_ARCH_STARCODER: case LLM_ARCH_STARCODER:
{ {
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_1B; break; case 24: model.type = e_model::MODEL_1B; break;
case 36: model.type = e_model::MODEL_3B; break; case 36: model.type = e_model::MODEL_3B; break;
@ -2333,7 +2532,7 @@ static void llm_load_hparams(
} break; } break;
case LLM_ARCH_PERSIMMON: case LLM_ARCH_PERSIMMON:
{ {
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 36: model.type = e_model::MODEL_8B; break; case 36: model.type = e_model::MODEL_8B; break;
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
@ -2341,7 +2540,7 @@ static void llm_load_hparams(
} break; } break;
case LLM_ARCH_REFACT: case LLM_ARCH_REFACT:
{ {
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 32: model.type = e_model::MODEL_1B; break; case 32: model.type = e_model::MODEL_1B; break;
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
@ -2349,7 +2548,7 @@ static void llm_load_hparams(
} break; } break;
case LLM_ARCH_BLOOM: case LLM_ARCH_BLOOM:
{ {
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_1B; break; case 24: model.type = e_model::MODEL_1B; break;
@ -2364,9 +2563,9 @@ static void llm_load_hparams(
{ {
hparams.f_clamp_kqv = 0.0f; hparams.f_clamp_kqv = 0.0f;
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV)); ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS)); ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 32: model.type = e_model::MODEL_7B; break; case 32: model.type = e_model::MODEL_7B; break;
@ -2376,7 +2575,7 @@ static void llm_load_hparams(
} break; } break;
case LLM_ARCH_STABLELM: case LLM_ARCH_STABLELM:
{ {
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 32: model.type = e_model::MODEL_3B; break; case 32: model.type = e_model::MODEL_3B; break;
@ -2385,7 +2584,8 @@ static void llm_load_hparams(
} break; } break;
case LLM_ARCH_QWEN: case LLM_ARCH_QWEN:
{ {
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 32: model.type = e_model::MODEL_7B; break; case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_13B; break; case 40: model.type = e_model::MODEL_13B; break;
@ -2433,7 +2633,7 @@ static void llm_load_vocab(
{ {
std::string tokenizer_name; std::string tokenizer_name;
GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL)); ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
if (tokenizer_name == "llama") { if (tokenizer_name == "llama") {
vocab.type = LLAMA_VOCAB_TYPE_SPM; vocab.type = LLAMA_VOCAB_TYPE_SPM;
@ -2523,34 +2723,31 @@ static void llm_load_vocab(
}; };
for (const auto & it : special_token_types) { for (const auto & it : special_token_types) {
const std::string & key = kv(std::get<0>(it)); const std::string & key = kv(std::get<0>(it));
int32_t & id = std::get<1>(it), old_id = id; int32_t & id = std::get<1>(it);
GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key); uint32_t new_id;
// Must be >= -1 and < vocab size. Since the key is unsigned, -1 if (!ml.get_key(std::get<0>(it), new_id, false)) {
// can only come from the default value, so there's no point in continue;
// validating that. }
if (size_t(id + 1) > vocab.id_to_token.size()) { if (new_id >= vocab.id_to_token.size()) {
LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n", LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
__func__, key.c_str(), id, old_id); __func__, key.c_str(), new_id, id);
id = old_id; } else {
id = new_id;
} }
} }
// Handle add_bos_token and add_eos_token // Handle add_bos_token and add_eos_token
std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS); {
int kid = gguf_find_key(ctx, key.c_str()); bool temp = true;
enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1; if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) { vocab.special_add_bos = int(temp);
LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str()); }
} if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
key = kv(LLM_KV_TOKENIZER_ADD_EOS); vocab.special_add_eos = int(temp);
kid = gguf_find_key(ctx, key.c_str()); }
ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
} }
} }
@ -3429,7 +3626,7 @@ static void llm_load_tensors(
static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
try { try {
llama_model_loader ml(fname, params.use_mmap); llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
model.hparams.vocab_only = params.vocab_only; model.hparams.vocab_only = params.vocab_only;
@ -6605,14 +6802,13 @@ struct llama_grammar_candidate {
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`. // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8( static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const char * src, const std::string & src,
size_t n_src,
llama_partial_utf8 partial_start) { llama_partial_utf8 partial_start) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 }; static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
const char * pos = src; const char * pos = src.c_str();
std::vector<uint32_t> code_points; std::vector<uint32_t> code_points;
// common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0. // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
code_points.reserve(n_src + 1); code_points.reserve(src.size() + 1);
uint32_t value = partial_start.value; uint32_t value = partial_start.value;
int n_remain = partial_start.n_remain; int n_remain = partial_start.n_remain;
@ -6663,13 +6859,6 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain }); return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
} }
static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
std::string src,
llama_partial_utf8 partial_start
) {
return decode_utf8(src.c_str(), src.size(), partial_start);
}
// returns true iff pos points to the end of one of the definitions of a rule // returns true iff pos points to the end of one of the definitions of a rule
static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) { static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
switch (pos->type) { switch (pos->type) {
@ -7308,11 +7497,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
const llama_token eos = llama_token_eos(&ctx->model); const llama_token eos = llama_token_eos(&ctx->model);
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded; std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
candidates_decoded.reserve(candidates->size);
std::vector<llama_grammar_candidate> candidates_grammar; std::vector<llama_grammar_candidate> candidates_grammar;
candidates_grammar.reserve(candidates->size);
for (size_t i = 0; i < candidates->size; ++i) { for (size_t i = 0; i < candidates->size; ++i) {
const llama_token id = candidates->data[i].id; const llama_token id = candidates->data[i].id;
const std::string piece = llama_token_to_piece(ctx, id); const std::string & piece = ctx->model.vocab.id_to_token[id].text;
if (id == eos) { if (id == eos) {
if (!allow_eos) { if (!allow_eos) {
candidates->data[i].logit = -INFINITY; candidates->data[i].logit = -INFINITY;
@ -7524,7 +7715,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
GGML_ASSERT(false); GGML_ASSERT(false);
} }
const std::string piece = llama_token_to_piece(ctx, token); const std::string & piece = ctx->model.vocab.id_to_token[token].text;
// Note terminating 0 in decoded string // Note terminating 0 in decoded string
const auto decoded = decode_utf8(piece, grammar->partial_utf8); const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@ -8029,7 +8220,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
constexpr bool use_mmap = false; constexpr bool use_mmap = false;
#endif #endif
llama_model_loader ml(fname_inp, use_mmap); llama_model_loader ml(fname_inp, use_mmap, NULL);
if (ml.use_mmap) { if (ml.use_mmap) {
ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa())); ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
} }
@ -8325,7 +8516,7 @@ static int llama_apply_lora_from_file_internal(
std::vector<uint8_t> base_buf; std::vector<uint8_t> base_buf;
if (path_base_model) { if (path_base_model) {
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true)); ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
size_t ctx_size; size_t ctx_size;
size_t mmapped_size; size_t mmapped_size;
@ -8553,6 +8744,7 @@ struct llama_model_params llama_model_default_params() {
/*.tensor_split =*/ nullptr, /*.tensor_split =*/ nullptr,
/*.progress_callback =*/ nullptr, /*.progress_callback =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr, /*.progress_callback_user_data =*/ nullptr,
/*.kv_overrides =*/ nullptr,
/*.vocab_only =*/ false, /*.vocab_only =*/ false,
/*.use_mmap =*/ true, /*.use_mmap =*/ true,
/*.use_mlock =*/ false, /*.use_mlock =*/ false,

llama.h

@ -158,6 +158,22 @@ extern "C" {
llama_seq_id all_seq_id; // used if seq_id == NULL llama_seq_id all_seq_id; // used if seq_id == NULL
} llama_batch; } llama_batch;
enum llama_model_kv_override_type {
LLAMA_KV_OVERRIDE_INT,
LLAMA_KV_OVERRIDE_FLOAT,
LLAMA_KV_OVERRIDE_BOOL,
};
struct llama_model_kv_override {
char key[128];
enum llama_model_kv_override_type tag;
union {
int64_t int_value;
double float_value;
bool bool_value;
};
};
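
A minimal caller-side sketch (illustrative, not part of this change) of populating kv_overrides through the C API; as in common.cpp above, the array must be terminated by an entry whose key is an empty string:

// requires <cstring>, <vector> and llama.h
std::vector<llama_model_kv_override> kv_overrides;

llama_model_kv_override kvo;
std::strncpy(kvo.key, "tokenizer.ggml.add_bos_token", sizeof(kvo.key));
kvo.key[sizeof(kvo.key) - 1] = 0;
kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
kvo.bool_value = false;
kv_overrides.push_back(kvo);

// empty-key sentinel terminates the list (see the llama_model_loader loop above)
kv_overrides.emplace_back();
kv_overrides.back().key[0] = 0;

llama_model_params mparams = llama_model_default_params();
mparams.kv_overrides = kv_overrides.data();
// model = llama_load_model_from_file("model.gguf", mparams);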
struct llama_model_params { struct llama_model_params {
int32_t n_gpu_layers; // number of layers to store in VRAM int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t main_gpu; // the GPU that is used for scratch and small tensors int32_t main_gpu; // the GPU that is used for scratch and small tensors
@ -165,9 +181,13 @@ extern "C" {
// called with a progress value between 0 and 1, pass NULL to disable // called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback; llama_progress_callback progress_callback;
// context pointer passed to the progress callback // context pointer passed to the progress callback
void * progress_callback_user_data; void * progress_callback_user_data;
// override key-value pairs of the model meta data
const struct llama_model_kv_override * kv_overrides;
// Keep the booleans together to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible bool use_mmap; // use mmap if possible