mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-13 20:14:29 +00:00
llama : scatter llama.cpp into multiple modules (wip)
This commit is contained in:
parent
86bf31cfe6
commit
f9b0e3b382
@ -9,9 +9,16 @@ llama_add_compile_flags()
|
||||
add_library(llama
|
||||
../include/llama.h
|
||||
llama.cpp
|
||||
llama-vocab.cpp
|
||||
llama-arch.cpp
|
||||
llama-batch.cpp
|
||||
llama-context.cpp
|
||||
llama-control-vector.cpp
|
||||
llama-grammar.cpp
|
||||
llama-kv-cache.cpp
|
||||
llama-mmap.cpp
|
||||
llama-model.cpp
|
||||
llama-sampling.cpp
|
||||
llama-vocab.cpp
|
||||
unicode.h
|
||||
unicode.cpp
|
||||
unicode-data.cpp
|
||||
|
1
src/llama-arch.cpp
Normal file
1
src/llama-arch.cpp
Normal file
@ -0,0 +1 @@
|
||||
#include "llama-arch.h"
|
1714
src/llama-arch.h
Normal file
1714
src/llama-arch.h
Normal file
File diff suppressed because it is too large
Load Diff
1
src/llama-batch.cpp
Normal file
1
src/llama-batch.cpp
Normal file
@ -0,0 +1 @@
|
||||
#include "llama-batch.h"
|
330
src/llama-batch.h
Normal file
330
src/llama-batch.h
Normal file
@ -0,0 +1,330 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
// very similar to llama_batch,
|
||||
// but has more metadata about sequences
|
||||
struct llama_ubatch {
|
||||
bool equal_seqs;
|
||||
// TODO: whole_seqs for embeddings?
|
||||
|
||||
uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
|
||||
uint32_t n_seq_tokens; // tokens per sequence
|
||||
uint32_t n_seqs;
|
||||
|
||||
llama_token * token; // [n_tokens]
|
||||
float * embd; // [n_embd, n_tokens]
|
||||
llama_pos * pos; // [n_tokens]
|
||||
int32_t * n_seq_id; // [n_seqs]
|
||||
llama_seq_id ** seq_id; // [n_seqs]
|
||||
int8_t * output; // [n_tokens]
|
||||
};
|
||||
|
||||
struct llama_sbatch_seq {
|
||||
int32_t n_seq_id;
|
||||
llama_seq_id * seq_id;
|
||||
size_t offset;
|
||||
size_t length;
|
||||
};
|
||||
|
||||
// sequence-length-aware batch splitting
|
||||
struct llama_sbatch {
|
||||
// tokens left in this batch
|
||||
size_t n_tokens;
|
||||
|
||||
size_t n_embd;
|
||||
|
||||
bool logits_all; // TODO: remove once lctx.logits_all is removed too
|
||||
|
||||
// sorted indices into the batch
|
||||
std::vector<size_t> ids;
|
||||
// batch indices of the output
|
||||
std::vector<size_t> out_ids;
|
||||
std::vector<llama_sbatch_seq> seq;
|
||||
|
||||
const llama_batch * batch = nullptr;
|
||||
|
||||
// buffers for the ubatch
|
||||
std::vector<llama_token> ubatch_token;
|
||||
std::vector<float> ubatch_embd;
|
||||
std::vector<llama_pos> ubatch_pos;
|
||||
std::vector<int32_t> ubatch_n_seq_id;
|
||||
std::vector<llama_seq_id *> ubatch_seq_id;
|
||||
std::vector<int8_t> ubatch_output;
|
||||
|
||||
llama_ubatch reserve_ubatch(size_t n_ubatch, bool has_embd = false) {
|
||||
// clear empty sequences
|
||||
// the previous ubatch is assumed to be gone,
|
||||
// so nothing should refer to values in these sequences anymore.
|
||||
for (size_t i = seq.size(); i-- > 0;) {
|
||||
if (seq[i].length == 0) {
|
||||
seq.pop_back();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
ubatch_token.resize(!has_embd ? n_ubatch : 0);
|
||||
ubatch_embd.resize(has_embd ? n_embd * n_ubatch : 0);
|
||||
ubatch_pos.resize(n_ubatch);
|
||||
ubatch_n_seq_id.resize(n_ubatch);
|
||||
ubatch_seq_id.resize(n_ubatch);
|
||||
ubatch_output.resize(n_ubatch);
|
||||
llama_ubatch ubatch = {
|
||||
/*equal_seqs =*/ true,
|
||||
/*n_tokens =*/ 0,
|
||||
/*n_seq_tokens =*/ 0,
|
||||
/*n_seqs =*/ 0,
|
||||
/*token =*/ !has_embd ? ubatch_token.data() : nullptr,
|
||||
/*embd =*/ has_embd ? ubatch_embd.data() : nullptr,
|
||||
/*pos =*/ ubatch_pos.data(),
|
||||
/*n_seq_id =*/ ubatch_n_seq_id.data(),
|
||||
/*seq_id =*/ ubatch_seq_id.data(),
|
||||
/*output =*/ ubatch_output.data(),
|
||||
};
|
||||
return ubatch;
|
||||
}
|
||||
|
||||
void add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length) {
|
||||
GGML_ASSERT(batch != nullptr);
|
||||
GGML_ASSERT(length <= seq.length);
|
||||
// Can only add sequences of equal lengths to a batch,
|
||||
// otherwise it isn't clear to which sequence a token belongs
|
||||
GGML_ASSERT(seq.n_seq_id == 0 || ubatch.n_seqs == 0 || length == (size_t) ubatch.n_tokens / ubatch.n_seqs);
|
||||
GGML_ASSERT((seq.n_seq_id != 0) == ubatch.equal_seqs);
|
||||
// NOTE: loops are separated for cache-friendliness
|
||||
if (batch->token) {
|
||||
if (ubatch.equal_seqs) {
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
ubatch.token[ubatch.n_tokens + i] = batch->token[ids[seq.offset + i]];
|
||||
}
|
||||
} else {
|
||||
// simple split
|
||||
ubatch.token = batch->token + seq.offset;
|
||||
}
|
||||
} else {
|
||||
ubatch.token = nullptr;
|
||||
}
|
||||
if (batch->embd) {
|
||||
if (ubatch.equal_seqs) {
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
memcpy(
|
||||
ubatch.embd + n_embd * (ubatch.n_tokens + i),
|
||||
batch->embd + n_embd * ids[seq.offset + i],
|
||||
n_embd * sizeof(float)
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// simple split
|
||||
ubatch.embd = batch->embd + (n_embd * seq.offset);
|
||||
}
|
||||
} else {
|
||||
ubatch.embd = nullptr;
|
||||
}
|
||||
if (ubatch.equal_seqs) {
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
|
||||
}
|
||||
} else {
|
||||
// simple split
|
||||
ubatch.pos = batch->pos + seq.offset;
|
||||
}
|
||||
if (ubatch.equal_seqs) {
|
||||
ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id;
|
||||
if (seq.seq_id) {
|
||||
ubatch.seq_id[ubatch.n_seqs] = seq.seq_id;
|
||||
}
|
||||
} else {
|
||||
// simple split
|
||||
if (batch->n_seq_id) {
|
||||
ubatch.n_seq_id = batch->n_seq_id + seq.offset;
|
||||
} else {
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
ubatch.n_seq_id[ubatch.n_seqs + i] = 1;
|
||||
}
|
||||
}
|
||||
if (batch->seq_id) {
|
||||
ubatch.seq_id = batch->seq_id + seq.offset;
|
||||
}
|
||||
}
|
||||
if (logits_all) {
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
ubatch.output[ubatch.n_tokens + i] = 1;
|
||||
out_ids.push_back(ids[seq.offset + i]);
|
||||
}
|
||||
} else if (batch->logits) {
|
||||
if (ubatch.equal_seqs) {
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
size_t id = ids[seq.offset + i];
|
||||
int8_t is_output = batch->logits[id];
|
||||
ubatch.output[ubatch.n_tokens + i] = is_output;
|
||||
if (is_output) { out_ids.push_back(id); }
|
||||
}
|
||||
} else {
|
||||
// simple split
|
||||
ubatch.output = batch->logits + seq.offset;
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
if (ubatch.output[i] != 0) { out_ids.push_back(seq.offset + i); }
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// only get last output
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
size_t id = ids[seq.offset + i];
|
||||
int8_t is_last = id == ids.size() - 1;
|
||||
ubatch.output[ubatch.n_tokens + i] = is_last;
|
||||
if (is_last) { out_ids.push_back(id); }
|
||||
}
|
||||
}
|
||||
if (ubatch.n_tokens == 0 && ubatch.n_seqs == 0) {
|
||||
ubatch.n_seq_tokens = ubatch.equal_seqs ? length : 1;
|
||||
}
|
||||
ubatch.n_tokens += length;
|
||||
ubatch.n_seqs += ubatch.equal_seqs ? 1 : length; // virtual sequences for simple splits
|
||||
seq.offset += length;
|
||||
seq.length -= length;
|
||||
n_tokens -= length;
|
||||
GGML_ASSERT(ubatch.n_tokens == ubatch.n_seq_tokens * ubatch.n_seqs);
|
||||
}
|
||||
|
||||
// simple split, unknown number of sequences of unequal lengths
|
||||
llama_ubatch split_simple(size_t n_ubatch) {
|
||||
n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
|
||||
llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
|
||||
ubatch.equal_seqs = false;
|
||||
if (!seq.empty()) {
|
||||
llama_sbatch_seq & s = seq[0];
|
||||
size_t length = s.length < n_ubatch ? s.length : n_ubatch;
|
||||
GGML_ASSERT(seq.size() == 1 && s.n_seq_id == 0); // don't mix with other splits
|
||||
add_seq_to_ubatch(ubatch, s, length);
|
||||
}
|
||||
return ubatch;
|
||||
}
|
||||
|
||||
// make batches of equal-length sequences
|
||||
llama_ubatch split_equal(size_t n_ubatch) {
|
||||
n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
|
||||
llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
|
||||
if (!seq.empty()) {
|
||||
size_t length = 0;
|
||||
size_t n_tokens_in_ubatch = 0;
|
||||
GGML_ASSERT(seq[0].n_seq_id > 0); // should not be mixed with simple splits
|
||||
// smallest first, because it's easier to split this way;
|
||||
// starting from the end to pop in constant time.
|
||||
for (size_t i = seq.size(); i-- > 0;) {
|
||||
llama_sbatch_seq & s = seq[i];
|
||||
GGML_ASSERT(s.length > 0);
|
||||
if (length == 0) {
|
||||
length = s.length < n_ubatch ? s.length : n_ubatch;
|
||||
}
|
||||
add_seq_to_ubatch(ubatch, s, length);
|
||||
n_tokens_in_ubatch += length;
|
||||
// shared prompts can't be mixed with any of their sequences,
|
||||
// so it's safer to compute them in their own ubatch
|
||||
if (s.n_seq_id > 1) { break; }
|
||||
// stop when there isn't enough space for another sequence
|
||||
if (length + n_tokens_in_ubatch > n_ubatch) { break; }
|
||||
}
|
||||
}
|
||||
return ubatch;
|
||||
}
|
||||
|
||||
// sequence-wise split
|
||||
llama_ubatch split_seq(size_t n_ubatch) {
|
||||
n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
|
||||
llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
|
||||
if (!seq.empty()) {
|
||||
llama_sbatch_seq & s = seq[seq.size() - 1];
|
||||
size_t length = s.length < n_ubatch ? s.length : n_ubatch;
|
||||
GGML_ASSERT(s.n_seq_id > 0); // should not be mixed with simple splits
|
||||
add_seq_to_ubatch(ubatch, s, length);
|
||||
}
|
||||
return ubatch;
|
||||
}
|
||||
|
||||
void from_batch(const llama_batch & batch, const size_t n_embd, const bool simple_split = false, const bool logits_all = false) {
|
||||
GGML_ASSERT(batch.n_tokens >= 0);
|
||||
this->batch = &batch;
|
||||
this->n_embd = n_embd;
|
||||
this->logits_all = logits_all;
|
||||
|
||||
n_tokens = batch.n_tokens;
|
||||
ids.resize(n_tokens);
|
||||
out_ids.clear();
|
||||
// TODO: reserve out_ids and seq
|
||||
|
||||
for (size_t i = 0; i < n_tokens; ++i) {
|
||||
ids[i] = i;
|
||||
}
|
||||
if (simple_split) {
|
||||
seq.resize(1);
|
||||
llama_sbatch_seq & s = seq[0];
|
||||
s.n_seq_id = 0;
|
||||
s.seq_id = nullptr;
|
||||
s.offset = 0;
|
||||
s.length = n_tokens;
|
||||
return;
|
||||
}
|
||||
std::sort(ids.begin(), ids.end(),
|
||||
[&batch](size_t a, size_t b) {
|
||||
int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
|
||||
int32_t n_seq_b = batch.n_seq_id ? batch.n_seq_id[b] : 1;
|
||||
// sort by seq_id, then by pos
|
||||
if (n_seq_a == n_seq_b) {
|
||||
if (batch.seq_id) {
|
||||
for (int32_t i = 0; i < n_seq_a; ++i) {
|
||||
llama_seq_id seq_id_a = batch.seq_id[a][i];
|
||||
llama_seq_id seq_id_b = batch.seq_id[b][i];
|
||||
// smaller seq_ids go first
|
||||
if (seq_id_a != seq_id_b) {
|
||||
return seq_id_a < seq_id_b;
|
||||
}
|
||||
}
|
||||
}
|
||||
// when all else is equal, sort by pos
|
||||
if (batch.pos) {
|
||||
return batch.pos[a] < batch.pos[b];
|
||||
}
|
||||
// no pos, sort by id
|
||||
return a < b;
|
||||
}
|
||||
// shared prompts go first
|
||||
return n_seq_a > n_seq_b;
|
||||
}
|
||||
);
|
||||
// init seq
|
||||
llama_sbatch_seq * last_seq = nullptr;
|
||||
|
||||
for (size_t i = 0; i < n_tokens; ++i) {
|
||||
const size_t bi = ids[i];
|
||||
const int32_t n_seqs = batch.n_seq_id[bi];
|
||||
llama_seq_id * seq_ids = batch.seq_id[bi];
|
||||
if (last_seq != nullptr) {
|
||||
bool same = n_seqs == last_seq->n_seq_id;
|
||||
for (int32_t j = 0; same && j < n_seqs; ++j) {
|
||||
if (seq_ids[j] != last_seq->seq_id[j]) {
|
||||
same = false;
|
||||
}
|
||||
}
|
||||
if (same) {
|
||||
last_seq->length += 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1};
|
||||
seq.push_back(new_seq);
|
||||
last_seq = &seq.back();
|
||||
}
|
||||
// keep shared prompts first at the end, then sort by length descending.
|
||||
std::sort(seq.begin(), seq.end(),
|
||||
[](llama_sbatch_seq & a, llama_sbatch_seq & b) {
|
||||
if (a.n_seq_id == b.n_seq_id) {
|
||||
return a.length > b.length;
|
||||
}
|
||||
return a.n_seq_id < b.n_seq_id;
|
||||
}
|
||||
);
|
||||
}
|
||||
};
|
||||
|
970
src/llama-context.cpp
Normal file
970
src/llama-context.cpp
Normal file
@ -0,0 +1,970 @@
|
||||
#include "llama-context.h"
|
||||
|
||||
// deprecated
|
||||
size_t llama_get_state_size(struct llama_context * ctx) {
|
||||
return llama_state_get_size(ctx);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
||||
return llama_state_get_data(ctx, dst, -1);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
||||
return llama_state_set_data(ctx, src, -1);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
||||
return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
|
||||
return llama_state_save_file(ctx, path_session, tokens, n_token_count);
|
||||
}
|
||||
|
||||
// TODO: replace all non-fatal assertions with returned errors or exceptions
|
||||
struct llama_data_write {
|
||||
virtual void write(const void * src, size_t size) = 0;
|
||||
virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
|
||||
virtual size_t get_size_written() = 0;
|
||||
virtual ~llama_data_write() = default;
|
||||
|
||||
void write_string(const std::string & str) {
|
||||
uint32_t str_size = str.size();
|
||||
|
||||
write(&str_size, sizeof(str_size));
|
||||
write(str.data(), str_size);
|
||||
}
|
||||
|
||||
void write_model_info(const struct llama_context * ctx) {
|
||||
std::string arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
|
||||
write_string(arch_str);
|
||||
// TODO: add more model-specific info which should prevent loading the session file if not identical
|
||||
}
|
||||
|
||||
//void write_rng(const std::mt19937 & rng) {
|
||||
// std::ostringstream rng_ss;
|
||||
// rng_ss << rng;
|
||||
|
||||
// const std::string & rng_str = rng_ss.str();
|
||||
|
||||
// write_string(rng_str);
|
||||
//}
|
||||
|
||||
void write_output_ids(struct llama_context * ctx) {
|
||||
llama_output_reorder(ctx);
|
||||
|
||||
const uint32_t n_outputs = ctx->n_outputs;
|
||||
|
||||
std::vector<int32_t> output_pos;
|
||||
|
||||
const size_t n_batch = ctx->cparams.n_batch;
|
||||
const auto & output_ids = ctx->output_ids;
|
||||
|
||||
GGML_ASSERT(n_outputs <= ctx->output_size);
|
||||
|
||||
output_pos.resize(n_outputs);
|
||||
|
||||
// build a more compact representation of the output ids
|
||||
for (size_t i = 0; i < n_batch; ++i) {
|
||||
// map an output id to a position in the batch
|
||||
int32_t pos = output_ids[i];
|
||||
if (pos >= 0) {
|
||||
GGML_ASSERT((uint32_t) pos < n_outputs);
|
||||
output_pos[pos] = i;
|
||||
}
|
||||
}
|
||||
|
||||
write(&n_outputs, sizeof(n_outputs));
|
||||
|
||||
if (n_outputs) {
|
||||
write(output_pos.data(), n_outputs * sizeof(int32_t));
|
||||
}
|
||||
}
|
||||
|
||||
void write_logits(const struct llama_context * ctx) {
|
||||
const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
|
||||
|
||||
write(&logits_size, sizeof(logits_size));
|
||||
|
||||
if (logits_size) {
|
||||
write(ctx->logits, logits_size * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
void write_embeddings(const struct llama_context * ctx) {
|
||||
const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd);
|
||||
|
||||
write(&embeddings_size, sizeof(embeddings_size));
|
||||
|
||||
if (embeddings_size) {
|
||||
write(ctx->embd, embeddings_size * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) {
|
||||
|
||||
for (const auto & range : cell_ranges) {
|
||||
for (uint32_t i = range.first; i < range.second; ++i) {
|
||||
const auto & cell = kv_self.cells[i];
|
||||
const llama_pos pos = cell.pos;
|
||||
const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
|
||||
|
||||
write(&pos, sizeof(pos));
|
||||
write(&n_seq_id, sizeof(n_seq_id));
|
||||
|
||||
if (n_seq_id) {
|
||||
for (auto seq_id : cell.seq_id) {
|
||||
write(&seq_id, sizeof(seq_id));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void write_kv_cache_data(const struct llama_context * ctx, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) {
|
||||
const struct llama_kv_cache & kv_self = ctx->kv_self;
|
||||
const struct llama_hparams & hparams = ctx->model.hparams;
|
||||
|
||||
const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
|
||||
write(&v_trans, sizeof(v_trans));
|
||||
write(&n_layer, sizeof(n_layer));
|
||||
|
||||
std::vector<uint8_t> tmp_buf;
|
||||
|
||||
// Iterate and write all the keys first, each row is a cell
|
||||
// Get whole range at a time
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
|
||||
|
||||
// Write key type
|
||||
const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
|
||||
write(&k_type_i, sizeof(k_type_i));
|
||||
|
||||
// Write row size of key
|
||||
const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
||||
write(&k_size_row, sizeof(k_size_row));
|
||||
|
||||
// Read each range of cells of k_size length each into tmp_buf and write out
|
||||
for (const auto & range : cell_ranges) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
const size_t buf_size = range_size * k_size_row;
|
||||
write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
|
||||
}
|
||||
}
|
||||
|
||||
if (!kv_self.v_trans) {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
|
||||
|
||||
// Write value type
|
||||
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
||||
write(&v_type_i, sizeof(v_type_i));
|
||||
|
||||
// Write row size of value
|
||||
const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
|
||||
write(&v_size_row, sizeof(v_size_row));
|
||||
|
||||
// Read each range of cells of v_size length each into tmp_buf and write out
|
||||
for (const auto & range : cell_ranges) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
const size_t buf_size = range_size * v_size_row;
|
||||
write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// When v is transposed, we also need the element size and get the element ranges from each row
|
||||
const uint32_t kv_size = kv_self.size;
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
|
||||
|
||||
// Write value type
|
||||
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
||||
write(&v_type_i, sizeof(v_type_i));
|
||||
|
||||
// Write element size
|
||||
const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
||||
write(&v_size_el, sizeof(v_size_el));
|
||||
|
||||
// Write GQA embedding size
|
||||
write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
|
||||
|
||||
// For each row, we get the element values of each cell
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
||||
for (const auto & range : cell_ranges) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
||||
const size_t buf_size = range_size * v_size_el;
|
||||
write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) {
|
||||
const struct llama_kv_cache & kv_self = ctx->kv_self;
|
||||
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
|
||||
uint32_t cell_count = 0;
|
||||
|
||||
// Count the number of cells with the specified seq_id
|
||||
// Find all the ranges of cells with this seq id (or all, when -1)
|
||||
uint32_t cell_range_begin = kv_self.size;
|
||||
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
||||
const auto & cell = kv_self.cells[i];
|
||||
if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
|
||||
++cell_count;
|
||||
if (cell_range_begin == kv_self.size) {
|
||||
cell_range_begin = i;
|
||||
}
|
||||
} else {
|
||||
if (cell_range_begin != kv_self.size) {
|
||||
cell_ranges.emplace_back(cell_range_begin, i);
|
||||
cell_range_begin = kv_self.size;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (cell_range_begin != kv_self.size) {
|
||||
cell_ranges.emplace_back(cell_range_begin, kv_self.size);
|
||||
}
|
||||
|
||||
// DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
|
||||
uint32_t cell_count_check = 0;
|
||||
for (const auto & range : cell_ranges) {
|
||||
cell_count_check += range.second - range.first;
|
||||
}
|
||||
GGML_ASSERT(cell_count == cell_count_check);
|
||||
|
||||
write(&cell_count, sizeof(cell_count));
|
||||
|
||||
write_kv_cache_meta(kv_self, cell_ranges, seq_id);
|
||||
write_kv_cache_data(ctx, cell_ranges);
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_data_read {
|
||||
virtual const uint8_t * read(size_t size) = 0;
|
||||
virtual void read_to(void * dst, size_t size) = 0;
|
||||
virtual size_t get_size_read() = 0;
|
||||
virtual ~llama_data_read() = default;
|
||||
|
||||
void read_string(std::string & str) {
|
||||
uint32_t str_size;
|
||||
read_to(&str_size, sizeof(str_size));
|
||||
|
||||
str.assign((const char *) read(str_size), str_size);
|
||||
}
|
||||
|
||||
// validate model information
|
||||
void read_model_info(const struct llama_context * ctx) {
|
||||
std::string cur_arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
|
||||
std::string arch_str;
|
||||
read_string(arch_str);
|
||||
if (cur_arch_str != arch_str) {
|
||||
throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str()));
|
||||
}
|
||||
// TODO: add more info which needs to be identical but which is not verified otherwise
|
||||
}
|
||||
|
||||
//void read_rng(std::mt19937 & rng) {
|
||||
// std::string rng_str;
|
||||
// read_string(rng_str);
|
||||
|
||||
// std::istringstream rng_ss(rng_str);
|
||||
// rng_ss >> rng;
|
||||
|
||||
// if (rng_ss.fail()) {
|
||||
// throw std::runtime_error("failed to load RNG state");
|
||||
// }
|
||||
//}
|
||||
|
||||
void read_output_ids(struct llama_context * ctx) {
|
||||
std::vector<int32_t> output_pos;
|
||||
|
||||
uint32_t n_outputs;
|
||||
read_to(&n_outputs, sizeof(n_outputs));
|
||||
|
||||
if (n_outputs > llama_output_reserve(*ctx, n_outputs)) {
|
||||
throw std::runtime_error("could not reserve outputs");
|
||||
}
|
||||
|
||||
if (n_outputs) {
|
||||
output_pos.resize(n_outputs);
|
||||
read_to(output_pos.data(), n_outputs * sizeof(int32_t));
|
||||
|
||||
for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
|
||||
int32_t id = output_pos[i];
|
||||
if ((uint32_t) id >= ctx->cparams.n_batch) {
|
||||
throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch));
|
||||
}
|
||||
ctx->output_ids[id] = i;
|
||||
}
|
||||
|
||||
ctx->n_outputs = n_outputs;
|
||||
}
|
||||
}
|
||||
|
||||
void read_logits(struct llama_context * ctx) {
|
||||
uint64_t logits_size;
|
||||
read_to(&logits_size, sizeof(logits_size));
|
||||
|
||||
if (ctx->logits_size < logits_size) {
|
||||
throw std::runtime_error("logits buffer too small");
|
||||
}
|
||||
|
||||
if (logits_size) {
|
||||
read_to(ctx->logits, logits_size * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
void read_embeddings(struct llama_context * ctx) {
|
||||
uint64_t embeddings_size;
|
||||
read_to(&embeddings_size, sizeof(embeddings_size));
|
||||
|
||||
if (ctx->embd_size < embeddings_size) {
|
||||
throw std::runtime_error("embeddings buffer too small");
|
||||
}
|
||||
|
||||
if (embeddings_size) {
|
||||
read_to(ctx->embd, embeddings_size * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) {
|
||||
struct llama_kv_cache & kv_self = ctx->kv_self;
|
||||
|
||||
if (dest_seq_id != -1) {
|
||||
// single sequence
|
||||
|
||||
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
||||
|
||||
llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
|
||||
batch.n_tokens = cell_count;
|
||||
batch.n_seq_tokens = cell_count;
|
||||
batch.n_seqs = 1;
|
||||
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
llama_pos pos;
|
||||
uint32_t n_seq_id;
|
||||
|
||||
read_to(&pos, sizeof(pos));
|
||||
read_to(&n_seq_id, sizeof(n_seq_id));
|
||||
|
||||
if (n_seq_id != 0) {
|
||||
LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
batch.pos[i] = pos;
|
||||
}
|
||||
batch.n_seq_id[0] = 1;
|
||||
batch.seq_id[0] = &dest_seq_id;
|
||||
if (!llama_kv_cache_find_slot(kv_self, batch)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
// DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
|
||||
// Assume that this is one contiguous block of cells
|
||||
GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
|
||||
GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
|
||||
GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
|
||||
GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
|
||||
GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
|
||||
} else {
|
||||
// whole KV cache restore
|
||||
|
||||
if (cell_count > kv_self.size) {
|
||||
LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
llama_kv_cache_clear(kv_self);
|
||||
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
llama_kv_cell & cell = kv_self.cells[i];
|
||||
|
||||
llama_pos pos;
|
||||
uint32_t n_seq_id;
|
||||
|
||||
read_to(&pos, sizeof(pos));
|
||||
read_to(&n_seq_id, sizeof(n_seq_id));
|
||||
|
||||
cell.pos = pos;
|
||||
|
||||
for (uint32_t j = 0; j < n_seq_id; ++j) {
|
||||
llama_seq_id seq_id;
|
||||
read_to(&seq_id, sizeof(seq_id));
|
||||
|
||||
if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
|
||||
LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
|
||||
return false;
|
||||
}
|
||||
|
||||
cell.seq_id.insert(seq_id);
|
||||
|
||||
if (kv_self.recurrent) {
|
||||
int32_t & tail = kv_self.cells[seq_id].tail;
|
||||
if (tail != -1) {
|
||||
LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
|
||||
return false;
|
||||
}
|
||||
tail = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kv_self.head = 0;
|
||||
kv_self.used = cell_count;
|
||||
}
|
||||
|
||||
if (kv_self.recurrent) {
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
uint32_t cell_id = kv_self.head + i;
|
||||
// make sure the recurrent states will keep their restored state
|
||||
kv_self.cells[cell_id].src = cell_id;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) {
|
||||
const struct llama_hparams & hparams = ctx->model.hparams;
|
||||
struct llama_kv_cache & kv_self = ctx->kv_self;
|
||||
uint32_t v_trans;
|
||||
uint32_t n_layer;
|
||||
read_to(&v_trans, sizeof(v_trans));
|
||||
read_to(&n_layer, sizeof(n_layer));
|
||||
|
||||
if (n_layer != hparams.n_layer) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
|
||||
return false;
|
||||
}
|
||||
if (cell_count > kv_self.size) {
|
||||
LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size);
|
||||
return false;
|
||||
}
|
||||
if (kv_self.v_trans != (bool) v_trans) {
|
||||
LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
// For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
|
||||
|
||||
// Read type of key
|
||||
int32_t k_type_i_ref;
|
||||
read_to(&k_type_i_ref, sizeof(k_type_i_ref));
|
||||
const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
|
||||
if (k_type_i != k_type_i_ref) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Read row size of key
|
||||
uint64_t k_size_row_ref;
|
||||
read_to(&k_size_row_ref, sizeof(k_size_row_ref));
|
||||
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
||||
if (k_size_row != k_size_row_ref) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (cell_count) {
|
||||
// Read and set the keys for the whole cell range
|
||||
ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row);
|
||||
}
|
||||
}
|
||||
|
||||
if (!kv_self.v_trans) {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
|
||||
|
||||
// Read type of value
|
||||
int32_t v_type_i_ref;
|
||||
read_to(&v_type_i_ref, sizeof(v_type_i_ref));
|
||||
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
||||
if (v_type_i != v_type_i_ref) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Read row size of value
|
||||
uint64_t v_size_row_ref;
|
||||
read_to(&v_size_row_ref, sizeof(v_size_row_ref));
|
||||
const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
|
||||
if (v_size_row != v_size_row_ref) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (cell_count) {
|
||||
// Read and set the values for the whole cell range
|
||||
ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// For each layer, read the values for each cell (transposed)
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
|
||||
|
||||
// Read type of value
|
||||
int32_t v_type_i_ref;
|
||||
read_to(&v_type_i_ref, sizeof(v_type_i_ref));
|
||||
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
||||
if (v_type_i != v_type_i_ref) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Read element size of value
|
||||
uint32_t v_size_el_ref;
|
||||
read_to(&v_size_el_ref, sizeof(v_size_el_ref));
|
||||
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
||||
if (v_size_el != v_size_el_ref) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Read GQA embedding size
|
||||
uint32_t n_embd_v_gqa_ref;
|
||||
read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
|
||||
if (n_embd_v_gqa != n_embd_v_gqa_ref) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (cell_count) {
|
||||
// For each row in the transposed matrix, read the values for the whole cell range
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el;
|
||||
ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) {
|
||||
uint32_t cell_count;
|
||||
read_to(&cell_count, sizeof(cell_count));
|
||||
|
||||
bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count);
|
||||
|
||||
if (!res) {
|
||||
if (seq_id == -1) {
|
||||
llama_kv_cache_clear(ctx);
|
||||
} else {
|
||||
llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
|
||||
}
|
||||
throw std::runtime_error("failed to restore kv cache");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_data_write_dummy : llama_data_write {
|
||||
size_t size_written = 0;
|
||||
|
||||
llama_data_write_dummy() {}
|
||||
|
||||
void write(const void * /* src */, size_t size) override {
|
||||
size_written += size;
|
||||
}
|
||||
|
||||
void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
|
||||
size_written += size;
|
||||
}
|
||||
|
||||
size_t get_size_written() override {
|
||||
return size_written;
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_data_write_buffer : llama_data_write {
|
||||
uint8_t * ptr;
|
||||
size_t buf_size = 0;
|
||||
size_t size_written = 0;
|
||||
|
||||
llama_data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
|
||||
|
||||
void write(const void * src, size_t size) override {
|
||||
if (size > buf_size) {
|
||||
throw std::runtime_error("unexpectedly reached end of buffer");
|
||||
}
|
||||
memcpy(ptr, src, size);
|
||||
ptr += size;
|
||||
size_written += size;
|
||||
buf_size -= size;
|
||||
}
|
||||
|
||||
void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
|
||||
if (size > buf_size) {
|
||||
throw std::runtime_error("unexpectedly reached end of buffer");
|
||||
}
|
||||
ggml_backend_tensor_get(tensor, ptr, offset, size);
|
||||
ptr += size;
|
||||
size_written += size;
|
||||
buf_size -= size;
|
||||
}
|
||||
|
||||
size_t get_size_written() override {
|
||||
return size_written;
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_data_read_buffer : llama_data_read {
|
||||
const uint8_t * ptr;
|
||||
size_t buf_size = 0;
|
||||
size_t size_read = 0;
|
||||
|
||||
llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
|
||||
|
||||
const uint8_t * read(size_t size) override {
|
||||
const uint8_t * base_ptr = ptr;
|
||||
if (size > buf_size) {
|
||||
throw std::runtime_error("unexpectedly reached end of buffer");
|
||||
}
|
||||
ptr += size;
|
||||
size_read += size;
|
||||
buf_size -= size;
|
||||
return base_ptr;
|
||||
}
|
||||
|
||||
void read_to(void * dst, size_t size) override {
|
||||
memcpy(dst, read(size), size);
|
||||
}
|
||||
|
||||
size_t get_size_read() override {
|
||||
return size_read;
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_data_write_file : llama_data_write {
|
||||
llama_file * file;
|
||||
size_t size_written = 0;
|
||||
std::vector<uint8_t> temp_buffer;
|
||||
|
||||
llama_data_write_file(llama_file * f) : file(f) {}
|
||||
|
||||
void write(const void * src, size_t size) override {
|
||||
file->write_raw(src, size);
|
||||
size_written += size;
|
||||
}
|
||||
|
||||
void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
|
||||
temp_buffer.resize(size);
|
||||
ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
|
||||
write(temp_buffer.data(), temp_buffer.size());
|
||||
}
|
||||
|
||||
size_t get_size_written() override {
|
||||
return size_written;
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_data_read_file : llama_data_read {
|
||||
llama_file * file;
|
||||
size_t size_read = 0;
|
||||
std::vector<uint8_t> temp_buffer;
|
||||
|
||||
llama_data_read_file(llama_file * f) : file(f) {}
|
||||
|
||||
void read_to(void * dst, size_t size) override {
|
||||
file->read_raw(dst, size);
|
||||
size_read += size;
|
||||
}
|
||||
|
||||
const uint8_t * read(size_t size) override {
|
||||
temp_buffer.resize(size);
|
||||
read_to(temp_buffer.data(), size);
|
||||
return temp_buffer.data();
|
||||
}
|
||||
|
||||
size_t get_size_read() override {
|
||||
return size_read;
|
||||
}
|
||||
};
|
||||
|
||||
/** copy state data into either a buffer or file depending on the passed in context
|
||||
*
|
||||
* file context:
|
||||
* llama_file file("/path", "wb");
|
||||
* llama_data_write_file data_ctx(&file);
|
||||
* llama_state_get_data_internal(ctx, data_ctx);
|
||||
*
|
||||
* buffer context:
|
||||
* std::vector<uint8_t> buf(max_size, 0);
|
||||
* llama_data_write_buffer data_ctx(buf.data(), max_size);
|
||||
* llama_state_get_data_internal(ctx, data_ctx);
|
||||
*
|
||||
*/
|
||||
static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) {
|
||||
llama_synchronize(ctx);
|
||||
|
||||
data_ctx.write_model_info(ctx);
|
||||
|
||||
// copy outputs
|
||||
data_ctx.write_output_ids(ctx);
|
||||
data_ctx.write_logits(ctx);
|
||||
data_ctx.write_embeddings(ctx);
|
||||
|
||||
data_ctx.write_kv_cache(ctx);
|
||||
|
||||
return data_ctx.get_size_written();
|
||||
}
|
||||
|
||||
size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) {
|
||||
llama_data_write_buffer data_ctx(dst, size);
|
||||
try {
|
||||
return llama_state_get_data_internal(ctx, data_ctx);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the *actual* size of the state.
|
||||
// Intended to be used when saving to state to a buffer.
|
||||
size_t llama_state_get_size(struct llama_context * ctx) {
|
||||
llama_data_write_dummy data_ctx;
|
||||
try {
|
||||
return llama_state_get_data_internal(ctx, data_ctx);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) {
|
||||
llama_synchronize(ctx);
|
||||
|
||||
data_ctx.read_model_info(ctx);
|
||||
|
||||
// set outputs
|
||||
data_ctx.read_output_ids(ctx);
|
||||
data_ctx.read_logits(ctx);
|
||||
data_ctx.read_embeddings(ctx);
|
||||
|
||||
data_ctx.read_kv_cache(ctx);
|
||||
|
||||
return data_ctx.get_size_read();
|
||||
}
|
||||
|
||||
// Sets the state reading from the specified source address
|
||||
size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) {
|
||||
llama_data_read_buffer data_ctx(src, size);
|
||||
try {
|
||||
return llama_state_set_data_internal(ctx, data_ctx);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
||||
llama_file file(path_session, "rb");
|
||||
|
||||
// sanity checks
|
||||
{
|
||||
const uint32_t magic = file.read_u32();
|
||||
const uint32_t version = file.read_u32();
|
||||
|
||||
if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
|
||||
LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// load the prompt
|
||||
{
|
||||
const uint32_t n_token_count = file.read_u32();
|
||||
|
||||
if (n_token_count > n_token_capacity) {
|
||||
LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
|
||||
return false;
|
||||
}
|
||||
|
||||
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
|
||||
*n_token_count_out = n_token_count;
|
||||
}
|
||||
|
||||
// restore the context state
|
||||
{
|
||||
const size_t n_state_size_cur = file.size - file.tell();
|
||||
|
||||
llama_data_read_file data_ctx(&file);
|
||||
const size_t n_read = llama_state_set_data_internal(ctx, data_ctx);
|
||||
|
||||
if (n_read != n_state_size_cur) {
|
||||
LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
||||
try {
|
||||
return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
|
||||
llama_file file(path_session, "wb");
|
||||
|
||||
file.write_u32(LLAMA_SESSION_MAGIC);
|
||||
file.write_u32(LLAMA_SESSION_VERSION);
|
||||
|
||||
// save the prompt
|
||||
file.write_u32((uint32_t) n_token_count);
|
||||
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
|
||||
|
||||
// save the context state using stream saving
|
||||
llama_data_write_file data_ctx(&file);
|
||||
llama_state_get_data_internal(ctx, data_ctx);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
|
||||
try {
|
||||
return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) {
|
||||
llama_synchronize(ctx);
|
||||
|
||||
data_ctx.write_kv_cache(ctx, seq_id);
|
||||
|
||||
return data_ctx.get_size_written();
|
||||
}
|
||||
|
||||
size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) {
|
||||
llama_data_write_dummy data_ctx;
|
||||
return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
|
||||
}
|
||||
|
||||
size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
|
||||
llama_data_write_buffer data_ctx(dst, size);
|
||||
try {
|
||||
return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what());
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) {
|
||||
llama_synchronize(ctx);
|
||||
|
||||
data_ctx.read_kv_cache(ctx, dest_seq_id);
|
||||
|
||||
return data_ctx.get_size_read();
|
||||
}
|
||||
|
||||
size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id) {
|
||||
llama_data_read_buffer data_ctx(src, size);
|
||||
try {
|
||||
return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what());
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
|
||||
llama_file file(filepath, "wb");
|
||||
|
||||
file.write_u32(LLAMA_STATE_SEQ_MAGIC);
|
||||
file.write_u32(LLAMA_STATE_SEQ_VERSION);
|
||||
|
||||
// save the prompt
|
||||
file.write_u32((uint32_t) n_token_count);
|
||||
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
|
||||
|
||||
// save the context state using stream saving
|
||||
llama_data_write_file data_ctx(&file);
|
||||
llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
|
||||
|
||||
const size_t res = file.tell();
|
||||
GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
|
||||
return res;
|
||||
}
|
||||
|
||||
static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
||||
llama_file file(filepath, "rb");
|
||||
|
||||
// version checks
|
||||
{
|
||||
const uint32_t magic = file.read_u32();
|
||||
const uint32_t version = file.read_u32();
|
||||
|
||||
if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
|
||||
LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// load the prompt
|
||||
{
|
||||
const uint32_t n_token_count = file.read_u32();
|
||||
|
||||
if (n_token_count > n_token_capacity) {
|
||||
LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
|
||||
return 0;
|
||||
}
|
||||
|
||||
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
|
||||
*n_token_count_out = n_token_count;
|
||||
}
|
||||
|
||||
// restore the context state
|
||||
{
|
||||
const size_t state_size = file.size - file.tell();
|
||||
llama_data_read_file data_ctx(&file);
|
||||
const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
|
||||
if (!nread) {
|
||||
LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
GGML_ASSERT(nread <= state_size);
|
||||
GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
|
||||
}
|
||||
|
||||
return file.tell();
|
||||
}
|
||||
|
||||
size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
|
||||
try {
|
||||
return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what());
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
||||
try {
|
||||
return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what());
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
358
src/llama-context.h
Normal file
358
src/llama-context.h
Normal file
@ -0,0 +1,358 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-model.h"
|
||||
#include "llama-kv-cache.h"
|
||||
#include "llama-control-vector.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <map>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
|
||||
struct llama_cparams {
|
||||
uint32_t n_ctx; // context size used during inference
|
||||
uint32_t n_batch;
|
||||
uint32_t n_ubatch;
|
||||
uint32_t n_seq_max;
|
||||
int n_threads; // number of threads to use for generation
|
||||
int n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
float rope_freq_base;
|
||||
float rope_freq_scale;
|
||||
|
||||
uint32_t n_ctx_orig_yarn;
|
||||
// These hyperparameters are not exposed in GGUF, because all
|
||||
// existing YaRN models use the same values for them.
|
||||
float yarn_ext_factor;
|
||||
float yarn_attn_factor;
|
||||
float yarn_beta_fast;
|
||||
float yarn_beta_slow;
|
||||
float defrag_thold;
|
||||
|
||||
bool embeddings;
|
||||
bool causal_attn;
|
||||
bool offload_kqv;
|
||||
bool flash_attn;
|
||||
bool no_perf;
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
void * cb_eval_user_data;
|
||||
};
|
||||
|
||||
struct llama_context {
|
||||
llama_context(const llama_model & model)
|
||||
: model(model)
|
||||
, t_start_us(model.t_start_us)
|
||||
, t_load_us(model.t_load_us) {}
|
||||
|
||||
const struct llama_model & model;
|
||||
|
||||
struct llama_cparams cparams;
|
||||
struct llama_sbatch sbatch;
|
||||
struct llama_kv_cache kv_self;
|
||||
struct llama_control_vector cvec;
|
||||
|
||||
std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
|
||||
|
||||
std::vector<ggml_backend_ptr> backends;
|
||||
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
|
||||
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
|
||||
ggml_threadpool_t threadpool = nullptr;
|
||||
ggml_threadpool_t threadpool_batch = nullptr;
|
||||
|
||||
bool has_evaluated_once = false;
|
||||
|
||||
mutable int64_t t_start_us;
|
||||
mutable int64_t t_load_us;
|
||||
mutable int64_t t_p_eval_us = 0;
|
||||
mutable int64_t t_eval_us = 0;
|
||||
|
||||
mutable int64_t t_compute_start_us = 0;
|
||||
mutable int64_t n_queued_tokens = 0;
|
||||
|
||||
mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
||||
mutable int32_t n_eval = 0; // number of eval calls
|
||||
|
||||
// host buffer for the model output (logits and embeddings)
|
||||
ggml_backend_buffer_ptr buf_output;
|
||||
|
||||
// decode output (2-dimensional array: [n_outputs][n_vocab])
|
||||
size_t logits_size = 0; // capacity (of floats) for logits
|
||||
float * logits = nullptr;
|
||||
|
||||
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
|
||||
size_t output_size = 0; // capacity (of tokens positions) for the output buffers
|
||||
int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
|
||||
|
||||
bool logits_all = false;
|
||||
|
||||
// embeddings output (2-dimensional array: [n_outputs][n_embd])
|
||||
// populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
|
||||
size_t embd_size = 0; // capacity (of floats) for embeddings
|
||||
float * embd = nullptr;
|
||||
|
||||
// sequence embeddings output (map of [n_embd] vectors)
|
||||
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
|
||||
std::map<llama_seq_id, std::vector<float>> embd_seq;
|
||||
|
||||
// whether we are computing encoder output or decoder output
|
||||
bool is_encoding = false;
|
||||
|
||||
// TODO: find a better way to accommodate mutli-dimension position encoding methods
|
||||
// number of position id each token get, 1 for each token in most cases.
|
||||
// when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
|
||||
int n_pos_per_token = 1;
|
||||
|
||||
// output of the encoder part of the encoder-decoder models
|
||||
std::vector<float> embd_enc;
|
||||
std::vector<std::set<llama_seq_id>> seq_ids_enc;
|
||||
|
||||
// memory buffers used to evaluate the model
|
||||
std::vector<uint8_t> buf_compute_meta;
|
||||
ggml_backend_sched_ptr sched;
|
||||
|
||||
ggml_abort_callback abort_callback = nullptr;
|
||||
void * abort_callback_data = nullptr;
|
||||
|
||||
// input tensors
|
||||
struct ggml_tensor * inp_tokens; // I32 [n_batch]
|
||||
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
|
||||
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
||||
struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
|
||||
struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
|
||||
struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
|
||||
struct ggml_tensor * inp_K_shift; // I32 [kv_size]
|
||||
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
|
||||
struct ggml_tensor * inp_cls; // I32 [n_batch]
|
||||
struct ggml_tensor * inp_s_copy; // I32 [kv_size]
|
||||
struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
|
||||
struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
|
||||
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
|
||||
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
|
||||
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
|
||||
};
|
||||
|
||||
static bool llama_kv_cache_init(
|
||||
struct llama_kv_cache & cache,
|
||||
const llama_context * ctx,
|
||||
ggml_type type_k,
|
||||
ggml_type type_v,
|
||||
uint32_t kv_size,
|
||||
bool offload) {
|
||||
const llama_model & model = ctx->model;
|
||||
const llama_cparams & cparams = ctx->cparams;
|
||||
|
||||
const struct llama_hparams & hparams = model.hparams;
|
||||
|
||||
const int32_t n_layer = hparams.n_layer;
|
||||
|
||||
LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
|
||||
|
||||
cache.has_shift = false;
|
||||
|
||||
cache.recurrent = llama_model_is_recurrent(&model);
|
||||
cache.v_trans = !cache.recurrent && !cparams.flash_attn;
|
||||
|
||||
cache.head = 0;
|
||||
cache.size = kv_size;
|
||||
cache.used = 0;
|
||||
|
||||
cache.type_k = type_k;
|
||||
cache.type_v = type_v;
|
||||
|
||||
cache.cells.clear();
|
||||
cache.cells.resize(kv_size);
|
||||
|
||||
// create a context for each buffer type
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
|
||||
auto it = ctx_map.find(buft);
|
||||
if (it == ctx_map.end()) {
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_context * ctx = ggml_init(params);
|
||||
if (!ctx) {
|
||||
return nullptr;
|
||||
}
|
||||
ctx_map[buft] = ctx;
|
||||
cache.ctxs.emplace_back(ctx);
|
||||
return ctx;
|
||||
}
|
||||
return it->second;
|
||||
};
|
||||
|
||||
cache.k_l.reserve(n_layer);
|
||||
cache.v_l.reserve(n_layer);
|
||||
|
||||
for (int i = 0; i < n_layer; i++) {
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
|
||||
|
||||
ggml_backend_buffer_type_t buft;
|
||||
if (offload) {
|
||||
auto * dev = model.dev_layer.at(i).dev;
|
||||
buft = ggml_backend_dev_buffer_type(dev);
|
||||
} else {
|
||||
buft = ggml_backend_cpu_buffer_type();
|
||||
}
|
||||
ggml_context * ctx = ctx_for_buft(buft);
|
||||
|
||||
if (!ctx) {
|
||||
LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
|
||||
ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
|
||||
ggml_format_name(k, "cache_k_l%d", i);
|
||||
ggml_format_name(v, "cache_v_l%d", i);
|
||||
cache.k_l.push_back(k);
|
||||
cache.v_l.push_back(v);
|
||||
}
|
||||
|
||||
// allocate tensors and initialize the buffers to avoid NaNs in the padding
|
||||
for (auto it : ctx_map) {
|
||||
auto * buft = it.first;
|
||||
auto * ctx = it.second;
|
||||
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
if (!buf) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
|
||||
return false;
|
||||
}
|
||||
ggml_backend_buffer_clear(buf, 0);
|
||||
LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
|
||||
cache.bufs.emplace_back(buf);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
|
||||
// the FA kernels require padding to avoid extra runtime boundary checks
|
||||
return cparams.flash_attn ? 256u : 32u;
|
||||
}
|
||||
|
||||
// Make sure enough space is available for outputs.
|
||||
// Returns max number of outputs for which space was reserved.
|
||||
static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
|
||||
const auto & cparams = lctx.cparams;
|
||||
const auto & hparams = lctx.model.hparams;
|
||||
|
||||
const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
|
||||
|
||||
const auto n_batch = cparams.n_batch;
|
||||
const auto n_vocab = hparams.n_vocab;
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
// TODO: use a per-batch flag for logits presence instead
|
||||
const bool has_logits = !cparams.embeddings;
|
||||
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
|
||||
|
||||
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
|
||||
const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
|
||||
|
||||
if (lctx.output_ids.empty()) {
|
||||
// init, never resized afterwards
|
||||
lctx.output_ids.resize(n_batch);
|
||||
}
|
||||
|
||||
const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0;
|
||||
const size_t new_size = (logits_size + embd_size) * sizeof(float);
|
||||
|
||||
// alloc only when more than the current capacity is required
|
||||
// TODO: also consider shrinking the buffer
|
||||
if (!lctx.buf_output || prev_size < new_size) {
|
||||
if (lctx.buf_output) {
|
||||
#ifndef NDEBUG
|
||||
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
|
||||
LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||
#endif
|
||||
lctx.buf_output = nullptr;
|
||||
lctx.logits = nullptr;
|
||||
lctx.embd = nullptr;
|
||||
}
|
||||
|
||||
auto * buft = ggml_backend_cpu_buffer_type();
|
||||
// try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
|
||||
auto * output_dev = lctx.model.dev_output.dev;
|
||||
auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
|
||||
if (output_dev_host_buft) {
|
||||
buft = output_dev_host_buft;
|
||||
}
|
||||
lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size));
|
||||
if (lctx.buf_output == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get());
|
||||
|
||||
lctx.logits = has_logits ? output_base : nullptr;
|
||||
lctx.embd = has_embd ? output_base + logits_size : nullptr;
|
||||
|
||||
lctx.output_size = n_outputs_max;
|
||||
lctx.logits_size = logits_size;
|
||||
lctx.embd_size = embd_size;
|
||||
|
||||
// set all ids as invalid (negative)
|
||||
std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
|
||||
|
||||
ggml_backend_buffer_clear(lctx.buf_output.get(), 0);
|
||||
|
||||
lctx.n_outputs = 0;
|
||||
|
||||
return n_outputs_max;
|
||||
}
|
||||
|
||||
// make the outputs have the same order they had in the user-provided batch
|
||||
static void llama_output_reorder(struct llama_context * ctx) {
|
||||
std::vector<size_t> & out_ids = ctx->sbatch.out_ids;
|
||||
if (!out_ids.empty()) {
|
||||
uint32_t n_vocab = ctx->model.hparams.n_vocab;
|
||||
uint32_t n_embd = ctx->model.hparams.n_embd;
|
||||
int32_t n_outputs = ctx->n_outputs;
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
// TODO: is there something more efficient which also minimizes swaps?
|
||||
// selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
|
||||
for (int32_t i = 0; i < n_outputs - 1; ++i) {
|
||||
int32_t j_min = i;
|
||||
for (int32_t j = i + 1; j < n_outputs; ++j) {
|
||||
if (out_ids[j] < out_ids[j_min]) {
|
||||
j_min = j;
|
||||
}
|
||||
}
|
||||
if (j_min == i) { continue; }
|
||||
std::swap(out_ids[i], out_ids[j_min]);
|
||||
if (ctx->logits_size > 0) {
|
||||
for (uint32_t k = 0; k < n_vocab; k++) {
|
||||
std::swap(ctx->logits[i*n_vocab + k], ctx->logits[j_min*n_vocab + k]);
|
||||
}
|
||||
}
|
||||
if (ctx->embd_size > 0) {
|
||||
for (uint32_t k = 0; k < n_embd; k++) {
|
||||
std::swap(ctx->embd[i*n_embd + k], ctx->embd[j_min*n_embd + k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
std::fill(ctx->output_ids.begin(), ctx->output_ids.end(), -1);
|
||||
for (int32_t i = 0; i < n_outputs; ++i) {
|
||||
ctx->output_ids[out_ids[i]] = i;
|
||||
}
|
||||
out_ids.clear();
|
||||
}
|
||||
}
|
1
src/llama-control-vector.cpp
Normal file
1
src/llama-control-vector.cpp
Normal file
@ -0,0 +1 @@
|
||||
#include "llama-control-vector.h"
|
130
src/llama-control-vector.h
Normal file
130
src/llama-control-vector.h
Normal file
@ -0,0 +1,130 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include "llama-model.h" // TODO: need only hparams
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
struct llama_control_vector {
|
||||
std::vector<struct ggml_tensor *> tensors; // per layer
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
int32_t layer_start = -1;
|
||||
int32_t layer_end = -1;
|
||||
|
||||
struct ggml_tensor * tensor_for(int il) const {
|
||||
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
|
||||
return nullptr;
|
||||
}
|
||||
return tensors[il];
|
||||
}
|
||||
|
||||
struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
|
||||
ggml_tensor * layer_dir = tensor_for(il);
|
||||
if (layer_dir != nullptr) {
|
||||
cur = ggml_add(ctx, cur, layer_dir);
|
||||
}
|
||||
return cur;
|
||||
}
|
||||
};
|
||||
|
||||
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
|
||||
GGML_ASSERT(cvec.tensors.empty());
|
||||
GGML_ASSERT(cvec.ctxs.empty());
|
||||
GGML_ASSERT(cvec.bufs.empty());
|
||||
|
||||
// create a context for each buffer type
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
|
||||
auto it = ctx_map.find(buft);
|
||||
if (it == ctx_map.end()) {
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ model.hparams.n_layer*ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_context * ctx = ggml_init(params);
|
||||
if (!ctx) {
|
||||
return nullptr;
|
||||
}
|
||||
ctx_map[buft] = ctx;
|
||||
cvec.ctxs.emplace_back(ctx);
|
||||
return ctx;
|
||||
}
|
||||
return it->second;
|
||||
};
|
||||
|
||||
// make tensors
|
||||
cvec.tensors.reserve(model.hparams.n_layer);
|
||||
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
|
||||
for (size_t il = 1; il < model.hparams.n_layer; il++) {
|
||||
ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
|
||||
[&](ggml_context * ctx) {
|
||||
ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
|
||||
ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
|
||||
return ggml_add(ctx, cur, layer_dir);
|
||||
});
|
||||
ggml_context * ctx = ctx_for_buft(buft);
|
||||
if (!ctx) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
|
||||
return false;
|
||||
}
|
||||
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
|
||||
cvec.tensors.push_back(tensor);
|
||||
}
|
||||
|
||||
// allocate tensors / buffers and zero
|
||||
cvec.bufs.reserve(ctx_map.size());
|
||||
for (auto it : ctx_map) {
|
||||
ggml_backend_buffer_type_t buft = it.first;
|
||||
ggml_context * ctx = it.second;
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
if (!buf) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
|
||||
return false;
|
||||
}
|
||||
ggml_backend_buffer_clear(buf, 0);
|
||||
cvec.bufs.emplace_back(buf);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int32_t llama_control_vector_apply(struct llama_control_vector & cvec, const llama_model & model, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
|
||||
if (data == nullptr) {
|
||||
// disable the current control vector (but leave allocated for later)
|
||||
cvec.layer_start = -1;
|
||||
cvec.layer_end = -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (n_embd != (int) model.hparams.n_embd) {
|
||||
LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (cvec.tensors.empty()) {
|
||||
if (!llama_control_vector_init(cvec, model)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
cvec.layer_start = il_start;
|
||||
cvec.layer_end = il_end;
|
||||
|
||||
for (size_t il = 1; il < model.hparams.n_layer; il++) {
|
||||
assert(cvec.tensors[il] != nullptr);
|
||||
|
||||
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
|
||||
if (off + n_embd <= len) {
|
||||
ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -24,6 +24,23 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
||||
void llama_log_internal (ggml_log_level level, const char * format, ...);
|
||||
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
||||
|
||||
// TODO: move to source
|
||||
LLAMA_ATTRIBUTE_FORMAT(1, 2)
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
va_list ap2;
|
||||
va_start(ap, fmt);
|
||||
va_copy(ap2, ap);
|
||||
int size = vsnprintf(NULL, 0, fmt, ap);
|
||||
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
||||
std::vector<char> buf(size + 1);
|
||||
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
||||
GGML_ASSERT(size2 == size);
|
||||
va_end(ap2);
|
||||
va_end(ap);
|
||||
return std::string(buf.data(), size);
|
||||
}
|
||||
|
||||
#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
|
||||
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
||||
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
||||
|
2
src/llama-kv-cache.cpp
Normal file
2
src/llama-kv-cache.cpp
Normal file
@ -0,0 +1,2 @@
|
||||
#include "llama-kv-cache.h"
|
||||
|
625
src/llama-kv-cache.h
Normal file
625
src/llama-kv-cache.h
Normal file
@ -0,0 +1,625 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-model.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
struct llama_kv_cell {
|
||||
llama_pos pos = -1;
|
||||
llama_pos delta = 0;
|
||||
int32_t src = -1; // used by recurrent state models to copy states
|
||||
int32_t tail = -1;
|
||||
|
||||
std::set<llama_seq_id> seq_id;
|
||||
|
||||
bool has_seq_id(const llama_seq_id & id) const {
|
||||
return seq_id.find(id) != seq_id.end();
|
||||
}
|
||||
|
||||
bool is_empty() const {
|
||||
return seq_id.empty();
|
||||
}
|
||||
|
||||
bool is_same_seq(const llama_kv_cell & other) const {
|
||||
return seq_id == other.seq_id;
|
||||
}
|
||||
};
|
||||
|
||||
// ring-buffer of cached KV data
|
||||
struct llama_kv_cache {
|
||||
bool has_shift = false;
|
||||
bool do_defrag = false;
|
||||
bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
|
||||
bool v_trans = true; // the value tensor is transposed
|
||||
|
||||
// Note: The value of head isn't only used to optimize searching
|
||||
// for a free KV slot. llama_decode_internal also uses it, so it
|
||||
// cannot be freely changed after a slot has been allocated.
|
||||
uint32_t head = 0;
|
||||
uint32_t size = 0;
|
||||
uint32_t used = 0; // used cells (i.e. at least one seq_id)
|
||||
|
||||
// computed before each graph build
|
||||
uint32_t n = 0;
|
||||
|
||||
ggml_type type_k = GGML_TYPE_F16;
|
||||
ggml_type type_v = GGML_TYPE_F16;
|
||||
|
||||
std::vector<llama_kv_cell> cells;
|
||||
|
||||
std::vector<struct ggml_tensor *> k_l; // per layer
|
||||
std::vector<struct ggml_tensor *> v_l;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
size_t total_size() {
|
||||
size_t size = 0;
|
||||
for (auto & buf : bufs) {
|
||||
size += ggml_backend_buffer_get_size(buf.get());
|
||||
}
|
||||
return size;
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// kv cache helpers
|
||||
//
|
||||
|
||||
// a structure holds information about the slot found in llama_kv_cache_find_slot
|
||||
struct llama_kv_cache_slot_info {
|
||||
std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
|
||||
bool found = false; // the slot was found
|
||||
|
||||
explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
|
||||
llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
|
||||
|
||||
operator bool() const { return found; }
|
||||
};
|
||||
static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};
|
||||
|
||||
// find an empty slot of size "n_tokens" in the cache
|
||||
// updates the cache head
|
||||
// returns a structure holding information about the slot found
|
||||
// Note: On success, it's important that cache.head points
|
||||
// to the first cell of the slot.
|
||||
static struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
struct llama_kv_cache & cache,
|
||||
const struct llama_ubatch & batch) {
|
||||
const uint32_t n_tokens = batch.n_tokens;
|
||||
const uint32_t n_seqs = batch.n_seqs;
|
||||
const uint32_t n_seq_tokens = batch.n_seq_tokens;
|
||||
|
||||
if (cache.recurrent) {
|
||||
// For recurrent state architectures (like Mamba or RWKV),
|
||||
// each cache cell can store the state for a whole sequence.
|
||||
// A slot should be always be contiguous.
|
||||
|
||||
// can only process batches with an equal number of new tokens in each sequence
|
||||
GGML_ASSERT(batch.equal_seqs);
|
||||
|
||||
int32_t min = cache.size - 1;
|
||||
int32_t max = 0;
|
||||
|
||||
// everything should fit if all seq_ids are smaller than the max
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const uint32_t n_seq_id = batch.n_seq_id[s];
|
||||
for (uint32_t j = 0; j < n_seq_id; ++j) {
|
||||
const llama_seq_id seq_id = batch.seq_id[s][j];
|
||||
|
||||
if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
|
||||
// too big seq_id
|
||||
// TODO: would it be possible to resize the cache instead?
|
||||
LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
|
||||
return llama_kv_cache_slot_info_failed;
|
||||
}
|
||||
if (j > 0) {
|
||||
llama_kv_cell & seq = cache.cells[seq_id];
|
||||
if (seq.tail >= 0) {
|
||||
llama_kv_cell & cell = cache.cells[seq.tail];
|
||||
// clear cells from seq_ids that become shared
|
||||
// (should not normally happen, but let's handle it anyway)
|
||||
cell.seq_id.erase(seq_id);
|
||||
seq.tail = -1;
|
||||
if (cell.seq_id.empty()) {
|
||||
cell.pos = -1;
|
||||
cell.src = -1;
|
||||
cache.used -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
{
|
||||
std::vector<int32_t> tails_verif;
|
||||
tails_verif.assign(cache.size, -1);
|
||||
for (uint32_t i = 0; i < cache.size; ++i) {
|
||||
llama_kv_cell & cell = cache.cells[i];
|
||||
for (llama_seq_id seq_id : cell.seq_id) {
|
||||
if (tails_verif[seq_id] != -1) {
|
||||
LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
|
||||
}
|
||||
tails_verif[seq_id] = i;
|
||||
}
|
||||
}
|
||||
for (uint32_t i = 0; i < cache.size; ++i) {
|
||||
if (tails_verif[i] != cache.cells[i].tail) {
|
||||
LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cache.cells[i].tail, tails_verif[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// find next empty cell
|
||||
uint32_t next_empty_cell = cache.head;
|
||||
|
||||
for (uint32_t i = 0; i < cache.size; ++i) {
|
||||
if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; }
|
||||
llama_kv_cell & cell = cache.cells[next_empty_cell];
|
||||
if (cell.is_empty()) { break; }
|
||||
next_empty_cell += 1;
|
||||
}
|
||||
|
||||
// find usable cell range
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = batch.seq_id[s][0];
|
||||
llama_kv_cell & seq_meta = cache.cells[seq_id];
|
||||
bool has_cell = false;
|
||||
if (seq_meta.tail >= 0) {
|
||||
llama_kv_cell & cell = cache.cells[seq_meta.tail];
|
||||
GGML_ASSERT(cell.has_seq_id(seq_id));
|
||||
// does this seq_id "own" the cell?
|
||||
if (cell.seq_id.size() == 1) { has_cell = true; }
|
||||
}
|
||||
if (!has_cell) {
|
||||
llama_kv_cell & empty_cell = cache.cells[next_empty_cell];
|
||||
GGML_ASSERT(empty_cell.is_empty());
|
||||
// copy old tail into the empty cell
|
||||
if (seq_meta.tail >= 0) {
|
||||
llama_kv_cell & orig_cell = cache.cells[seq_meta.tail];
|
||||
empty_cell.pos = orig_cell.pos;
|
||||
empty_cell.src = orig_cell.src;
|
||||
orig_cell.seq_id.erase(seq_id);
|
||||
empty_cell.seq_id.insert(seq_id); // will be overwritten
|
||||
}
|
||||
seq_meta.tail = next_empty_cell;
|
||||
// find next empty cell
|
||||
if (s + 1 < n_seqs) {
|
||||
next_empty_cell += 1;
|
||||
for (uint32_t i = 0; i < cache.size; ++i) {
|
||||
if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; }
|
||||
llama_kv_cell & cell = cache.cells[next_empty_cell];
|
||||
if (cell.is_empty()) { break; }
|
||||
next_empty_cell += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (min > seq_meta.tail) { min = seq_meta.tail; }
|
||||
if (max < seq_meta.tail) { max = seq_meta.tail; }
|
||||
}
|
||||
|
||||
// gather and re-order
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
int32_t dst_id = s + min;
|
||||
int32_t src_id = cache.cells[batch.seq_id[s][0]].tail;
|
||||
if (dst_id != src_id) {
|
||||
llama_kv_cell & dst_cell = cache.cells[dst_id];
|
||||
llama_kv_cell & src_cell = cache.cells[src_id];
|
||||
|
||||
std::swap(dst_cell.pos, src_cell.pos);
|
||||
std::swap(dst_cell.src, src_cell.src);
|
||||
std::swap(dst_cell.seq_id, src_cell.seq_id);
|
||||
|
||||
// swap tails (assuming they NEVER overlap)
|
||||
for (const llama_seq_id seq_id : src_cell.seq_id) {
|
||||
cache.cells[seq_id].tail = src_id;
|
||||
}
|
||||
for (const llama_seq_id seq_id : dst_cell.seq_id) {
|
||||
cache.cells[seq_id].tail = dst_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// update the pos of the used seqs
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const llama_pos last_pos = batch.pos[n_seq_tokens * s + n_seq_tokens - 1];
|
||||
int32_t cell_id = s + min;
|
||||
llama_kv_cell & cell = cache.cells[cell_id];
|
||||
|
||||
if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
|
||||
// What should happen when the pos backtracks or skips a value?
|
||||
// Clearing the state mid-batch would require special-casing which isn't done.
|
||||
LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
|
||||
__func__, last_pos, cell.pos, batch.seq_id[s][0], n_seq_tokens);
|
||||
}
|
||||
cell.pos = last_pos;
|
||||
cell.seq_id.clear();
|
||||
for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) {
|
||||
const llama_seq_id seq_id = batch.seq_id[s][j];
|
||||
cell.seq_id.insert(seq_id);
|
||||
cache.cells[seq_id].tail = cell_id;
|
||||
}
|
||||
}
|
||||
|
||||
// allow getting the range of used cells, from head to head + n
|
||||
cache.head = min;
|
||||
cache.n = max - min + 1;
|
||||
cache.used = std::count_if(cache.cells.begin(), cache.cells.end(),
|
||||
[](const llama_kv_cell& cell){ return !cell.is_empty(); });
|
||||
|
||||
// sanity check
|
||||
return llama_kv_cache_slot_info(cache.n >= n_seqs);
|
||||
}
|
||||
// otherwise, one cell per token.
|
||||
|
||||
if (n_tokens > cache.size) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
|
||||
return llama_kv_cache_slot_info_failed;
|
||||
}
|
||||
|
||||
uint32_t n_tested = 0;
|
||||
|
||||
while (true) {
|
||||
if (cache.head + n_tokens > cache.size) {
|
||||
n_tested += cache.size - cache.head;
|
||||
cache.head = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
bool found = true;
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
if (cache.cells[cache.head + i].pos >= 0) {
|
||||
found = false;
|
||||
cache.head += i + 1;
|
||||
n_tested += i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (found) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (n_tested >= cache.size) {
|
||||
//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
|
||||
return llama_kv_cache_slot_info_failed;
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t s = 0; s < n_seqs; s++) {
|
||||
for (uint32_t i = 0; i < n_seq_tokens; ++i) {
|
||||
uint32_t k = s*n_seq_tokens + i;
|
||||
cache.cells[cache.head + k].pos = batch.pos[k];
|
||||
|
||||
for (int32_t j = 0; j < batch.n_seq_id[s]; j++) {
|
||||
cache.cells[cache.head + k].seq_id.insert(batch.seq_id[s][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cache.used += n_tokens;
|
||||
|
||||
return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens);
|
||||
}
|
||||
|
||||
// find how many cells are currently in use
|
||||
static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
|
||||
for (uint32_t i = cache.size; i > 0; --i) {
|
||||
const llama_kv_cell & cell = cache.cells[i - 1];
|
||||
|
||||
if (cell.pos >= 0 && !cell.is_empty()) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
|
||||
for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
|
||||
cache.cells[i].pos = -1;
|
||||
cache.cells[i].seq_id.clear();
|
||||
cache.cells[i].src = -1;
|
||||
cache.cells[i].tail = -1;
|
||||
}
|
||||
cache.head = 0;
|
||||
cache.used = 0;
|
||||
|
||||
for (auto & buf : cache.bufs) {
|
||||
ggml_backend_buffer_clear(buf.get(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
static bool llama_kv_cache_seq_rm(
|
||||
struct llama_kv_cache & cache,
|
||||
llama_seq_id seq_id,
|
||||
llama_pos p0,
|
||||
llama_pos p1) {
|
||||
uint32_t new_head = cache.size;
|
||||
|
||||
if (p0 < 0) p0 = 0;
|
||||
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
|
||||
|
||||
// models like Mamba or RWKV can't have a state partially erased
|
||||
if (cache.recurrent) {
|
||||
if (seq_id >= (int64_t) cache.size) {
|
||||
// could be fatal
|
||||
return false;
|
||||
}
|
||||
if (0 <= seq_id) {
|
||||
int32_t & tail_id = cache.cells[seq_id].tail;
|
||||
if (tail_id >= 0) {
|
||||
const llama_kv_cell & cell = cache.cells[tail_id];
|
||||
// partial intersection is invalid
|
||||
if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
|
||||
return false;
|
||||
}
|
||||
// invalidate tails which will be cleared
|
||||
if (p0 <= cell.pos && cell.pos < p1) {
|
||||
tail_id = -1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// seq_id is negative, then the range should include everything or nothing
|
||||
if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < cache.size; ++i) {
|
||||
if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
|
||||
if (seq_id < 0) {
|
||||
cache.cells[i].seq_id.clear();
|
||||
} else if (cache.cells[i].has_seq_id(seq_id)) {
|
||||
cache.cells[i].seq_id.erase(seq_id);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
if (cache.cells[i].is_empty()) {
|
||||
// keep count of the number of used cells
|
||||
if (cache.cells[i].pos >= 0) cache.used--;
|
||||
|
||||
cache.cells[i].pos = -1;
|
||||
cache.cells[i].src = -1;
|
||||
if (new_head == cache.size) new_head = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we freed up a slot, set head to it so searching can start there.
|
||||
if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void llama_kv_cache_seq_cp(
|
||||
struct llama_kv_cache & cache,
|
||||
llama_seq_id seq_id_src,
|
||||
llama_seq_id seq_id_dst,
|
||||
llama_pos p0,
|
||||
llama_pos p1) {
|
||||
if (p0 < 0) p0 = 0;
|
||||
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
|
||||
|
||||
if (cache.recurrent) {
|
||||
if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) {
|
||||
llama_kv_cell & tail_src = cache.cells[seq_id_src];
|
||||
llama_kv_cell & tail_dst = cache.cells[seq_id_dst];
|
||||
if (tail_dst.tail >= 0) {
|
||||
// clear destination seq_id if it wasn't empty
|
||||
llama_kv_cell & cell_dst = cache.cells[tail_dst.tail];
|
||||
|
||||
cell_dst.seq_id.erase(seq_id_dst);
|
||||
tail_dst.tail = -1;
|
||||
if (cell_dst.seq_id.empty()) {
|
||||
cell_dst.pos = -1;
|
||||
cell_dst.delta = -1;
|
||||
cell_dst.src = -1;
|
||||
cache.used -= 1;
|
||||
}
|
||||
}
|
||||
if (tail_src.tail >= 0) {
|
||||
llama_kv_cell & cell_src = cache.cells[tail_src.tail];
|
||||
|
||||
cell_src.seq_id.insert(seq_id_dst);
|
||||
tail_dst.tail = tail_src.tail;
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
// otherwise, this is the KV cache of a Transformer-like model
|
||||
|
||||
cache.head = 0;
|
||||
|
||||
for (uint32_t i = 0; i < cache.size; ++i) {
|
||||
if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
|
||||
cache.cells[i].seq_id.insert(seq_id_dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
|
||||
uint32_t new_head = cache.size;
|
||||
|
||||
for (uint32_t i = 0; i < cache.size; ++i) {
|
||||
if (cache.recurrent && (llama_seq_id) i != seq_id) {
|
||||
cache.cells[i].tail = -1;
|
||||
}
|
||||
if (!cache.cells[i].has_seq_id(seq_id)) {
|
||||
if (cache.cells[i].pos >= 0) cache.used--;
|
||||
cache.cells[i].pos = -1;
|
||||
cache.cells[i].src = -1;
|
||||
cache.cells[i].seq_id.clear();
|
||||
if (new_head == cache.size) new_head = i;
|
||||
} else {
|
||||
cache.cells[i].seq_id.clear();
|
||||
cache.cells[i].seq_id.insert(seq_id);
|
||||
}
|
||||
}
|
||||
|
||||
// If we freed up a slot, set head to it so searching can start there.
|
||||
if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
|
||||
}
|
||||
|
||||
static void llama_kv_cache_seq_add(
|
||||
struct llama_kv_cache & cache,
|
||||
llama_seq_id seq_id,
|
||||
llama_pos p0,
|
||||
llama_pos p1,
|
||||
llama_pos delta) {
|
||||
uint32_t new_head = cache.size;
|
||||
|
||||
if (p0 < 0) p0 = 0;
|
||||
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
|
||||
// If there is no range then return early to avoid looping over the cache.
|
||||
if (p0 == p1) return;
|
||||
|
||||
if (cache.recurrent) {
|
||||
// for Mamba-like or RWKV models, only the pos needs to be shifted
|
||||
if (0 <= seq_id && seq_id < (int64_t) cache.size) {
|
||||
const int32_t tail_id = cache.cells[seq_id].tail;
|
||||
if (tail_id >= 0) {
|
||||
llama_kv_cell & cell = cache.cells[tail_id];
|
||||
if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
|
||||
cell.pos += delta;
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < cache.size; ++i) {
|
||||
if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
|
||||
cache.has_shift = true;
|
||||
cache.cells[i].pos += delta;
|
||||
cache.cells[i].delta += delta;
|
||||
|
||||
if (cache.cells[i].pos < 0) {
|
||||
if (!cache.cells[i].is_empty()) {
|
||||
cache.used--;
|
||||
}
|
||||
cache.cells[i].pos = -1;
|
||||
cache.cells[i].seq_id.clear();
|
||||
if (new_head == cache.size) {
|
||||
new_head = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we freed up a slot, set head to it so searching can start there.
|
||||
// Otherwise we just start the next search from the beginning.
|
||||
cache.head = new_head != cache.size ? new_head : 0;
|
||||
}
|
||||
|
||||
static void llama_kv_cache_seq_div(
|
||||
struct llama_kv_cache & cache,
|
||||
llama_seq_id seq_id,
|
||||
llama_pos p0,
|
||||
llama_pos p1,
|
||||
int d) {
|
||||
if (p0 < 0) p0 = 0;
|
||||
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
|
||||
// If there is no range then return early to avoid looping over the cache.
|
||||
if (p0 == p1) return;
|
||||
|
||||
if (cache.recurrent) {
|
||||
// for Mamba-like or RWKV models, only the pos needs to be changed
|
||||
if (0 <= seq_id && seq_id < (int64_t) cache.size) {
|
||||
const int32_t tail_id = cache.cells[seq_id].tail;
|
||||
if (tail_id >= 0) {
|
||||
llama_kv_cell & cell = cache.cells[tail_id];
|
||||
if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
|
||||
cell.pos /= d;
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < cache.size; ++i) {
|
||||
if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
|
||||
cache.has_shift = true;
|
||||
|
||||
{
|
||||
llama_pos p_old = cache.cells[i].pos;
|
||||
cache.cells[i].pos /= d;
|
||||
cache.cells[i].delta += cache.cells[i].pos - p_old;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
|
||||
llama_pos result = 0;
|
||||
|
||||
for (uint32_t i = 0; i < cache.size; ++i) {
|
||||
if (cache.cells[i].has_seq_id(seq_id)) {
|
||||
result = std::max(result, cache.cells[i].pos);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
|
||||
if (!cache.recurrent) {
|
||||
cache.do_defrag = true;
|
||||
}
|
||||
}
|
||||
|
||||
// saves the kv_cache state for future recovery.
|
||||
// used to rollback llama_kv_cache_find_slot changes.
|
||||
struct llama_kv_slot_restorer {
|
||||
struct llama_kv_cache_state {
|
||||
uint32_t head = 0;
|
||||
uint32_t n = 0;
|
||||
} old_state;
|
||||
|
||||
// for non-recurrent models only
|
||||
// list of slots to restore
|
||||
std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
|
||||
|
||||
bool do_restore = false;
|
||||
|
||||
explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
|
||||
old_state.head = cache.head;
|
||||
old_state.n = cache.n;
|
||||
}
|
||||
|
||||
// saves a slot information for future restoration
|
||||
void save(const struct llama_kv_cache_slot_info & slot) {
|
||||
if (slot) {
|
||||
do_restore = true;
|
||||
if (slot.boundaries.first != slot.boundaries.second) {
|
||||
slot_boundaries.push_back(slot.boundaries);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// must be explicitly called to restore the kv_cache state
|
||||
// and rollback changes from all llama_kv_cache_find_slot calls
|
||||
void restore(struct llama_kv_cache & cache) {
|
||||
if (do_restore) {
|
||||
cache.head = old_state.head;
|
||||
cache.n = old_state.n;
|
||||
|
||||
if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
|
||||
llama_kv_cache_seq_rm(cache, -1, -1, -1);
|
||||
} else {
|
||||
for (auto & slot : slot_boundaries) {
|
||||
llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
1
src/llama-mmap.cpp
Normal file
1
src/llama-mmap.cpp
Normal file
@ -0,0 +1 @@
|
||||
#include "llama-mmap.h"
|
587
src/llama-mmap.h
Normal file
587
src/llama-mmap.h
Normal file
@ -0,0 +1,587 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<unistd.h>)
|
||||
#include <unistd.h>
|
||||
#if defined(_POSIX_MAPPED_FILES)
|
||||
#include <sys/mman.h>
|
||||
#include <fcntl.h>
|
||||
#endif
|
||||
#if defined(_POSIX_MEMLOCK_RANGE)
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#ifndef PATH_MAX
|
||||
#define PATH_MAX MAX_PATH
|
||||
#endif
|
||||
#include <io.h>
|
||||
#endif
|
||||
|
||||
struct llama_file {
|
||||
|
||||
#if defined(_WIN32)
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
HANDLE fp_win32;
|
||||
size_t size;
|
||||
|
||||
private:
|
||||
std::string GetErrorMessageWin32(DWORD error_code) const {
|
||||
std::string ret;
|
||||
LPSTR lpMsgBuf = NULL;
|
||||
DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
||||
NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
|
||||
if (!bufLen) {
|
||||
ret = format("Win32 error code: %lx", error_code);
|
||||
} else {
|
||||
ret = lpMsgBuf;
|
||||
LocalFree(lpMsgBuf);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
llama_file(const char * fname, const char * mode) {
|
||||
fp = ggml_fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||
}
|
||||
fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
size_t tell() const {
|
||||
// SetFilePointerEx returns the current position when seeking relative 0 bytes
|
||||
LARGE_INTEGER li;
|
||||
li.QuadPart = 0;
|
||||
BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
|
||||
if (!ret) {
|
||||
throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
|
||||
}
|
||||
|
||||
return li.QuadPart;
|
||||
}
|
||||
|
||||
void seek(size_t offset, int whence) const {
|
||||
// no need to convert SEEK_* to FILE_*. The enums are the same.
|
||||
// Still, keep static asserts to avoid failures in the future.
|
||||
static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
|
||||
static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
|
||||
static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
|
||||
|
||||
LARGE_INTEGER li;
|
||||
li.QuadPart = offset;
|
||||
BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
|
||||
if (!ret) {
|
||||
throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t len) const {
|
||||
// On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus
|
||||
// use the Win32 API to do file io instead of the C/C++ library functions.
|
||||
|
||||
// There are conditions under which ReadFile cannot read chunks >64MB.
|
||||
// Thus split the operation into smaller chunks if len exceeds this limit.
|
||||
size_t bytes_read = 0;
|
||||
while (bytes_read < len) {
|
||||
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
|
||||
DWORD chunk_read = 0;
|
||||
BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
|
||||
if (!result) {
|
||||
throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
|
||||
}
|
||||
if (chunk_read < chunk_size || chunk_read == 0) {
|
||||
throw std::runtime_error("unexpectedly reached end of file");
|
||||
}
|
||||
|
||||
bytes_read += chunk_read;
|
||||
} ;
|
||||
}
|
||||
|
||||
uint32_t read_u32() const {
|
||||
uint32_t val;
|
||||
read_raw(&val, sizeof(val));
|
||||
return val;
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t len) const {
|
||||
// There are conditions under which WriteFile cannot write chunks >64MB.
|
||||
// Thus split the operation into smaller chunks if len exceeds this limit.
|
||||
size_t bytes_written = 0;
|
||||
while (bytes_written < len) {
|
||||
size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
|
||||
DWORD chunk_written = 0;
|
||||
BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
|
||||
if (!result) {
|
||||
throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
|
||||
}
|
||||
if (chunk_written < chunk_size || chunk_written == 0) {
|
||||
throw std::runtime_error("unexpectedly failed to write bytes");
|
||||
}
|
||||
|
||||
bytes_written += chunk_written;
|
||||
}
|
||||
}
|
||||
|
||||
void write_u32(std::uint32_t val) const {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
~llama_file() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
}
|
||||
#else
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
|
||||
llama_file(const char * fname, const char * mode) {
|
||||
fp = ggml_fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||
}
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
size_t tell() const {
|
||||
#ifdef _WIN32
|
||||
__int64 ret = _ftelli64(fp);
|
||||
#else
|
||||
long ret = std::ftell(fp);
|
||||
#endif
|
||||
if (ret == -1) {
|
||||
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
|
||||
}
|
||||
|
||||
return (size_t) ret;
|
||||
}
|
||||
|
||||
void seek(size_t offset, int whence) const {
|
||||
#ifdef _WIN32
|
||||
int ret = _fseeki64(fp, (__int64) offset, whence);
|
||||
#else
|
||||
int ret = std::fseek(fp, (long) offset, whence);
|
||||
#endif
|
||||
if (ret != 0) {
|
||||
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
||||
}
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t len) const {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
std::size_t ret = std::fread(ptr, len, 1, fp);
|
||||
if (ferror(fp)) {
|
||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
||||
}
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error("unexpectedly reached end of file");
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t read_u32() const {
|
||||
uint32_t ret;
|
||||
read_raw(&ret, sizeof(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t len) const {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
size_t ret = std::fwrite(ptr, len, 1, fp);
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
||||
}
|
||||
}
|
||||
|
||||
void write_u32(std::uint32_t val) const {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
~llama_file() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
};
|
||||
using llama_files = std::vector<std::unique_ptr<llama_file>>;
|
||||
|
||||
struct llama_mmap {
|
||||
void * addr;
|
||||
size_t size;
|
||||
|
||||
llama_mmap(const llama_mmap &) = delete;
|
||||
|
||||
#ifdef _POSIX_MAPPED_FILES
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
// list of mapped fragments (first_offset, last_offset)
|
||||
std::vector<std::pair<size_t, size_t>> mapped_fragments;
|
||||
|
||||
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
|
||||
size = file->size;
|
||||
int fd = fileno(file->fp);
|
||||
int flags = MAP_SHARED;
|
||||
// prefetch/readahead impairs performance on NUMA systems
|
||||
if (numa) { prefetch = 0; }
|
||||
#ifdef __linux__
|
||||
// advise the kernel to read the file sequentially (increases readahead)
|
||||
if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
|
||||
LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
if (prefetch) { flags |= MAP_POPULATE; }
|
||||
#endif
|
||||
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
||||
if (addr == MAP_FAILED) { // NOLINT
|
||||
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
||||
}
|
||||
|
||||
if (prefetch > 0) {
|
||||
// advise the kernel to preload the mapped memory
|
||||
if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
|
||||
LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
}
|
||||
if (numa) {
|
||||
// advise the kernel not to use readahead
|
||||
// (because the next page might not belong on the same node)
|
||||
if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
|
||||
LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
}
|
||||
|
||||
// initialize list of mapped_fragments
|
||||
mapped_fragments.emplace_back(0, file->size);
|
||||
}
|
||||
|
||||
static void align_range(size_t * first, size_t * last, size_t page_size) {
|
||||
// align first to the next page
|
||||
size_t offset_in_page = *first & (page_size - 1);
|
||||
size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
|
||||
*first += offset_to_page;
|
||||
|
||||
// align last to the previous page
|
||||
*last = *last & ~(page_size - 1);
|
||||
|
||||
if (*last <= *first) {
|
||||
*last = *first;
|
||||
}
|
||||
}
|
||||
|
||||
// partially unmap the file in the range [first, last)
|
||||
void unmap_fragment(size_t first, size_t last) {
|
||||
// note: this function must not be called multiple times with overlapping ranges
|
||||
// otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings
|
||||
int page_size = sysconf(_SC_PAGESIZE);
|
||||
align_range(&first, &last, page_size);
|
||||
size_t len = last - first;
|
||||
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(first % page_size == 0);
|
||||
GGML_ASSERT(last % page_size == 0);
|
||||
GGML_ASSERT(last > first);
|
||||
|
||||
void * next_page_start = (uint8_t *) addr + first;
|
||||
|
||||
// unmap the range
|
||||
if (munmap(next_page_start, len)) {
|
||||
LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
|
||||
}
|
||||
|
||||
// update the list of mapped fragments to avoid unmapping the same range again in the destructor
|
||||
std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
|
||||
for (const auto & frag : mapped_fragments) {
|
||||
if (frag.first < first && frag.second > last) {
|
||||
// the range is in the middle of the fragment, split it
|
||||
new_mapped_fragments.emplace_back(frag.first, first);
|
||||
new_mapped_fragments.emplace_back(last, frag.second);
|
||||
} else if (frag.first < first && frag.second > first) {
|
||||
// the range starts in the middle of the fragment
|
||||
new_mapped_fragments.emplace_back(frag.first, first);
|
||||
} else if (frag.first < last && frag.second > last) {
|
||||
// the range ends in the middle of the fragment
|
||||
new_mapped_fragments.emplace_back(last, frag.second);
|
||||
} else if (frag.first >= first && frag.second <= last) {
|
||||
// the range covers the entire fragment
|
||||
} else {
|
||||
// the range is outside the fragment
|
||||
new_mapped_fragments.push_back(frag);
|
||||
}
|
||||
}
|
||||
mapped_fragments = std::move(new_mapped_fragments);
|
||||
}
|
||||
|
||||
~llama_mmap() {
|
||||
for (const auto & frag : mapped_fragments) {
|
||||
if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
|
||||
LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
|
||||
GGML_UNUSED(numa);
|
||||
|
||||
size = file->size;
|
||||
|
||||
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
|
||||
|
||||
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
|
||||
if (hMapping == NULL) {
|
||||
DWORD error = GetLastError();
|
||||
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
||||
DWORD error = GetLastError();
|
||||
CloseHandle(hMapping);
|
||||
|
||||
if (addr == NULL) {
|
||||
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
if (prefetch > 0) {
|
||||
#if _WIN32_WINNT >= 0x602
|
||||
// PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
|
||||
BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
|
||||
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
|
||||
|
||||
// may fail on pre-Windows 8 systems
|
||||
pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
|
||||
|
||||
if (pPrefetchVirtualMemory) {
|
||||
// advise the kernel to preload the mapped memory
|
||||
WIN32_MEMORY_RANGE_ENTRY range;
|
||||
range.VirtualAddress = addr;
|
||||
range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
|
||||
if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
||||
LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
throw std::runtime_error("PrefetchVirtualMemory unavailable");
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
void unmap_fragment(size_t first, size_t last) {
|
||||
// not supported
|
||||
GGML_UNUSED(first);
|
||||
GGML_UNUSED(last);
|
||||
}
|
||||
|
||||
~llama_mmap() {
|
||||
if (!UnmapViewOfFile(addr)) {
|
||||
LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) {
|
||||
GGML_UNUSED(file);
|
||||
GGML_UNUSED(prefetch);
|
||||
GGML_UNUSED(numa);
|
||||
|
||||
throw std::runtime_error("mmap not supported");
|
||||
}
|
||||
|
||||
void unmap_fragment(size_t first, size_t last) {
|
||||
GGML_UNUSED(first);
|
||||
GGML_UNUSED(last);
|
||||
|
||||
throw std::runtime_error("mmap not supported");
|
||||
}
|
||||
#endif
|
||||
};
|
||||
using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
|
||||
|
||||
// Represents some region of memory being locked using mlock or VirtualLock;
|
||||
// will automatically unlock on destruction.
|
||||
struct llama_mlock {
|
||||
void * addr = NULL;
|
||||
size_t size = 0;
|
||||
|
||||
bool failed_already = false;
|
||||
|
||||
llama_mlock() {}
|
||||
llama_mlock(const llama_mlock &) = delete;
|
||||
|
||||
~llama_mlock() {
|
||||
if (size) {
|
||||
raw_unlock(addr, size);
|
||||
}
|
||||
}
|
||||
|
||||
void init(void * ptr) {
|
||||
GGML_ASSERT(addr == NULL && size == 0); // NOLINT
|
||||
addr = ptr;
|
||||
}
|
||||
|
||||
void grow_to(size_t target_size) {
|
||||
GGML_ASSERT(addr);
|
||||
if (failed_already) {
|
||||
return;
|
||||
}
|
||||
size_t granularity = lock_granularity();
|
||||
target_size = (target_size + granularity - 1) & ~(granularity - 1);
|
||||
if (target_size > size) {
|
||||
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
|
||||
size = target_size;
|
||||
} else {
|
||||
failed_already = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _POSIX_MEMLOCK_RANGE
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
static size_t lock_granularity() {
|
||||
return (size_t) sysconf(_SC_PAGESIZE);
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
|
||||
#else
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
|
||||
#endif
|
||||
|
||||
bool raw_lock(const void * addr, size_t size) const {
|
||||
if (!mlock(addr, size)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
char* errmsg = std::strerror(errno);
|
||||
bool suggest = (errno == ENOMEM);
|
||||
|
||||
// Check if the resource limit is fine after all
|
||||
struct rlimit lock_limit;
|
||||
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
|
||||
suggest = false;
|
||||
}
|
||||
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
|
||||
suggest = false;
|
||||
}
|
||||
|
||||
LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
|
||||
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
|
||||
return false;
|
||||
}
|
||||
|
||||
#undef MLOCK_SUGGESTION
|
||||
|
||||
static void raw_unlock(void * addr, size_t size) {
|
||||
if (munlock(addr, size)) {
|
||||
LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
|
||||
}
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
static size_t lock_granularity() {
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si);
|
||||
return (size_t) si.dwPageSize;
|
||||
}
|
||||
|
||||
bool raw_lock(void * ptr, size_t len) const {
|
||||
for (int tries = 1; ; tries++) {
|
||||
if (VirtualLock(ptr, len)) {
|
||||
return true;
|
||||
}
|
||||
if (tries == 2) {
|
||||
LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
||||
len, size, llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// It failed but this was only the first try; increase the working
|
||||
// set size and try again.
|
||||
SIZE_T min_ws_size, max_ws_size;
|
||||
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
|
||||
LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
// Per MSDN: "The maximum number of pages that a process can lock
|
||||
// is equal to the number of pages in its minimum working set minus
|
||||
// a small overhead."
|
||||
// Hopefully a megabyte is enough overhead:
|
||||
size_t increment = len + 1048576;
|
||||
// The minimum must be <= the maximum, so we need to increase both:
|
||||
min_ws_size += increment;
|
||||
max_ws_size += increment;
|
||||
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
|
||||
LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void raw_unlock(void * ptr, size_t len) {
|
||||
if (!VirtualUnlock(ptr, len)) {
|
||||
LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
static size_t lock_granularity() {
|
||||
return (size_t) 65536;
|
||||
}
|
||||
|
||||
bool raw_lock(const void * addr, size_t len) const {
|
||||
LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
static void raw_unlock(const void * addr, size_t len) {}
|
||||
#endif
|
||||
};
|
||||
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
||||
|
1
src/llama-model.cpp
Normal file
1
src/llama-model.cpp
Normal file
@ -0,0 +1 @@
|
||||
#include "llama-model.h"
|
650
src/llama-model.h
Normal file
650
src/llama-model.h
Normal file
@ -0,0 +1,650 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-arch.h"
|
||||
#include "llama-vocab.h"
|
||||
#include "llama-mmap.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
|
||||
// bump if necessary
|
||||
#define LLAMA_MAX_LAYERS 512
|
||||
#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
|
||||
|
||||
// available llama models
|
||||
enum e_model {
|
||||
MODEL_UNKNOWN,
|
||||
MODEL_14M,
|
||||
MODEL_17M,
|
||||
MODEL_22M,
|
||||
MODEL_33M,
|
||||
MODEL_60M,
|
||||
MODEL_70M,
|
||||
MODEL_80M,
|
||||
MODEL_109M,
|
||||
MODEL_137M,
|
||||
MODEL_160M,
|
||||
MODEL_220M,
|
||||
MODEL_250M,
|
||||
MODEL_270M,
|
||||
MODEL_335M,
|
||||
MODEL_410M,
|
||||
MODEL_450M,
|
||||
MODEL_770M,
|
||||
MODEL_780M,
|
||||
MODEL_0_5B,
|
||||
MODEL_1B,
|
||||
MODEL_1_3B,
|
||||
MODEL_1_4B,
|
||||
MODEL_1_5B,
|
||||
MODEL_1_6B,
|
||||
MODEL_2B,
|
||||
MODEL_2_8B,
|
||||
MODEL_3B,
|
||||
MODEL_4B,
|
||||
MODEL_6B,
|
||||
MODEL_6_9B,
|
||||
MODEL_7B,
|
||||
MODEL_8B,
|
||||
MODEL_9B,
|
||||
MODEL_11B,
|
||||
MODEL_12B,
|
||||
MODEL_13B,
|
||||
MODEL_14B,
|
||||
MODEL_15B,
|
||||
MODEL_16B,
|
||||
MODEL_20B,
|
||||
MODEL_30B,
|
||||
MODEL_32B,
|
||||
MODEL_34B,
|
||||
MODEL_35B,
|
||||
MODEL_40B,
|
||||
MODEL_65B,
|
||||
MODEL_70B,
|
||||
MODEL_236B,
|
||||
MODEL_314B,
|
||||
MODEL_SMALL,
|
||||
MODEL_MEDIUM,
|
||||
MODEL_LARGE,
|
||||
MODEL_XL,
|
||||
MODEL_A1_7B,
|
||||
MODEL_A2_7B,
|
||||
MODEL_8x7B,
|
||||
MODEL_8x22B,
|
||||
MODEL_16x12B,
|
||||
MODEL_10B_128x3_66B,
|
||||
MODEL_57B_A14B,
|
||||
MODEL_27B,
|
||||
};
|
||||
|
||||
static const char * llama_model_type_name(e_model type) {
|
||||
switch (type) {
|
||||
case MODEL_14M: return "14M";
|
||||
case MODEL_17M: return "17M";
|
||||
case MODEL_22M: return "22M";
|
||||
case MODEL_33M: return "33M";
|
||||
case MODEL_60M: return "60M";
|
||||
case MODEL_70M: return "70M";
|
||||
case MODEL_80M: return "80M";
|
||||
case MODEL_109M: return "109M";
|
||||
case MODEL_137M: return "137M";
|
||||
case MODEL_160M: return "160M";
|
||||
case MODEL_220M: return "220M";
|
||||
case MODEL_250M: return "250M";
|
||||
case MODEL_270M: return "270M";
|
||||
case MODEL_335M: return "335M";
|
||||
case MODEL_410M: return "410M";
|
||||
case MODEL_450M: return "450M";
|
||||
case MODEL_770M: return "770M";
|
||||
case MODEL_780M: return "780M";
|
||||
case MODEL_0_5B: return "0.5B";
|
||||
case MODEL_1B: return "1B";
|
||||
case MODEL_1_3B: return "1.3B";
|
||||
case MODEL_1_4B: return "1.4B";
|
||||
case MODEL_1_5B: return "1.5B";
|
||||
case MODEL_1_6B: return "1.6B";
|
||||
case MODEL_2B: return "2B";
|
||||
case MODEL_2_8B: return "2.8B";
|
||||
case MODEL_3B: return "3B";
|
||||
case MODEL_4B: return "4B";
|
||||
case MODEL_6B: return "6B";
|
||||
case MODEL_6_9B: return "6.9B";
|
||||
case MODEL_7B: return "7B";
|
||||
case MODEL_8B: return "8B";
|
||||
case MODEL_9B: return "9B";
|
||||
case MODEL_11B: return "11B";
|
||||
case MODEL_12B: return "12B";
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_14B: return "14B";
|
||||
case MODEL_15B: return "15B";
|
||||
case MODEL_16B: return "16B";
|
||||
case MODEL_20B: return "20B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_32B: return "32B";
|
||||
case MODEL_34B: return "34B";
|
||||
case MODEL_35B: return "35B";
|
||||
case MODEL_40B: return "40B";
|
||||
case MODEL_65B: return "65B";
|
||||
case MODEL_70B: return "70B";
|
||||
case MODEL_236B: return "236B";
|
||||
case MODEL_314B: return "314B";
|
||||
case MODEL_SMALL: return "0.1B";
|
||||
case MODEL_MEDIUM: return "0.4B";
|
||||
case MODEL_LARGE: return "0.8B";
|
||||
case MODEL_XL: return "1.5B";
|
||||
case MODEL_A1_7B: return "A1.7B";
|
||||
case MODEL_A2_7B: return "A2.7B";
|
||||
case MODEL_8x7B: return "8x7B";
|
||||
case MODEL_8x22B: return "8x22B";
|
||||
case MODEL_16x12B: return "16x12B";
|
||||
case MODEL_10B_128x3_66B: return "10B+128x3.66B";
|
||||
case MODEL_57B_A14B: return "57B.A14B";
|
||||
case MODEL_27B: return "27B";
|
||||
default: return "?B";
|
||||
}
|
||||
}
|
||||
|
||||
struct llama_hparams_posnet {
|
||||
uint32_t n_embd;
|
||||
uint32_t n_layer;
|
||||
};
|
||||
|
||||
struct llama_hparams_convnext {
|
||||
uint32_t n_embd;
|
||||
uint32_t n_layer;
|
||||
};
|
||||
|
||||
struct llama_hparams {
|
||||
bool vocab_only;
|
||||
bool rope_finetuned;
|
||||
bool use_par_res;
|
||||
bool swin_norm;
|
||||
|
||||
uint32_t n_vocab = 0;
|
||||
uint32_t n_ctx_train; // context size the model was trained on
|
||||
uint32_t n_embd;
|
||||
uint32_t n_embd_features = 0;
|
||||
uint32_t n_layer;
|
||||
uint32_t n_rot;
|
||||
uint32_t n_swa = 0; // sliding window attention (SWA)
|
||||
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
|
||||
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
|
||||
uint32_t n_expert = 0;
|
||||
uint32_t n_expert_used = 0;
|
||||
uint32_t n_vocab_type = 0; // for BERT-style token types
|
||||
uint32_t n_rel_attn_bkts = 0;
|
||||
|
||||
// for WavTokenizer
|
||||
struct llama_hparams_posnet posnet;
|
||||
struct llama_hparams_convnext convnext;
|
||||
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
|
||||
|
||||
uint32_t n_layer_dense_lead = 0;
|
||||
uint32_t n_lora_q = 0;
|
||||
uint32_t n_lora_kv = 0;
|
||||
uint32_t n_ff_exp = 0;
|
||||
uint32_t n_ff_shexp = 0;
|
||||
uint32_t n_expert_shared = 0;
|
||||
float expert_weights_scale = 0.0;
|
||||
|
||||
float f_norm_eps;
|
||||
float f_norm_rms_eps;
|
||||
float f_norm_group_eps;
|
||||
|
||||
uint32_t n_norm_groups;
|
||||
|
||||
float f_attn_logit_softcapping = 50.0f;
|
||||
float f_final_logit_softcapping = 30.0f;
|
||||
|
||||
// for RWKV
|
||||
uint32_t rescale_every_n_layers = 0;
|
||||
uint32_t time_mix_extra_dim = 0;
|
||||
uint32_t time_decay_extra_dim = 0;
|
||||
uint32_t wkv_head_size = 0;
|
||||
|
||||
float rope_attn_factor = 1.0f;
|
||||
float rope_freq_base_train;
|
||||
float rope_freq_scale_train;
|
||||
uint32_t n_ctx_orig_yarn;
|
||||
float rope_yarn_log_mul;
|
||||
int rope_sections[4];
|
||||
|
||||
// for State Space Models
|
||||
uint32_t ssm_d_conv = 0;
|
||||
uint32_t ssm_d_inner = 0;
|
||||
uint32_t ssm_d_state = 0;
|
||||
uint32_t ssm_dt_rank = 0;
|
||||
bool ssm_dt_b_c_rms = false;
|
||||
|
||||
float f_clamp_kqv = 0.0f;
|
||||
float f_max_alibi_bias = 0.0f;
|
||||
float f_logit_scale = 0.0f;
|
||||
|
||||
// Additional scale factors (Granite/Granite MoE)
|
||||
float f_residual_scale = 0.0f;
|
||||
float f_embedding_scale = 0.0f;
|
||||
float f_attention_scale = 0.0f;
|
||||
|
||||
bool causal_attn = true;
|
||||
bool use_alibi = false;
|
||||
bool attn_soft_cap = false;
|
||||
|
||||
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
|
||||
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
||||
|
||||
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
|
||||
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
||||
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
|
||||
|
||||
uint32_t n_head(uint32_t il = 0) const {
|
||||
if (il < n_layer) {
|
||||
return n_head_arr[il];
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
uint32_t n_head_kv(uint32_t il = 0) const {
|
||||
if (il < n_layer) {
|
||||
return n_head_kv_arr[il];
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
uint32_t n_ff(uint32_t il = 0) const {
|
||||
if (il < n_layer) {
|
||||
return n_ff_arr[il];
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
uint32_t n_gqa(uint32_t il = 0) const {
|
||||
const uint32_t n_head = this->n_head(il);
|
||||
const uint32_t n_head_kv = this->n_head_kv(il);
|
||||
|
||||
if (n_head_kv == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return n_head/n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t n_embd_k_gqa(uint32_t il = 0) const { // dimension of key embeddings across all k-v heads
|
||||
const uint32_t n_head_kv = this->n_head_kv(il);
|
||||
|
||||
return n_embd_head_k * n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t n_embd_v_gqa(uint32_t il = 0) const { // dimension of value embeddings across all k-v heads
|
||||
const uint32_t n_head_kv = this->n_head_kv(il);
|
||||
|
||||
return n_embd_head_v * n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings
|
||||
// corresponds to Mamba's conv_states size or RWKV's token_shift states size
|
||||
if (wkv_head_size != 0) {
|
||||
// for RWKV models
|
||||
return 2 * n_embd;
|
||||
}
|
||||
|
||||
// TODO: maybe support other convolution strides than 1
|
||||
// NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
|
||||
return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
|
||||
}
|
||||
|
||||
uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
|
||||
if (wkv_head_size != 0) {
|
||||
// corresponds to RWKV's wkv_states size
|
||||
return n_embd * wkv_head_size;
|
||||
}
|
||||
|
||||
// corresponds to Mamba's ssm_states size
|
||||
return ssm_d_state * ssm_d_inner;
|
||||
}
|
||||
};
|
||||
|
||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||
|
||||
struct llama_layer_posnet {
|
||||
// resnet
|
||||
struct ggml_tensor * norm1 = nullptr;
|
||||
struct ggml_tensor * norm1_b = nullptr;
|
||||
|
||||
struct ggml_tensor * conv1 = nullptr;
|
||||
struct ggml_tensor * conv1_b = nullptr;
|
||||
|
||||
struct ggml_tensor * norm2 = nullptr;
|
||||
struct ggml_tensor * norm2_b = nullptr;
|
||||
|
||||
struct ggml_tensor * conv2 = nullptr;
|
||||
struct ggml_tensor * conv2_b = nullptr;
|
||||
|
||||
// attention
|
||||
struct ggml_tensor * attn_norm = nullptr;
|
||||
struct ggml_tensor * attn_norm_b = nullptr;
|
||||
|
||||
struct ggml_tensor * attn_q = nullptr;
|
||||
struct ggml_tensor * attn_q_b = nullptr;
|
||||
|
||||
struct ggml_tensor * attn_k = nullptr;
|
||||
struct ggml_tensor * attn_k_b = nullptr;
|
||||
|
||||
struct ggml_tensor * attn_v = nullptr;
|
||||
struct ggml_tensor * attn_v_b = nullptr;
|
||||
|
||||
struct ggml_tensor * attn_o = nullptr;
|
||||
struct ggml_tensor * attn_o_b = nullptr;
|
||||
|
||||
// normalize
|
||||
struct ggml_tensor * norm = nullptr;
|
||||
struct ggml_tensor * norm_b = nullptr;
|
||||
};
|
||||
|
||||
struct llama_layer_convnext {
|
||||
struct ggml_tensor * dw = nullptr;
|
||||
struct ggml_tensor * dw_b = nullptr;
|
||||
|
||||
struct ggml_tensor * norm = nullptr;
|
||||
struct ggml_tensor * norm_b = nullptr;
|
||||
|
||||
struct ggml_tensor * pw1 = nullptr;
|
||||
struct ggml_tensor * pw1_b = nullptr;
|
||||
|
||||
struct ggml_tensor * pw2 = nullptr;
|
||||
struct ggml_tensor * pw2_b = nullptr;
|
||||
|
||||
struct ggml_tensor * gamma = nullptr;
|
||||
};
|
||||
|
||||
struct llama_layer {
|
||||
// normalization
|
||||
struct ggml_tensor * attn_norm = nullptr;
|
||||
struct ggml_tensor * attn_norm_b = nullptr;
|
||||
struct ggml_tensor * attn_norm_2 = nullptr;
|
||||
struct ggml_tensor * attn_norm_2_b = nullptr;
|
||||
struct ggml_tensor * attn_q_norm = nullptr;
|
||||
struct ggml_tensor * attn_q_norm_b = nullptr;
|
||||
struct ggml_tensor * attn_k_norm = nullptr;
|
||||
struct ggml_tensor * attn_k_norm_b = nullptr;
|
||||
struct ggml_tensor * attn_out_norm = nullptr;
|
||||
struct ggml_tensor * attn_out_norm_b = nullptr;
|
||||
struct ggml_tensor * attn_q_a_norm = nullptr;
|
||||
struct ggml_tensor * attn_kv_a_norm = nullptr;
|
||||
struct ggml_tensor * attn_sub_norm = nullptr;
|
||||
struct ggml_tensor * attn_post_norm = nullptr;
|
||||
struct ggml_tensor * ffn_sub_norm = nullptr;
|
||||
struct ggml_tensor * attn_norm_cross = nullptr;
|
||||
struct ggml_tensor * attn_norm_enc = nullptr;
|
||||
|
||||
// attention
|
||||
struct ggml_tensor * wq = nullptr;
|
||||
struct ggml_tensor * wk = nullptr;
|
||||
struct ggml_tensor * wv = nullptr;
|
||||
struct ggml_tensor * wo = nullptr;
|
||||
struct ggml_tensor * wqkv = nullptr;
|
||||
struct ggml_tensor * wq_a = nullptr;
|
||||
struct ggml_tensor * wq_b = nullptr;
|
||||
struct ggml_tensor * wkv_a_mqa = nullptr;
|
||||
struct ggml_tensor * wkv_b = nullptr;
|
||||
struct ggml_tensor * wq_cross = nullptr;
|
||||
struct ggml_tensor * wk_cross = nullptr;
|
||||
struct ggml_tensor * wv_cross = nullptr;
|
||||
struct ggml_tensor * wo_cross = nullptr;
|
||||
struct ggml_tensor * wq_enc = nullptr;
|
||||
struct ggml_tensor * wk_enc = nullptr;
|
||||
struct ggml_tensor * wv_enc = nullptr;
|
||||
struct ggml_tensor * wo_enc = nullptr;
|
||||
|
||||
// attention bias
|
||||
struct ggml_tensor * bq = nullptr;
|
||||
struct ggml_tensor * bk = nullptr;
|
||||
struct ggml_tensor * bv = nullptr;
|
||||
struct ggml_tensor * bo = nullptr;
|
||||
struct ggml_tensor * bqkv = nullptr;
|
||||
|
||||
// relative position bias
|
||||
struct ggml_tensor * attn_rel_b = nullptr;
|
||||
struct ggml_tensor * attn_rel_b_enc = nullptr;
|
||||
struct ggml_tensor * attn_rel_b_cross = nullptr;
|
||||
|
||||
// normalization
|
||||
struct ggml_tensor * ffn_norm = nullptr;
|
||||
struct ggml_tensor * ffn_norm_b = nullptr;
|
||||
struct ggml_tensor * ffn_post_norm = nullptr;
|
||||
struct ggml_tensor * layer_out_norm = nullptr;
|
||||
struct ggml_tensor * layer_out_norm_b = nullptr;
|
||||
struct ggml_tensor * ffn_norm_exps = nullptr;
|
||||
struct ggml_tensor * ffn_norm_enc = nullptr;
|
||||
|
||||
// ff
|
||||
struct ggml_tensor * ffn_gate = nullptr; // w1
|
||||
struct ggml_tensor * ffn_down = nullptr; // w2
|
||||
struct ggml_tensor * ffn_up = nullptr; // w3
|
||||
struct ggml_tensor * ffn_gate_enc = nullptr;
|
||||
struct ggml_tensor * ffn_down_enc = nullptr;
|
||||
struct ggml_tensor * ffn_up_enc = nullptr;
|
||||
|
||||
// ff MoE
|
||||
struct ggml_tensor * ffn_gate_inp = nullptr;
|
||||
struct ggml_tensor * ffn_gate_exps = nullptr;
|
||||
struct ggml_tensor * ffn_down_exps = nullptr;
|
||||
struct ggml_tensor * ffn_up_exps = nullptr;
|
||||
|
||||
// ff shared expert (shexp)
|
||||
struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
|
||||
struct ggml_tensor * ffn_gate_shexp = nullptr;
|
||||
struct ggml_tensor * ffn_down_shexp = nullptr;
|
||||
struct ggml_tensor * ffn_up_shexp = nullptr;
|
||||
|
||||
// ff bias
|
||||
struct ggml_tensor * ffn_gate_b = nullptr;
|
||||
struct ggml_tensor * ffn_down_b = nullptr; // b2
|
||||
struct ggml_tensor * ffn_up_b = nullptr; // b3
|
||||
struct ggml_tensor * ffn_act = nullptr;
|
||||
|
||||
// mamba proj
|
||||
struct ggml_tensor * ssm_in = nullptr;
|
||||
struct ggml_tensor * ssm_x = nullptr;
|
||||
struct ggml_tensor * ssm_dt = nullptr;
|
||||
struct ggml_tensor * ssm_out = nullptr;
|
||||
|
||||
// mamba
|
||||
struct ggml_tensor * ssm_conv1d = nullptr;
|
||||
struct ggml_tensor * ssm_a = nullptr;
|
||||
struct ggml_tensor * ssm_d = nullptr;
|
||||
|
||||
// mamba bias
|
||||
struct ggml_tensor * ssm_conv1d_b = nullptr;
|
||||
struct ggml_tensor * ssm_dt_b = nullptr;
|
||||
|
||||
// rwkv
|
||||
struct ggml_tensor * time_mix_w1 = nullptr;
|
||||
struct ggml_tensor * time_mix_w2 = nullptr;
|
||||
struct ggml_tensor * time_mix_lerp_x = nullptr;
|
||||
struct ggml_tensor * time_mix_lerp_w = nullptr;
|
||||
struct ggml_tensor * time_mix_lerp_k = nullptr;
|
||||
struct ggml_tensor * time_mix_lerp_v = nullptr;
|
||||
struct ggml_tensor * time_mix_lerp_r = nullptr;
|
||||
struct ggml_tensor * time_mix_lerp_g = nullptr;
|
||||
|
||||
struct ggml_tensor * time_mix_first = nullptr;
|
||||
struct ggml_tensor * time_mix_decay = nullptr;
|
||||
struct ggml_tensor * time_mix_decay_w1 = nullptr;
|
||||
struct ggml_tensor * time_mix_decay_w2 = nullptr;
|
||||
struct ggml_tensor * time_mix_key = nullptr;
|
||||
struct ggml_tensor * time_mix_value = nullptr;
|
||||
struct ggml_tensor * time_mix_receptance = nullptr;
|
||||
struct ggml_tensor * time_mix_gate = nullptr;
|
||||
|
||||
struct ggml_tensor * time_mix_ln = nullptr;
|
||||
struct ggml_tensor * time_mix_ln_b = nullptr;
|
||||
struct ggml_tensor * time_mix_output = nullptr;
|
||||
|
||||
struct ggml_tensor * channel_mix_lerp_k = nullptr;
|
||||
struct ggml_tensor * channel_mix_lerp_r = nullptr;
|
||||
|
||||
struct ggml_tensor * channel_mix_key = nullptr;
|
||||
struct ggml_tensor * channel_mix_receptance = nullptr;
|
||||
struct ggml_tensor * channel_mix_value = nullptr;
|
||||
|
||||
// long rope factors
|
||||
struct ggml_tensor * rope_long = nullptr;
|
||||
struct ggml_tensor * rope_short = nullptr;
|
||||
struct ggml_tensor * rope_freqs = nullptr;
|
||||
|
||||
// bitnet scale
|
||||
struct ggml_tensor * wq_scale = nullptr;
|
||||
struct ggml_tensor * wk_scale = nullptr;
|
||||
struct ggml_tensor * wv_scale = nullptr;
|
||||
struct ggml_tensor * wo_scale = nullptr;
|
||||
struct ggml_tensor * ffn_gate_scale = nullptr;
|
||||
struct ggml_tensor * ffn_up_scale = nullptr;
|
||||
struct ggml_tensor * ffn_down_scale = nullptr;
|
||||
|
||||
struct llama_layer_posnet posnet;
|
||||
|
||||
struct llama_layer_convnext convnext;
|
||||
};
|
||||
|
||||
struct llama_model {
|
||||
e_model type = MODEL_UNKNOWN;
|
||||
llm_arch arch = LLM_ARCH_UNKNOWN;
|
||||
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
|
||||
|
||||
std::string name = "n/a";
|
||||
|
||||
llama_hparams hparams = {};
|
||||
llama_vocab vocab;
|
||||
|
||||
struct ggml_tensor * tok_embd = nullptr;
|
||||
struct ggml_tensor * type_embd = nullptr;
|
||||
struct ggml_tensor * pos_embd = nullptr;
|
||||
struct ggml_tensor * tok_norm = nullptr;
|
||||
struct ggml_tensor * tok_norm_b = nullptr;
|
||||
|
||||
struct ggml_tensor * output_norm = nullptr;
|
||||
struct ggml_tensor * output_norm_b = nullptr;
|
||||
struct ggml_tensor * output = nullptr;
|
||||
struct ggml_tensor * output_b = nullptr;
|
||||
struct ggml_tensor * output_norm_enc = nullptr;
|
||||
|
||||
// classifier
|
||||
struct ggml_tensor * cls = nullptr;
|
||||
struct ggml_tensor * cls_b = nullptr;
|
||||
struct ggml_tensor * cls_out = nullptr;
|
||||
struct ggml_tensor * cls_out_b = nullptr;
|
||||
|
||||
struct ggml_tensor * conv1d = nullptr;
|
||||
struct ggml_tensor * conv1d_b = nullptr;
|
||||
|
||||
std::vector<llama_layer> layers;
|
||||
|
||||
// gguf metadata
|
||||
std::unordered_map<std::string, std::string> gguf_kv;
|
||||
|
||||
llama_split_mode split_mode;
|
||||
int main_gpu;
|
||||
int n_gpu_layers;
|
||||
|
||||
std::vector<std::string> rpc_servers;
|
||||
|
||||
// list of devices used in this model
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
|
||||
|
||||
// lists of buffer types used for each layer
|
||||
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
|
||||
buft_list_t cpu_buft_list;
|
||||
std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
|
||||
|
||||
struct layer_dev {
|
||||
ggml_backend_dev_t dev;
|
||||
buft_list_t * buft_list;
|
||||
};
|
||||
layer_dev dev_input = {};
|
||||
layer_dev dev_output = {};
|
||||
std::vector<layer_dev> dev_layer;
|
||||
|
||||
// contexts where the model tensors metadata is stored
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
|
||||
// the model memory buffers for the tensor data
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
// model memory mapped files
|
||||
llama_mmaps mappings;
|
||||
|
||||
// objects representing data potentially being locked in memory
|
||||
llama_mlocks mlock_bufs;
|
||||
llama_mlocks mlock_mmaps;
|
||||
|
||||
// for quantize-stats only
|
||||
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
||||
|
||||
int64_t t_load_us = 0;
|
||||
int64_t t_start_us = 0;
|
||||
|
||||
// total number of parameters in the model
|
||||
uint64_t n_elements = 0;
|
||||
|
||||
// total size of all the tensors in the model in bytes
|
||||
size_t n_bytes = 0;
|
||||
|
||||
// keep track of loaded lora adapters
|
||||
std::set<struct llama_lora_adapter *> lora_adapters;
|
||||
|
||||
~llama_model() {
|
||||
while (!lora_adapters.empty()) {
|
||||
llama_lora_adapter_free(*lora_adapters.begin());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<typename F>
|
||||
static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ ggml_tensor_overhead()*8,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_context_ptr ctx { ggml_init(params) };
|
||||
if (!ctx) {
|
||||
throw std::runtime_error(format("failed to create ggml context"));
|
||||
}
|
||||
|
||||
ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
|
||||
ggml_tensor * op_tensor = fn(ctx.get());
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (op_tensor->src[i] != nullptr) {
|
||||
assert(op_tensor->src[i]->buffer == nullptr);
|
||||
op_tensor->src[i]->buffer = buf.get();
|
||||
}
|
||||
}
|
||||
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
|
||||
|
||||
return op_supported;
|
||||
}
|
||||
|
||||
template<typename F>
|
||||
static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
|
||||
for (const auto & cur : buft_list) {
|
||||
ggml_backend_dev_t cur_dev = cur.first;
|
||||
ggml_backend_buffer_type_t cur_buft = cur.second;
|
||||
if (buft_supported(cur_buft, cur_dev, fn)) {
|
||||
return cur_buft;
|
||||
}
|
||||
}
|
||||
throw std::runtime_error(format("no suitable buffer type found"));
|
||||
}
|
||||
|
@ -1,5 +1,7 @@
|
||||
#include "llama-vocab.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include "unicode.h"
|
||||
|
||||
#include <algorithm>
|
||||
@ -16,22 +18,6 @@
|
||||
// helpers
|
||||
//
|
||||
|
||||
LLAMA_ATTRIBUTE_FORMAT(1, 2)
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
va_list ap2;
|
||||
va_start(ap, fmt);
|
||||
va_copy(ap2, ap);
|
||||
int size = vsnprintf(NULL, 0, fmt, ap);
|
||||
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
||||
std::vector<char> buf(size + 1);
|
||||
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
||||
GGML_ASSERT(size2 == size);
|
||||
va_end(ap2);
|
||||
va_end(ap);
|
||||
return std::string(buf.data(), size);
|
||||
}
|
||||
|
||||
struct naive_trie {
|
||||
naive_trie() : has_value(false), value(0) {
|
||||
}
|
||||
|
@ -8,6 +8,18 @@
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
||||
switch (type) {
|
||||
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
|
||||
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
||||
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
||||
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
|
||||
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
|
||||
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
struct llm_tokenizer;
|
||||
|
||||
struct llama_vocab {
|
||||
|
5373
src/llama.cpp
5373
src/llama.cpp
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user