llama : scatter llama.cpp into multiple modules (wip)

This commit is contained in:
Georgi Gerganov 2024-12-11 18:29:23 +02:00
parent 86bf31cfe6
commit f9b0e3b382
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
19 changed files with 5425 additions and 5375 deletions

View File

@ -9,9 +9,16 @@ llama_add_compile_flags()
add_library(llama
../include/llama.h
llama.cpp
llama-vocab.cpp
llama-arch.cpp
llama-batch.cpp
llama-context.cpp
llama-control-vector.cpp
llama-grammar.cpp
llama-kv-cache.cpp
llama-mmap.cpp
llama-model.cpp
llama-sampling.cpp
llama-vocab.cpp
unicode.h
unicode.cpp
unicode-data.cpp

1
src/llama-arch.cpp Normal file
View File

@ -0,0 +1 @@
#include "llama-arch.h"

1714
src/llama-arch.h Normal file

File diff suppressed because it is too large Load Diff

1
src/llama-batch.cpp Normal file
View File

@ -0,0 +1 @@
#include "llama-batch.h"

330
src/llama-batch.h Normal file
View File

@ -0,0 +1,330 @@
#pragma once
#include "llama.h"
#include <vector>
// very similar to llama_batch,
// but has more metadata about sequences
struct llama_ubatch {
bool equal_seqs;
// TODO: whole_seqs for embeddings?
uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
uint32_t n_seq_tokens; // tokens per sequence
uint32_t n_seqs;
llama_token * token; // [n_tokens]
float * embd; // [n_embd, n_tokens]
llama_pos * pos; // [n_tokens]
int32_t * n_seq_id; // [n_seqs]
llama_seq_id ** seq_id; // [n_seqs]
int8_t * output; // [n_tokens]
};
struct llama_sbatch_seq {
int32_t n_seq_id;
llama_seq_id * seq_id;
size_t offset;
size_t length;
};
// sequence-length-aware batch splitting
struct llama_sbatch {
// tokens left in this batch
size_t n_tokens;
size_t n_embd;
bool logits_all; // TODO: remove once lctx.logits_all is removed too
// sorted indices into the batch
std::vector<size_t> ids;
// batch indices of the output
std::vector<size_t> out_ids;
std::vector<llama_sbatch_seq> seq;
const llama_batch * batch = nullptr;
// buffers for the ubatch
std::vector<llama_token> ubatch_token;
std::vector<float> ubatch_embd;
std::vector<llama_pos> ubatch_pos;
std::vector<int32_t> ubatch_n_seq_id;
std::vector<llama_seq_id *> ubatch_seq_id;
std::vector<int8_t> ubatch_output;
llama_ubatch reserve_ubatch(size_t n_ubatch, bool has_embd = false) {
// clear empty sequences
// the previous ubatch is assumed to be gone,
// so nothing should refer to values in these sequences anymore.
for (size_t i = seq.size(); i-- > 0;) {
if (seq[i].length == 0) {
seq.pop_back();
} else {
break;
}
}
ubatch_token.resize(!has_embd ? n_ubatch : 0);
ubatch_embd.resize(has_embd ? n_embd * n_ubatch : 0);
ubatch_pos.resize(n_ubatch);
ubatch_n_seq_id.resize(n_ubatch);
ubatch_seq_id.resize(n_ubatch);
ubatch_output.resize(n_ubatch);
llama_ubatch ubatch = {
/*equal_seqs =*/ true,
/*n_tokens =*/ 0,
/*n_seq_tokens =*/ 0,
/*n_seqs =*/ 0,
/*token =*/ !has_embd ? ubatch_token.data() : nullptr,
/*embd =*/ has_embd ? ubatch_embd.data() : nullptr,
/*pos =*/ ubatch_pos.data(),
/*n_seq_id =*/ ubatch_n_seq_id.data(),
/*seq_id =*/ ubatch_seq_id.data(),
/*output =*/ ubatch_output.data(),
};
return ubatch;
}
void add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length) {
GGML_ASSERT(batch != nullptr);
GGML_ASSERT(length <= seq.length);
// Can only add sequences of equal lengths to a batch,
// otherwise it isn't clear to which sequence a token belongs
GGML_ASSERT(seq.n_seq_id == 0 || ubatch.n_seqs == 0 || length == (size_t) ubatch.n_tokens / ubatch.n_seqs);
GGML_ASSERT((seq.n_seq_id != 0) == ubatch.equal_seqs);
// NOTE: loops are separated for cache-friendliness
if (batch->token) {
if (ubatch.equal_seqs) {
for (size_t i = 0; i < length; ++i) {
ubatch.token[ubatch.n_tokens + i] = batch->token[ids[seq.offset + i]];
}
} else {
// simple split
ubatch.token = batch->token + seq.offset;
}
} else {
ubatch.token = nullptr;
}
if (batch->embd) {
if (ubatch.equal_seqs) {
for (size_t i = 0; i < length; ++i) {
memcpy(
ubatch.embd + n_embd * (ubatch.n_tokens + i),
batch->embd + n_embd * ids[seq.offset + i],
n_embd * sizeof(float)
);
}
} else {
// simple split
ubatch.embd = batch->embd + (n_embd * seq.offset);
}
} else {
ubatch.embd = nullptr;
}
if (ubatch.equal_seqs) {
for (size_t i = 0; i < length; ++i) {
ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
}
} else {
// simple split
ubatch.pos = batch->pos + seq.offset;
}
if (ubatch.equal_seqs) {
ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id;
if (seq.seq_id) {
ubatch.seq_id[ubatch.n_seqs] = seq.seq_id;
}
} else {
// simple split
if (batch->n_seq_id) {
ubatch.n_seq_id = batch->n_seq_id + seq.offset;
} else {
for (size_t i = 0; i < length; ++i) {
ubatch.n_seq_id[ubatch.n_seqs + i] = 1;
}
}
if (batch->seq_id) {
ubatch.seq_id = batch->seq_id + seq.offset;
}
}
if (logits_all) {
for (size_t i = 0; i < length; ++i) {
ubatch.output[ubatch.n_tokens + i] = 1;
out_ids.push_back(ids[seq.offset + i]);
}
} else if (batch->logits) {
if (ubatch.equal_seqs) {
for (size_t i = 0; i < length; ++i) {
size_t id = ids[seq.offset + i];
int8_t is_output = batch->logits[id];
ubatch.output[ubatch.n_tokens + i] = is_output;
if (is_output) { out_ids.push_back(id); }
}
} else {
// simple split
ubatch.output = batch->logits + seq.offset;
for (size_t i = 0; i < length; ++i) {
if (ubatch.output[i] != 0) { out_ids.push_back(seq.offset + i); }
}
}
} else {
// only get last output
for (size_t i = 0; i < length; ++i) {
size_t id = ids[seq.offset + i];
int8_t is_last = id == ids.size() - 1;
ubatch.output[ubatch.n_tokens + i] = is_last;
if (is_last) { out_ids.push_back(id); }
}
}
if (ubatch.n_tokens == 0 && ubatch.n_seqs == 0) {
ubatch.n_seq_tokens = ubatch.equal_seqs ? length : 1;
}
ubatch.n_tokens += length;
ubatch.n_seqs += ubatch.equal_seqs ? 1 : length; // virtual sequences for simple splits
seq.offset += length;
seq.length -= length;
n_tokens -= length;
GGML_ASSERT(ubatch.n_tokens == ubatch.n_seq_tokens * ubatch.n_seqs);
}
// simple split, unknown number of sequences of unequal lengths
llama_ubatch split_simple(size_t n_ubatch) {
n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
ubatch.equal_seqs = false;
if (!seq.empty()) {
llama_sbatch_seq & s = seq[0];
size_t length = s.length < n_ubatch ? s.length : n_ubatch;
GGML_ASSERT(seq.size() == 1 && s.n_seq_id == 0); // don't mix with other splits
add_seq_to_ubatch(ubatch, s, length);
}
return ubatch;
}
// make batches of equal-length sequences
llama_ubatch split_equal(size_t n_ubatch) {
n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
if (!seq.empty()) {
size_t length = 0;
size_t n_tokens_in_ubatch = 0;
GGML_ASSERT(seq[0].n_seq_id > 0); // should not be mixed with simple splits
// smallest first, because it's easier to split this way;
// starting from the end to pop in constant time.
for (size_t i = seq.size(); i-- > 0;) {
llama_sbatch_seq & s = seq[i];
GGML_ASSERT(s.length > 0);
if (length == 0) {
length = s.length < n_ubatch ? s.length : n_ubatch;
}
add_seq_to_ubatch(ubatch, s, length);
n_tokens_in_ubatch += length;
// shared prompts can't be mixed with any of their sequences,
// so it's safer to compute them in their own ubatch
if (s.n_seq_id > 1) { break; }
// stop when there isn't enough space for another sequence
if (length + n_tokens_in_ubatch > n_ubatch) { break; }
}
}
return ubatch;
}
// sequence-wise split
llama_ubatch split_seq(size_t n_ubatch) {
n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
if (!seq.empty()) {
llama_sbatch_seq & s = seq[seq.size() - 1];
size_t length = s.length < n_ubatch ? s.length : n_ubatch;
GGML_ASSERT(s.n_seq_id > 0); // should not be mixed with simple splits
add_seq_to_ubatch(ubatch, s, length);
}
return ubatch;
}
void from_batch(const llama_batch & batch, const size_t n_embd, const bool simple_split = false, const bool logits_all = false) {
GGML_ASSERT(batch.n_tokens >= 0);
this->batch = &batch;
this->n_embd = n_embd;
this->logits_all = logits_all;
n_tokens = batch.n_tokens;
ids.resize(n_tokens);
out_ids.clear();
// TODO: reserve out_ids and seq
for (size_t i = 0; i < n_tokens; ++i) {
ids[i] = i;
}
if (simple_split) {
seq.resize(1);
llama_sbatch_seq & s = seq[0];
s.n_seq_id = 0;
s.seq_id = nullptr;
s.offset = 0;
s.length = n_tokens;
return;
}
std::sort(ids.begin(), ids.end(),
[&batch](size_t a, size_t b) {
int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
int32_t n_seq_b = batch.n_seq_id ? batch.n_seq_id[b] : 1;
// sort by seq_id, then by pos
if (n_seq_a == n_seq_b) {
if (batch.seq_id) {
for (int32_t i = 0; i < n_seq_a; ++i) {
llama_seq_id seq_id_a = batch.seq_id[a][i];
llama_seq_id seq_id_b = batch.seq_id[b][i];
// smaller seq_ids go first
if (seq_id_a != seq_id_b) {
return seq_id_a < seq_id_b;
}
}
}
// when all else is equal, sort by pos
if (batch.pos) {
return batch.pos[a] < batch.pos[b];
}
// no pos, sort by id
return a < b;
}
// shared prompts go first
return n_seq_a > n_seq_b;
}
);
// init seq
llama_sbatch_seq * last_seq = nullptr;
for (size_t i = 0; i < n_tokens; ++i) {
const size_t bi = ids[i];
const int32_t n_seqs = batch.n_seq_id[bi];
llama_seq_id * seq_ids = batch.seq_id[bi];
if (last_seq != nullptr) {
bool same = n_seqs == last_seq->n_seq_id;
for (int32_t j = 0; same && j < n_seqs; ++j) {
if (seq_ids[j] != last_seq->seq_id[j]) {
same = false;
}
}
if (same) {
last_seq->length += 1;
continue;
}
}
llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1};
seq.push_back(new_seq);
last_seq = &seq.back();
}
// keep shared prompts first at the end, then sort by length descending.
std::sort(seq.begin(), seq.end(),
[](llama_sbatch_seq & a, llama_sbatch_seq & b) {
if (a.n_seq_id == b.n_seq_id) {
return a.length > b.length;
}
return a.n_seq_id < b.n_seq_id;
}
);
}
};

970
src/llama-context.cpp Normal file
View File

@ -0,0 +1,970 @@
#include "llama-context.h"
// deprecated
size_t llama_get_state_size(struct llama_context * ctx) {
return llama_state_get_size(ctx);
}
// deprecated
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
return llama_state_get_data(ctx, dst, -1);
}
// deprecated
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
return llama_state_set_data(ctx, src, -1);
}
// deprecated
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
}
// deprecated
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
return llama_state_save_file(ctx, path_session, tokens, n_token_count);
}
// TODO: replace all non-fatal assertions with returned errors or exceptions
struct llama_data_write {
virtual void write(const void * src, size_t size) = 0;
virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
virtual size_t get_size_written() = 0;
virtual ~llama_data_write() = default;
void write_string(const std::string & str) {
uint32_t str_size = str.size();
write(&str_size, sizeof(str_size));
write(str.data(), str_size);
}
void write_model_info(const struct llama_context * ctx) {
std::string arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
write_string(arch_str);
// TODO: add more model-specific info which should prevent loading the session file if not identical
}
//void write_rng(const std::mt19937 & rng) {
// std::ostringstream rng_ss;
// rng_ss << rng;
// const std::string & rng_str = rng_ss.str();
// write_string(rng_str);
//}
void write_output_ids(struct llama_context * ctx) {
llama_output_reorder(ctx);
const uint32_t n_outputs = ctx->n_outputs;
std::vector<int32_t> output_pos;
const size_t n_batch = ctx->cparams.n_batch;
const auto & output_ids = ctx->output_ids;
GGML_ASSERT(n_outputs <= ctx->output_size);
output_pos.resize(n_outputs);
// build a more compact representation of the output ids
for (size_t i = 0; i < n_batch; ++i) {
// map an output id to a position in the batch
int32_t pos = output_ids[i];
if (pos >= 0) {
GGML_ASSERT((uint32_t) pos < n_outputs);
output_pos[pos] = i;
}
}
write(&n_outputs, sizeof(n_outputs));
if (n_outputs) {
write(output_pos.data(), n_outputs * sizeof(int32_t));
}
}
void write_logits(const struct llama_context * ctx) {
const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
write(&logits_size, sizeof(logits_size));
if (logits_size) {
write(ctx->logits, logits_size * sizeof(float));
}
}
void write_embeddings(const struct llama_context * ctx) {
const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd);
write(&embeddings_size, sizeof(embeddings_size));
if (embeddings_size) {
write(ctx->embd, embeddings_size * sizeof(float));
}
}
void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) {
for (const auto & range : cell_ranges) {
for (uint32_t i = range.first; i < range.second; ++i) {
const auto & cell = kv_self.cells[i];
const llama_pos pos = cell.pos;
const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
write(&pos, sizeof(pos));
write(&n_seq_id, sizeof(n_seq_id));
if (n_seq_id) {
for (auto seq_id : cell.seq_id) {
write(&seq_id, sizeof(seq_id));
}
}
}
}
}
void write_kv_cache_data(const struct llama_context * ctx, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) {
const struct llama_kv_cache & kv_self = ctx->kv_self;
const struct llama_hparams & hparams = ctx->model.hparams;
const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
const uint32_t n_layer = hparams.n_layer;
write(&v_trans, sizeof(v_trans));
write(&n_layer, sizeof(n_layer));
std::vector<uint8_t> tmp_buf;
// Iterate and write all the keys first, each row is a cell
// Get whole range at a time
for (uint32_t il = 0; il < n_layer; ++il) {
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
// Write key type
const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
write(&k_type_i, sizeof(k_type_i));
// Write row size of key
const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
write(&k_size_row, sizeof(k_size_row));
// Read each range of cells of k_size length each into tmp_buf and write out
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * k_size_row;
write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
}
}
if (!kv_self.v_trans) {
for (uint32_t il = 0; il < n_layer; ++il) {
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
// Write value type
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
write(&v_type_i, sizeof(v_type_i));
// Write row size of value
const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
write(&v_size_row, sizeof(v_size_row));
// Read each range of cells of v_size length each into tmp_buf and write out
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * v_size_row;
write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
}
}
} else {
// When v is transposed, we also need the element size and get the element ranges from each row
const uint32_t kv_size = kv_self.size;
for (uint32_t il = 0; il < n_layer; ++il) {
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
// Write value type
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
write(&v_type_i, sizeof(v_type_i));
// Write element size
const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
write(&v_size_el, sizeof(v_size_el));
// Write GQA embedding size
write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
// For each row, we get the element values of each cell
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
// Read each range of cells of v_size_el length each into tmp_buf and write out
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
const size_t buf_size = range_size * v_size_el;
write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
}
}
}
}
}
void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) {
const struct llama_kv_cache & kv_self = ctx->kv_self;
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
uint32_t cell_count = 0;
// Count the number of cells with the specified seq_id
// Find all the ranges of cells with this seq id (or all, when -1)
uint32_t cell_range_begin = kv_self.size;
for (uint32_t i = 0; i < kv_self.size; ++i) {
const auto & cell = kv_self.cells[i];
if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
++cell_count;
if (cell_range_begin == kv_self.size) {
cell_range_begin = i;
}
} else {
if (cell_range_begin != kv_self.size) {
cell_ranges.emplace_back(cell_range_begin, i);
cell_range_begin = kv_self.size;
}
}
}
if (cell_range_begin != kv_self.size) {
cell_ranges.emplace_back(cell_range_begin, kv_self.size);
}
// DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
uint32_t cell_count_check = 0;
for (const auto & range : cell_ranges) {
cell_count_check += range.second - range.first;
}
GGML_ASSERT(cell_count == cell_count_check);
write(&cell_count, sizeof(cell_count));
write_kv_cache_meta(kv_self, cell_ranges, seq_id);
write_kv_cache_data(ctx, cell_ranges);
}
};
struct llama_data_read {
virtual const uint8_t * read(size_t size) = 0;
virtual void read_to(void * dst, size_t size) = 0;
virtual size_t get_size_read() = 0;
virtual ~llama_data_read() = default;
void read_string(std::string & str) {
uint32_t str_size;
read_to(&str_size, sizeof(str_size));
str.assign((const char *) read(str_size), str_size);
}
// validate model information
void read_model_info(const struct llama_context * ctx) {
std::string cur_arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
std::string arch_str;
read_string(arch_str);
if (cur_arch_str != arch_str) {
throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str()));
}
// TODO: add more info which needs to be identical but which is not verified otherwise
}
//void read_rng(std::mt19937 & rng) {
// std::string rng_str;
// read_string(rng_str);
// std::istringstream rng_ss(rng_str);
// rng_ss >> rng;
// if (rng_ss.fail()) {
// throw std::runtime_error("failed to load RNG state");
// }
//}
void read_output_ids(struct llama_context * ctx) {
std::vector<int32_t> output_pos;
uint32_t n_outputs;
read_to(&n_outputs, sizeof(n_outputs));
if (n_outputs > llama_output_reserve(*ctx, n_outputs)) {
throw std::runtime_error("could not reserve outputs");
}
if (n_outputs) {
output_pos.resize(n_outputs);
read_to(output_pos.data(), n_outputs * sizeof(int32_t));
for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
int32_t id = output_pos[i];
if ((uint32_t) id >= ctx->cparams.n_batch) {
throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch));
}
ctx->output_ids[id] = i;
}
ctx->n_outputs = n_outputs;
}
}
void read_logits(struct llama_context * ctx) {
uint64_t logits_size;
read_to(&logits_size, sizeof(logits_size));
if (ctx->logits_size < logits_size) {
throw std::runtime_error("logits buffer too small");
}
if (logits_size) {
read_to(ctx->logits, logits_size * sizeof(float));
}
}
void read_embeddings(struct llama_context * ctx) {
uint64_t embeddings_size;
read_to(&embeddings_size, sizeof(embeddings_size));
if (ctx->embd_size < embeddings_size) {
throw std::runtime_error("embeddings buffer too small");
}
if (embeddings_size) {
read_to(ctx->embd, embeddings_size * sizeof(float));
}
}
bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) {
struct llama_kv_cache & kv_self = ctx->kv_self;
if (dest_seq_id != -1) {
// single sequence
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
batch.n_tokens = cell_count;
batch.n_seq_tokens = cell_count;
batch.n_seqs = 1;
for (uint32_t i = 0; i < cell_count; ++i) {
llama_pos pos;
uint32_t n_seq_id;
read_to(&pos, sizeof(pos));
read_to(&n_seq_id, sizeof(n_seq_id));
if (n_seq_id != 0) {
LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
return false;
}
batch.pos[i] = pos;
}
batch.n_seq_id[0] = 1;
batch.seq_id[0] = &dest_seq_id;
if (!llama_kv_cache_find_slot(kv_self, batch)) {
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
return false;
}
// DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
// Assume that this is one contiguous block of cells
GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
} else {
// whole KV cache restore
if (cell_count > kv_self.size) {
LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
return false;
}
llama_kv_cache_clear(kv_self);
for (uint32_t i = 0; i < cell_count; ++i) {
llama_kv_cell & cell = kv_self.cells[i];
llama_pos pos;
uint32_t n_seq_id;
read_to(&pos, sizeof(pos));
read_to(&n_seq_id, sizeof(n_seq_id));
cell.pos = pos;
for (uint32_t j = 0; j < n_seq_id; ++j) {
llama_seq_id seq_id;
read_to(&seq_id, sizeof(seq_id));
if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
return false;
}
cell.seq_id.insert(seq_id);
if (kv_self.recurrent) {
int32_t & tail = kv_self.cells[seq_id].tail;
if (tail != -1) {
LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
return false;
}
tail = i;
}
}
}
kv_self.head = 0;
kv_self.used = cell_count;
}
if (kv_self.recurrent) {
for (uint32_t i = 0; i < cell_count; ++i) {
uint32_t cell_id = kv_self.head + i;
// make sure the recurrent states will keep their restored state
kv_self.cells[cell_id].src = cell_id;
}
}
return true;
}
bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) {
const struct llama_hparams & hparams = ctx->model.hparams;
struct llama_kv_cache & kv_self = ctx->kv_self;
uint32_t v_trans;
uint32_t n_layer;
read_to(&v_trans, sizeof(v_trans));
read_to(&n_layer, sizeof(n_layer));
if (n_layer != hparams.n_layer) {
LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
return false;
}
if (cell_count > kv_self.size) {
LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size);
return false;
}
if (kv_self.v_trans != (bool) v_trans) {
LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
return false;
}
// For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
for (uint32_t il = 0; il < n_layer; ++il) {
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
// Read type of key
int32_t k_type_i_ref;
read_to(&k_type_i_ref, sizeof(k_type_i_ref));
const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
if (k_type_i != k_type_i_ref) {
LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
return false;
}
// Read row size of key
uint64_t k_size_row_ref;
read_to(&k_size_row_ref, sizeof(k_size_row_ref));
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
if (k_size_row != k_size_row_ref) {
LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
return false;
}
if (cell_count) {
// Read and set the keys for the whole cell range
ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row);
}
}
if (!kv_self.v_trans) {
for (uint32_t il = 0; il < n_layer; ++il) {
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
// Read type of value
int32_t v_type_i_ref;
read_to(&v_type_i_ref, sizeof(v_type_i_ref));
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
if (v_type_i != v_type_i_ref) {
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
return false;
}
// Read row size of value
uint64_t v_size_row_ref;
read_to(&v_size_row_ref, sizeof(v_size_row_ref));
const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
if (v_size_row != v_size_row_ref) {
LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
return false;
}
if (cell_count) {
// Read and set the values for the whole cell range
ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row);
}
}
} else {
// For each layer, read the values for each cell (transposed)
for (uint32_t il = 0; il < n_layer; ++il) {
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
// Read type of value
int32_t v_type_i_ref;
read_to(&v_type_i_ref, sizeof(v_type_i_ref));
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
if (v_type_i != v_type_i_ref) {
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
return false;
}
// Read element size of value
uint32_t v_size_el_ref;
read_to(&v_size_el_ref, sizeof(v_size_el_ref));
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
if (v_size_el != v_size_el_ref) {
LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
return false;
}
// Read GQA embedding size
uint32_t n_embd_v_gqa_ref;
read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
if (n_embd_v_gqa != n_embd_v_gqa_ref) {
LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
return false;
}
if (cell_count) {
// For each row in the transposed matrix, read the values for the whole cell range
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el;
ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
}
}
}
}
return true;
}
void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) {
uint32_t cell_count;
read_to(&cell_count, sizeof(cell_count));
bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count);
if (!res) {
if (seq_id == -1) {
llama_kv_cache_clear(ctx);
} else {
llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
}
throw std::runtime_error("failed to restore kv cache");
}
}
};
struct llama_data_write_dummy : llama_data_write {
size_t size_written = 0;
llama_data_write_dummy() {}
void write(const void * /* src */, size_t size) override {
size_written += size;
}
void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
size_written += size;
}
size_t get_size_written() override {
return size_written;
}
};
struct llama_data_write_buffer : llama_data_write {
uint8_t * ptr;
size_t buf_size = 0;
size_t size_written = 0;
llama_data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
void write(const void * src, size_t size) override {
if (size > buf_size) {
throw std::runtime_error("unexpectedly reached end of buffer");
}
memcpy(ptr, src, size);
ptr += size;
size_written += size;
buf_size -= size;
}
void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
if (size > buf_size) {
throw std::runtime_error("unexpectedly reached end of buffer");
}
ggml_backend_tensor_get(tensor, ptr, offset, size);
ptr += size;
size_written += size;
buf_size -= size;
}
size_t get_size_written() override {
return size_written;
}
};
struct llama_data_read_buffer : llama_data_read {
const uint8_t * ptr;
size_t buf_size = 0;
size_t size_read = 0;
llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
const uint8_t * read(size_t size) override {
const uint8_t * base_ptr = ptr;
if (size > buf_size) {
throw std::runtime_error("unexpectedly reached end of buffer");
}
ptr += size;
size_read += size;
buf_size -= size;
return base_ptr;
}
void read_to(void * dst, size_t size) override {
memcpy(dst, read(size), size);
}
size_t get_size_read() override {
return size_read;
}
};
struct llama_data_write_file : llama_data_write {
llama_file * file;
size_t size_written = 0;
std::vector<uint8_t> temp_buffer;
llama_data_write_file(llama_file * f) : file(f) {}
void write(const void * src, size_t size) override {
file->write_raw(src, size);
size_written += size;
}
void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
temp_buffer.resize(size);
ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
write(temp_buffer.data(), temp_buffer.size());
}
size_t get_size_written() override {
return size_written;
}
};
struct llama_data_read_file : llama_data_read {
llama_file * file;
size_t size_read = 0;
std::vector<uint8_t> temp_buffer;
llama_data_read_file(llama_file * f) : file(f) {}
void read_to(void * dst, size_t size) override {
file->read_raw(dst, size);
size_read += size;
}
const uint8_t * read(size_t size) override {
temp_buffer.resize(size);
read_to(temp_buffer.data(), size);
return temp_buffer.data();
}
size_t get_size_read() override {
return size_read;
}
};
/** copy state data into either a buffer or file depending on the passed in context
*
* file context:
* llama_file file("/path", "wb");
* llama_data_write_file data_ctx(&file);
* llama_state_get_data_internal(ctx, data_ctx);
*
* buffer context:
* std::vector<uint8_t> buf(max_size, 0);
* llama_data_write_buffer data_ctx(buf.data(), max_size);
* llama_state_get_data_internal(ctx, data_ctx);
*
*/
static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) {
llama_synchronize(ctx);
data_ctx.write_model_info(ctx);
// copy outputs
data_ctx.write_output_ids(ctx);
data_ctx.write_logits(ctx);
data_ctx.write_embeddings(ctx);
data_ctx.write_kv_cache(ctx);
return data_ctx.get_size_written();
}
size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) {
llama_data_write_buffer data_ctx(dst, size);
try {
return llama_state_get_data_internal(ctx, data_ctx);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
return 0;
}
}
// Returns the *actual* size of the state.
// Intended to be used when saving to state to a buffer.
size_t llama_state_get_size(struct llama_context * ctx) {
llama_data_write_dummy data_ctx;
try {
return llama_state_get_data_internal(ctx, data_ctx);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
return 0;
}
}
static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) {
llama_synchronize(ctx);
data_ctx.read_model_info(ctx);
// set outputs
data_ctx.read_output_ids(ctx);
data_ctx.read_logits(ctx);
data_ctx.read_embeddings(ctx);
data_ctx.read_kv_cache(ctx);
return data_ctx.get_size_read();
}
// Sets the state reading from the specified source address
size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) {
llama_data_read_buffer data_ctx(src, size);
try {
return llama_state_set_data_internal(ctx, data_ctx);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
return 0;
}
}
static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
llama_file file(path_session, "rb");
// sanity checks
{
const uint32_t magic = file.read_u32();
const uint32_t version = file.read_u32();
if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
return false;
}
}
// load the prompt
{
const uint32_t n_token_count = file.read_u32();
if (n_token_count > n_token_capacity) {
LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
return false;
}
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
*n_token_count_out = n_token_count;
}
// restore the context state
{
const size_t n_state_size_cur = file.size - file.tell();
llama_data_read_file data_ctx(&file);
const size_t n_read = llama_state_set_data_internal(ctx, data_ctx);
if (n_read != n_state_size_cur) {
LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read);
return false;
}
}
return true;
}
bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
try {
return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what());
return false;
}
}
static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
llama_file file(path_session, "wb");
file.write_u32(LLAMA_SESSION_MAGIC);
file.write_u32(LLAMA_SESSION_VERSION);
// save the prompt
file.write_u32((uint32_t) n_token_count);
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
// save the context state using stream saving
llama_data_write_file data_ctx(&file);
llama_state_get_data_internal(ctx, data_ctx);
return true;
}
bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
try {
return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what());
return false;
}
}
static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) {
llama_synchronize(ctx);
data_ctx.write_kv_cache(ctx, seq_id);
return data_ctx.get_size_written();
}
size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) {
llama_data_write_dummy data_ctx;
return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
}
size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
llama_data_write_buffer data_ctx(dst, size);
try {
return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what());
return 0;
}
}
static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) {
llama_synchronize(ctx);
data_ctx.read_kv_cache(ctx, dest_seq_id);
return data_ctx.get_size_read();
}
size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id) {
llama_data_read_buffer data_ctx(src, size);
try {
return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what());
return 0;
}
}
static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
llama_file file(filepath, "wb");
file.write_u32(LLAMA_STATE_SEQ_MAGIC);
file.write_u32(LLAMA_STATE_SEQ_VERSION);
// save the prompt
file.write_u32((uint32_t) n_token_count);
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
// save the context state using stream saving
llama_data_write_file data_ctx(&file);
llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
const size_t res = file.tell();
GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
return res;
}
static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
llama_file file(filepath, "rb");
// version checks
{
const uint32_t magic = file.read_u32();
const uint32_t version = file.read_u32();
if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
return 0;
}
}
// load the prompt
{
const uint32_t n_token_count = file.read_u32();
if (n_token_count > n_token_capacity) {
LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
return 0;
}
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
*n_token_count_out = n_token_count;
}
// restore the context state
{
const size_t state_size = file.size - file.tell();
llama_data_read_file data_ctx(&file);
const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
if (!nread) {
LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
return 0;
}
GGML_ASSERT(nread <= state_size);
GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
}
return file.tell();
}
size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
try {
return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what());
return 0;
}
}
size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
try {
return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what());
return 0;
}
}

358
src/llama-context.h Normal file
View File

@ -0,0 +1,358 @@
#pragma once
#include "llama-impl.h"
#include "llama-batch.h"
#include "llama-model.h"
#include "llama-kv-cache.h"
#include "llama-control-vector.h"
#include "ggml-cpp.h"
#include <map>
#include <unordered_map>
#include <vector>
#include <set>
struct llama_cparams {
uint32_t n_ctx; // context size used during inference
uint32_t n_batch;
uint32_t n_ubatch;
uint32_t n_seq_max;
int n_threads; // number of threads to use for generation
int n_threads_batch; // number of threads to use for batch processing
float rope_freq_base;
float rope_freq_scale;
uint32_t n_ctx_orig_yarn;
// These hyperparameters are not exposed in GGUF, because all
// existing YaRN models use the same values for them.
float yarn_ext_factor;
float yarn_attn_factor;
float yarn_beta_fast;
float yarn_beta_slow;
float defrag_thold;
bool embeddings;
bool causal_attn;
bool offload_kqv;
bool flash_attn;
bool no_perf;
enum llama_pooling_type pooling_type;
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
};
struct llama_context {
llama_context(const llama_model & model)
: model(model)
, t_start_us(model.t_start_us)
, t_load_us(model.t_load_us) {}
const struct llama_model & model;
struct llama_cparams cparams;
struct llama_sbatch sbatch;
struct llama_kv_cache kv_self;
struct llama_control_vector cvec;
std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
std::vector<ggml_backend_ptr> backends;
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
ggml_backend_t backend_cpu = nullptr;
ggml_threadpool_t threadpool = nullptr;
ggml_threadpool_t threadpool_batch = nullptr;
bool has_evaluated_once = false;
mutable int64_t t_start_us;
mutable int64_t t_load_us;
mutable int64_t t_p_eval_us = 0;
mutable int64_t t_eval_us = 0;
mutable int64_t t_compute_start_us = 0;
mutable int64_t n_queued_tokens = 0;
mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
mutable int32_t n_eval = 0; // number of eval calls
// host buffer for the model output (logits and embeddings)
ggml_backend_buffer_ptr buf_output;
// decode output (2-dimensional array: [n_outputs][n_vocab])
size_t logits_size = 0; // capacity (of floats) for logits
float * logits = nullptr;
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
size_t output_size = 0; // capacity (of tokens positions) for the output buffers
int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
bool logits_all = false;
// embeddings output (2-dimensional array: [n_outputs][n_embd])
// populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
size_t embd_size = 0; // capacity (of floats) for embeddings
float * embd = nullptr;
// sequence embeddings output (map of [n_embd] vectors)
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
std::map<llama_seq_id, std::vector<float>> embd_seq;
// whether we are computing encoder output or decoder output
bool is_encoding = false;
// TODO: find a better way to accommodate mutli-dimension position encoding methods
// number of position id each token get, 1 for each token in most cases.
// when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
int n_pos_per_token = 1;
// output of the encoder part of the encoder-decoder models
std::vector<float> embd_enc;
std::vector<std::set<llama_seq_id>> seq_ids_enc;
// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
ggml_backend_sched_ptr sched;
ggml_abort_callback abort_callback = nullptr;
void * abort_callback_data = nullptr;
// input tensors
struct ggml_tensor * inp_tokens; // I32 [n_batch]
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
struct ggml_tensor * inp_pos; // I32 [n_batch]
struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
struct ggml_tensor * inp_K_shift; // I32 [kv_size]
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
struct ggml_tensor * inp_cls; // I32 [n_batch]
struct ggml_tensor * inp_s_copy; // I32 [kv_size]
struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
};
static bool llama_kv_cache_init(
struct llama_kv_cache & cache,
const llama_context * ctx,
ggml_type type_k,
ggml_type type_v,
uint32_t kv_size,
bool offload) {
const llama_model & model = ctx->model;
const llama_cparams & cparams = ctx->cparams;
const struct llama_hparams & hparams = model.hparams;
const int32_t n_layer = hparams.n_layer;
LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
cache.has_shift = false;
cache.recurrent = llama_model_is_recurrent(&model);
cache.v_trans = !cache.recurrent && !cparams.flash_attn;
cache.head = 0;
cache.size = kv_size;
cache.used = 0;
cache.type_k = type_k;
cache.type_v = type_v;
cache.cells.clear();
cache.cells.resize(kv_size);
// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
struct ggml_init_params params = {
/*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context * ctx = ggml_init(params);
if (!ctx) {
return nullptr;
}
ctx_map[buft] = ctx;
cache.ctxs.emplace_back(ctx);
return ctx;
}
return it->second;
};
cache.k_l.reserve(n_layer);
cache.v_l.reserve(n_layer);
for (int i = 0; i < n_layer; i++) {
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
ggml_backend_buffer_type_t buft;
if (offload) {
auto * dev = model.dev_layer.at(i).dev;
buft = ggml_backend_dev_buffer_type(dev);
} else {
buft = ggml_backend_cpu_buffer_type();
}
ggml_context * ctx = ctx_for_buft(buft);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
return false;
}
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
cache.v_l.push_back(v);
}
// allocate tensors and initialize the buffers to avoid NaNs in the padding
for (auto it : ctx_map) {
auto * buft = it.first;
auto * ctx = it.second;
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (!buf) {
LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
return false;
}
ggml_backend_buffer_clear(buf, 0);
LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
cache.bufs.emplace_back(buf);
}
return true;
}
static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
// the FA kernels require padding to avoid extra runtime boundary checks
return cparams.flash_attn ? 256u : 32u;
}
// Make sure enough space is available for outputs.
// Returns max number of outputs for which space was reserved.
static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto & cparams = lctx.cparams;
const auto & hparams = lctx.model.hparams;
const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
const auto n_batch = cparams.n_batch;
const auto n_vocab = hparams.n_vocab;
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
const bool has_logits = !cparams.embeddings;
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
if (lctx.output_ids.empty()) {
// init, never resized afterwards
lctx.output_ids.resize(n_batch);
}
const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0;
const size_t new_size = (logits_size + embd_size) * sizeof(float);
// alloc only when more than the current capacity is required
// TODO: also consider shrinking the buffer
if (!lctx.buf_output || prev_size < new_size) {
if (lctx.buf_output) {
#ifndef NDEBUG
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
lctx.buf_output = nullptr;
lctx.logits = nullptr;
lctx.embd = nullptr;
}
auto * buft = ggml_backend_cpu_buffer_type();
// try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
auto * output_dev = lctx.model.dev_output.dev;
auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
if (output_dev_host_buft) {
buft = output_dev_host_buft;
}
lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size));
if (lctx.buf_output == nullptr) {
LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
return 0;
}
}
float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get());
lctx.logits = has_logits ? output_base : nullptr;
lctx.embd = has_embd ? output_base + logits_size : nullptr;
lctx.output_size = n_outputs_max;
lctx.logits_size = logits_size;
lctx.embd_size = embd_size;
// set all ids as invalid (negative)
std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
ggml_backend_buffer_clear(lctx.buf_output.get(), 0);
lctx.n_outputs = 0;
return n_outputs_max;
}
// make the outputs have the same order they had in the user-provided batch
static void llama_output_reorder(struct llama_context * ctx) {
std::vector<size_t> & out_ids = ctx->sbatch.out_ids;
if (!out_ids.empty()) {
uint32_t n_vocab = ctx->model.hparams.n_vocab;
uint32_t n_embd = ctx->model.hparams.n_embd;
int32_t n_outputs = ctx->n_outputs;
GGML_ASSERT((size_t) n_outputs == out_ids.size());
// TODO: is there something more efficient which also minimizes swaps?
// selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
for (int32_t i = 0; i < n_outputs - 1; ++i) {
int32_t j_min = i;
for (int32_t j = i + 1; j < n_outputs; ++j) {
if (out_ids[j] < out_ids[j_min]) {
j_min = j;
}
}
if (j_min == i) { continue; }
std::swap(out_ids[i], out_ids[j_min]);
if (ctx->logits_size > 0) {
for (uint32_t k = 0; k < n_vocab; k++) {
std::swap(ctx->logits[i*n_vocab + k], ctx->logits[j_min*n_vocab + k]);
}
}
if (ctx->embd_size > 0) {
for (uint32_t k = 0; k < n_embd; k++) {
std::swap(ctx->embd[i*n_embd + k], ctx->embd[j_min*n_embd + k]);
}
}
}
std::fill(ctx->output_ids.begin(), ctx->output_ids.end(), -1);
for (int32_t i = 0; i < n_outputs; ++i) {
ctx->output_ids[out_ids[i]] = i;
}
out_ids.clear();
}
}

View File

@ -0,0 +1 @@
#include "llama-control-vector.h"

130
src/llama-control-vector.h Normal file
View File

@ -0,0 +1,130 @@
#pragma once
#include "llama-impl.h"
#include "ggml-cpp.h"
#include "llama-model.h" // TODO: need only hparams
#include <vector>
#include <map>
struct llama_control_vector {
std::vector<struct ggml_tensor *> tensors; // per layer
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
int32_t layer_start = -1;
int32_t layer_end = -1;
struct ggml_tensor * tensor_for(int il) const {
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
return nullptr;
}
return tensors[il];
}
struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
ggml_tensor * layer_dir = tensor_for(il);
if (layer_dir != nullptr) {
cur = ggml_add(ctx, cur, layer_dir);
}
return cur;
}
};
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
GGML_ASSERT(cvec.tensors.empty());
GGML_ASSERT(cvec.ctxs.empty());
GGML_ASSERT(cvec.bufs.empty());
// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
struct ggml_init_params params = {
/*.mem_size =*/ model.hparams.n_layer*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context * ctx = ggml_init(params);
if (!ctx) {
return nullptr;
}
ctx_map[buft] = ctx;
cvec.ctxs.emplace_back(ctx);
return ctx;
}
return it->second;
};
// make tensors
cvec.tensors.reserve(model.hparams.n_layer);
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
for (size_t il = 1; il < model.hparams.n_layer; il++) {
ggml_backend_buffer_type_t buft = select_buft(*model.dev_layer.at(il).buft_list,
[&](ggml_context * ctx) {
ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
return ggml_add(ctx, cur, layer_dir);
});
ggml_context * ctx = ctx_for_buft(buft);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
return false;
}
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
cvec.tensors.push_back(tensor);
}
// allocate tensors / buffers and zero
cvec.bufs.reserve(ctx_map.size());
for (auto it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx = it.second;
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (!buf) {
LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
return false;
}
ggml_backend_buffer_clear(buf, 0);
cvec.bufs.emplace_back(buf);
}
return true;
}
static int32_t llama_control_vector_apply(struct llama_control_vector & cvec, const llama_model & model, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
if (data == nullptr) {
// disable the current control vector (but leave allocated for later)
cvec.layer_start = -1;
cvec.layer_end = -1;
return 0;
}
if (n_embd != (int) model.hparams.n_embd) {
LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
return 1;
}
if (cvec.tensors.empty()) {
if (!llama_control_vector_init(cvec, model)) {
return 1;
}
}
cvec.layer_start = il_start;
cvec.layer_end = il_end;
for (size_t il = 1; il < model.hparams.n_layer; il++) {
assert(cvec.tensors[il] != nullptr);
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
if (off + n_embd <= len) {
ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
}
}
return 0;
}

View File

@ -24,6 +24,23 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3)
void llama_log_internal (ggml_log_level level, const char * format, ...);
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
// TODO: move to source
LLAMA_ATTRIBUTE_FORMAT(1, 2)
static std::string format(const char * fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)

2
src/llama-kv-cache.cpp Normal file
View File

@ -0,0 +1,2 @@
#include "llama-kv-cache.h"

625
src/llama-kv-cache.h Normal file
View File

@ -0,0 +1,625 @@
#pragma once
#include "llama-impl.h"
#include "llama-batch.h"
#include "llama-model.h"
#include "ggml-cpp.h"
#include <set>
#include <vector>
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
int32_t src = -1; // used by recurrent state models to copy states
int32_t tail = -1;
std::set<llama_seq_id> seq_id;
bool has_seq_id(const llama_seq_id & id) const {
return seq_id.find(id) != seq_id.end();
}
bool is_empty() const {
return seq_id.empty();
}
bool is_same_seq(const llama_kv_cell & other) const {
return seq_id == other.seq_id;
}
};
// ring-buffer of cached KV data
struct llama_kv_cache {
bool has_shift = false;
bool do_defrag = false;
bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
bool v_trans = true; // the value tensor is transposed
// Note: The value of head isn't only used to optimize searching
// for a free KV slot. llama_decode_internal also uses it, so it
// cannot be freely changed after a slot has been allocated.
uint32_t head = 0;
uint32_t size = 0;
uint32_t used = 0; // used cells (i.e. at least one seq_id)
// computed before each graph build
uint32_t n = 0;
ggml_type type_k = GGML_TYPE_F16;
ggml_type type_v = GGML_TYPE_F16;
std::vector<llama_kv_cell> cells;
std::vector<struct ggml_tensor *> k_l; // per layer
std::vector<struct ggml_tensor *> v_l;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
size_t total_size() {
size_t size = 0;
for (auto & buf : bufs) {
size += ggml_backend_buffer_get_size(buf.get());
}
return size;
}
};
//
// kv cache helpers
//
// a structure holds information about the slot found in llama_kv_cache_find_slot
struct llama_kv_cache_slot_info {
std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
bool found = false; // the slot was found
explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
operator bool() const { return found; }
};
static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};
// find an empty slot of size "n_tokens" in the cache
// updates the cache head
// returns a structure holding information about the slot found
// Note: On success, it's important that cache.head points
// to the first cell of the slot.
static struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
struct llama_kv_cache & cache,
const struct llama_ubatch & batch) {
const uint32_t n_tokens = batch.n_tokens;
const uint32_t n_seqs = batch.n_seqs;
const uint32_t n_seq_tokens = batch.n_seq_tokens;
if (cache.recurrent) {
// For recurrent state architectures (like Mamba or RWKV),
// each cache cell can store the state for a whole sequence.
// A slot should be always be contiguous.
// can only process batches with an equal number of new tokens in each sequence
GGML_ASSERT(batch.equal_seqs);
int32_t min = cache.size - 1;
int32_t max = 0;
// everything should fit if all seq_ids are smaller than the max
for (uint32_t s = 0; s < n_seqs; ++s) {
const uint32_t n_seq_id = batch.n_seq_id[s];
for (uint32_t j = 0; j < n_seq_id; ++j) {
const llama_seq_id seq_id = batch.seq_id[s][j];
if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
// too big seq_id
// TODO: would it be possible to resize the cache instead?
LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
return llama_kv_cache_slot_info_failed;
}
if (j > 0) {
llama_kv_cell & seq = cache.cells[seq_id];
if (seq.tail >= 0) {
llama_kv_cell & cell = cache.cells[seq.tail];
// clear cells from seq_ids that become shared
// (should not normally happen, but let's handle it anyway)
cell.seq_id.erase(seq_id);
seq.tail = -1;
if (cell.seq_id.empty()) {
cell.pos = -1;
cell.src = -1;
cache.used -= 1;
}
}
}
}
}
#ifndef NDEBUG
{
std::vector<int32_t> tails_verif;
tails_verif.assign(cache.size, -1);
for (uint32_t i = 0; i < cache.size; ++i) {
llama_kv_cell & cell = cache.cells[i];
for (llama_seq_id seq_id : cell.seq_id) {
if (tails_verif[seq_id] != -1) {
LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
}
tails_verif[seq_id] = i;
}
}
for (uint32_t i = 0; i < cache.size; ++i) {
if (tails_verif[i] != cache.cells[i].tail) {
LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cache.cells[i].tail, tails_verif[i]);
}
}
}
#endif
// find next empty cell
uint32_t next_empty_cell = cache.head;
for (uint32_t i = 0; i < cache.size; ++i) {
if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; }
llama_kv_cell & cell = cache.cells[next_empty_cell];
if (cell.is_empty()) { break; }
next_empty_cell += 1;
}
// find usable cell range
for (uint32_t s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = batch.seq_id[s][0];
llama_kv_cell & seq_meta = cache.cells[seq_id];
bool has_cell = false;
if (seq_meta.tail >= 0) {
llama_kv_cell & cell = cache.cells[seq_meta.tail];
GGML_ASSERT(cell.has_seq_id(seq_id));
// does this seq_id "own" the cell?
if (cell.seq_id.size() == 1) { has_cell = true; }
}
if (!has_cell) {
llama_kv_cell & empty_cell = cache.cells[next_empty_cell];
GGML_ASSERT(empty_cell.is_empty());
// copy old tail into the empty cell
if (seq_meta.tail >= 0) {
llama_kv_cell & orig_cell = cache.cells[seq_meta.tail];
empty_cell.pos = orig_cell.pos;
empty_cell.src = orig_cell.src;
orig_cell.seq_id.erase(seq_id);
empty_cell.seq_id.insert(seq_id); // will be overwritten
}
seq_meta.tail = next_empty_cell;
// find next empty cell
if (s + 1 < n_seqs) {
next_empty_cell += 1;
for (uint32_t i = 0; i < cache.size; ++i) {
if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; }
llama_kv_cell & cell = cache.cells[next_empty_cell];
if (cell.is_empty()) { break; }
next_empty_cell += 1;
}
}
}
if (min > seq_meta.tail) { min = seq_meta.tail; }
if (max < seq_meta.tail) { max = seq_meta.tail; }
}
// gather and re-order
for (uint32_t s = 0; s < n_seqs; ++s) {
int32_t dst_id = s + min;
int32_t src_id = cache.cells[batch.seq_id[s][0]].tail;
if (dst_id != src_id) {
llama_kv_cell & dst_cell = cache.cells[dst_id];
llama_kv_cell & src_cell = cache.cells[src_id];
std::swap(dst_cell.pos, src_cell.pos);
std::swap(dst_cell.src, src_cell.src);
std::swap(dst_cell.seq_id, src_cell.seq_id);
// swap tails (assuming they NEVER overlap)
for (const llama_seq_id seq_id : src_cell.seq_id) {
cache.cells[seq_id].tail = src_id;
}
for (const llama_seq_id seq_id : dst_cell.seq_id) {
cache.cells[seq_id].tail = dst_id;
}
}
}
// update the pos of the used seqs
for (uint32_t s = 0; s < n_seqs; ++s) {
const llama_pos last_pos = batch.pos[n_seq_tokens * s + n_seq_tokens - 1];
int32_t cell_id = s + min;
llama_kv_cell & cell = cache.cells[cell_id];
if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
// What should happen when the pos backtracks or skips a value?
// Clearing the state mid-batch would require special-casing which isn't done.
LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
__func__, last_pos, cell.pos, batch.seq_id[s][0], n_seq_tokens);
}
cell.pos = last_pos;
cell.seq_id.clear();
for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) {
const llama_seq_id seq_id = batch.seq_id[s][j];
cell.seq_id.insert(seq_id);
cache.cells[seq_id].tail = cell_id;
}
}
// allow getting the range of used cells, from head to head + n
cache.head = min;
cache.n = max - min + 1;
cache.used = std::count_if(cache.cells.begin(), cache.cells.end(),
[](const llama_kv_cell& cell){ return !cell.is_empty(); });
// sanity check
return llama_kv_cache_slot_info(cache.n >= n_seqs);
}
// otherwise, one cell per token.
if (n_tokens > cache.size) {
LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
return llama_kv_cache_slot_info_failed;
}
uint32_t n_tested = 0;
while (true) {
if (cache.head + n_tokens > cache.size) {
n_tested += cache.size - cache.head;
cache.head = 0;
continue;
}
bool found = true;
for (uint32_t i = 0; i < n_tokens; i++) {
if (cache.cells[cache.head + i].pos >= 0) {
found = false;
cache.head += i + 1;
n_tested += i + 1;
break;
}
}
if (found) {
break;
}
if (n_tested >= cache.size) {
//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
return llama_kv_cache_slot_info_failed;
}
}
for (uint32_t s = 0; s < n_seqs; s++) {
for (uint32_t i = 0; i < n_seq_tokens; ++i) {
uint32_t k = s*n_seq_tokens + i;
cache.cells[cache.head + k].pos = batch.pos[k];
for (int32_t j = 0; j < batch.n_seq_id[s]; j++) {
cache.cells[cache.head + k].seq_id.insert(batch.seq_id[s][j]);
}
}
}
cache.used += n_tokens;
return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens);
}
// find how many cells are currently in use
static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
for (uint32_t i = cache.size; i > 0; --i) {
const llama_kv_cell & cell = cache.cells[i - 1];
if (cell.pos >= 0 && !cell.is_empty()) {
return i;
}
}
return 0;
}
static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
cache.cells[i].pos = -1;
cache.cells[i].seq_id.clear();
cache.cells[i].src = -1;
cache.cells[i].tail = -1;
}
cache.head = 0;
cache.used = 0;
for (auto & buf : cache.bufs) {
ggml_backend_buffer_clear(buf.get(), 0);
}
}
static bool llama_kv_cache_seq_rm(
struct llama_kv_cache & cache,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1) {
uint32_t new_head = cache.size;
if (p0 < 0) p0 = 0;
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
// models like Mamba or RWKV can't have a state partially erased
if (cache.recurrent) {
if (seq_id >= (int64_t) cache.size) {
// could be fatal
return false;
}
if (0 <= seq_id) {
int32_t & tail_id = cache.cells[seq_id].tail;
if (tail_id >= 0) {
const llama_kv_cell & cell = cache.cells[tail_id];
// partial intersection is invalid
if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
return false;
}
// invalidate tails which will be cleared
if (p0 <= cell.pos && cell.pos < p1) {
tail_id = -1;
}
}
} else {
// seq_id is negative, then the range should include everything or nothing
if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
return false;
}
}
}
for (uint32_t i = 0; i < cache.size; ++i) {
if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
if (seq_id < 0) {
cache.cells[i].seq_id.clear();
} else if (cache.cells[i].has_seq_id(seq_id)) {
cache.cells[i].seq_id.erase(seq_id);
} else {
continue;
}
if (cache.cells[i].is_empty()) {
// keep count of the number of used cells
if (cache.cells[i].pos >= 0) cache.used--;
cache.cells[i].pos = -1;
cache.cells[i].src = -1;
if (new_head == cache.size) new_head = i;
}
}
}
// If we freed up a slot, set head to it so searching can start there.
if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
return true;
}
static void llama_kv_cache_seq_cp(
struct llama_kv_cache & cache,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1) {
if (p0 < 0) p0 = 0;
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
if (cache.recurrent) {
if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) {
llama_kv_cell & tail_src = cache.cells[seq_id_src];
llama_kv_cell & tail_dst = cache.cells[seq_id_dst];
if (tail_dst.tail >= 0) {
// clear destination seq_id if it wasn't empty
llama_kv_cell & cell_dst = cache.cells[tail_dst.tail];
cell_dst.seq_id.erase(seq_id_dst);
tail_dst.tail = -1;
if (cell_dst.seq_id.empty()) {
cell_dst.pos = -1;
cell_dst.delta = -1;
cell_dst.src = -1;
cache.used -= 1;
}
}
if (tail_src.tail >= 0) {
llama_kv_cell & cell_src = cache.cells[tail_src.tail];
cell_src.seq_id.insert(seq_id_dst);
tail_dst.tail = tail_src.tail;
}
}
return;
}
// otherwise, this is the KV cache of a Transformer-like model
cache.head = 0;
for (uint32_t i = 0; i < cache.size; ++i) {
if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
cache.cells[i].seq_id.insert(seq_id_dst);
}
}
}
static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
uint32_t new_head = cache.size;
for (uint32_t i = 0; i < cache.size; ++i) {
if (cache.recurrent && (llama_seq_id) i != seq_id) {
cache.cells[i].tail = -1;
}
if (!cache.cells[i].has_seq_id(seq_id)) {
if (cache.cells[i].pos >= 0) cache.used--;
cache.cells[i].pos = -1;
cache.cells[i].src = -1;
cache.cells[i].seq_id.clear();
if (new_head == cache.size) new_head = i;
} else {
cache.cells[i].seq_id.clear();
cache.cells[i].seq_id.insert(seq_id);
}
}
// If we freed up a slot, set head to it so searching can start there.
if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
}
static void llama_kv_cache_seq_add(
struct llama_kv_cache & cache,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta) {
uint32_t new_head = cache.size;
if (p0 < 0) p0 = 0;
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
// If there is no range then return early to avoid looping over the cache.
if (p0 == p1) return;
if (cache.recurrent) {
// for Mamba-like or RWKV models, only the pos needs to be shifted
if (0 <= seq_id && seq_id < (int64_t) cache.size) {
const int32_t tail_id = cache.cells[seq_id].tail;
if (tail_id >= 0) {
llama_kv_cell & cell = cache.cells[tail_id];
if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
cell.pos += delta;
}
}
}
return;
}
for (uint32_t i = 0; i < cache.size; ++i) {
if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
cache.has_shift = true;
cache.cells[i].pos += delta;
cache.cells[i].delta += delta;
if (cache.cells[i].pos < 0) {
if (!cache.cells[i].is_empty()) {
cache.used--;
}
cache.cells[i].pos = -1;
cache.cells[i].seq_id.clear();
if (new_head == cache.size) {
new_head = i;
}
}
}
}
// If we freed up a slot, set head to it so searching can start there.
// Otherwise we just start the next search from the beginning.
cache.head = new_head != cache.size ? new_head : 0;
}
static void llama_kv_cache_seq_div(
struct llama_kv_cache & cache,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
int d) {
if (p0 < 0) p0 = 0;
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
// If there is no range then return early to avoid looping over the cache.
if (p0 == p1) return;
if (cache.recurrent) {
// for Mamba-like or RWKV models, only the pos needs to be changed
if (0 <= seq_id && seq_id < (int64_t) cache.size) {
const int32_t tail_id = cache.cells[seq_id].tail;
if (tail_id >= 0) {
llama_kv_cell & cell = cache.cells[tail_id];
if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
cell.pos /= d;
}
}
}
return;
}
for (uint32_t i = 0; i < cache.size; ++i) {
if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
cache.has_shift = true;
{
llama_pos p_old = cache.cells[i].pos;
cache.cells[i].pos /= d;
cache.cells[i].delta += cache.cells[i].pos - p_old;
}
}
}
}
static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
llama_pos result = 0;
for (uint32_t i = 0; i < cache.size; ++i) {
if (cache.cells[i].has_seq_id(seq_id)) {
result = std::max(result, cache.cells[i].pos);
}
}
return result;
}
static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
if (!cache.recurrent) {
cache.do_defrag = true;
}
}
// saves the kv_cache state for future recovery.
// used to rollback llama_kv_cache_find_slot changes.
struct llama_kv_slot_restorer {
struct llama_kv_cache_state {
uint32_t head = 0;
uint32_t n = 0;
} old_state;
// for non-recurrent models only
// list of slots to restore
std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
bool do_restore = false;
explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
old_state.head = cache.head;
old_state.n = cache.n;
}
// saves a slot information for future restoration
void save(const struct llama_kv_cache_slot_info & slot) {
if (slot) {
do_restore = true;
if (slot.boundaries.first != slot.boundaries.second) {
slot_boundaries.push_back(slot.boundaries);
}
}
}
// must be explicitly called to restore the kv_cache state
// and rollback changes from all llama_kv_cache_find_slot calls
void restore(struct llama_kv_cache & cache) {
if (do_restore) {
cache.head = old_state.head;
cache.n = old_state.n;
if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
llama_kv_cache_seq_rm(cache, -1, -1, -1);
} else {
for (auto & slot : slot_boundaries) {
llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
}
}
}
}
};

1
src/llama-mmap.cpp Normal file
View File

@ -0,0 +1 @@
#include "llama-mmap.h"

587
src/llama-mmap.h Normal file
View File

@ -0,0 +1,587 @@
#pragma once
#include "llama-impl.h"
#include "ggml.h"
#include <cstdio>
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#include <fcntl.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
#endif
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif
struct llama_file {
#if defined(_WIN32)
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
HANDLE fp_win32;
size_t size;
private:
std::string GetErrorMessageWin32(DWORD error_code) const {
std::string ret;
LPSTR lpMsgBuf = NULL;
DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
if (!bufLen) {
ret = format("Win32 error code: %lx", error_code);
} else {
ret = lpMsgBuf;
LocalFree(lpMsgBuf);
}
return ret;
}
public:
llama_file(const char * fname, const char * mode) {
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}
fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
size_t tell() const {
// SetFilePointerEx returns the current position when seeking relative 0 bytes
LARGE_INTEGER li;
li.QuadPart = 0;
BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
if (!ret) {
throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
}
return li.QuadPart;
}
void seek(size_t offset, int whence) const {
// no need to convert SEEK_* to FILE_*. The enums are the same.
// Still, keep static asserts to avoid failures in the future.
static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
LARGE_INTEGER li;
li.QuadPart = offset;
BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
if (!ret) {
throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
}
}
void read_raw(void * ptr, size_t len) const {
// On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus
// use the Win32 API to do file io instead of the C/C++ library functions.
// There are conditions under which ReadFile cannot read chunks >64MB.
// Thus split the operation into smaller chunks if len exceeds this limit.
size_t bytes_read = 0;
while (bytes_read < len) {
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
DWORD chunk_read = 0;
BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
if (!result) {
throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
}
if (chunk_read < chunk_size || chunk_read == 0) {
throw std::runtime_error("unexpectedly reached end of file");
}
bytes_read += chunk_read;
} ;
}
uint32_t read_u32() const {
uint32_t val;
read_raw(&val, sizeof(val));
return val;
}
void write_raw(const void * ptr, size_t len) const {
// There are conditions under which WriteFile cannot write chunks >64MB.
// Thus split the operation into smaller chunks if len exceeds this limit.
size_t bytes_written = 0;
while (bytes_written < len) {
size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
DWORD chunk_written = 0;
BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
if (!result) {
throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
}
if (chunk_written < chunk_size || chunk_written == 0) {
throw std::runtime_error("unexpectedly failed to write bytes");
}
bytes_written += chunk_written;
}
}
void write_u32(std::uint32_t val) const {
write_raw(&val, sizeof(val));
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
#else
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) {
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
if (ret == -1) {
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
}
return (size_t) ret;
}
void seek(size_t offset, int whence) const {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
if (ret != 0) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
}
void read_raw(void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error("unexpectedly reached end of file");
}
}
uint32_t read_u32() const {
uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
void write_raw(const void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
size_t ret = std::fwrite(ptr, len, 1, fp);
if (ret != 1) {
throw std::runtime_error(format("write error: %s", strerror(errno)));
}
}
void write_u32(std::uint32_t val) const {
write_raw(&val, sizeof(val));
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
#endif
};
using llama_files = std::vector<std::unique_ptr<llama_file>>;
struct llama_mmap {
void * addr;
size_t size;
llama_mmap(const llama_mmap &) = delete;
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
// list of mapped fragments (first_offset, last_offset)
std::vector<std::pair<size_t, size_t>> mapped_fragments;
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
// prefetch/readahead impairs performance on NUMA systems
if (numa) { prefetch = 0; }
#ifdef __linux__
// advise the kernel to read the file sequentially (increases readahead)
if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
strerror(errno));
}
if (prefetch) { flags |= MAP_POPULATE; }
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) { // NOLINT
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
}
if (prefetch > 0) {
// advise the kernel to preload the mapped memory
if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
strerror(errno));
}
}
if (numa) {
// advise the kernel not to use readahead
// (because the next page might not belong on the same node)
if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
strerror(errno));
}
}
// initialize list of mapped_fragments
mapped_fragments.emplace_back(0, file->size);
}
static void align_range(size_t * first, size_t * last, size_t page_size) {
// align first to the next page
size_t offset_in_page = *first & (page_size - 1);
size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
*first += offset_to_page;
// align last to the previous page
*last = *last & ~(page_size - 1);
if (*last <= *first) {
*last = *first;
}
}
// partially unmap the file in the range [first, last)
void unmap_fragment(size_t first, size_t last) {
// note: this function must not be called multiple times with overlapping ranges
// otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings
int page_size = sysconf(_SC_PAGESIZE);
align_range(&first, &last, page_size);
size_t len = last - first;
if (len == 0) {
return;
}
GGML_ASSERT(first % page_size == 0);
GGML_ASSERT(last % page_size == 0);
GGML_ASSERT(last > first);
void * next_page_start = (uint8_t *) addr + first;
// unmap the range
if (munmap(next_page_start, len)) {
LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
}
// update the list of mapped fragments to avoid unmapping the same range again in the destructor
std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
for (const auto & frag : mapped_fragments) {
if (frag.first < first && frag.second > last) {
// the range is in the middle of the fragment, split it
new_mapped_fragments.emplace_back(frag.first, first);
new_mapped_fragments.emplace_back(last, frag.second);
} else if (frag.first < first && frag.second > first) {
// the range starts in the middle of the fragment
new_mapped_fragments.emplace_back(frag.first, first);
} else if (frag.first < last && frag.second > last) {
// the range ends in the middle of the fragment
new_mapped_fragments.emplace_back(last, frag.second);
} else if (frag.first >= first && frag.second <= last) {
// the range covers the entire fragment
} else {
// the range is outside the fragment
new_mapped_fragments.push_back(frag);
}
}
mapped_fragments = std::move(new_mapped_fragments);
}
~llama_mmap() {
for (const auto & frag : mapped_fragments) {
if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
}
}
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
GGML_UNUSED(numa);
size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
if (hMapping == NULL) {
DWORD error = GetLastError();
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
}
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
DWORD error = GetLastError();
CloseHandle(hMapping);
if (addr == NULL) {
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
}
if (prefetch > 0) {
#if _WIN32_WINNT >= 0x602
// PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
// may fail on pre-Windows 8 systems
pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
if (pPrefetchVirtualMemory) {
// advise the kernel to preload the mapped memory
WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr;
range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
throw std::runtime_error("PrefetchVirtualMemory unavailable");
#endif
}
}
void unmap_fragment(size_t first, size_t last) {
// not supported
GGML_UNUSED(first);
GGML_UNUSED(last);
}
~llama_mmap() {
if (!UnmapViewOfFile(addr)) {
LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) {
GGML_UNUSED(file);
GGML_UNUSED(prefetch);
GGML_UNUSED(numa);
throw std::runtime_error("mmap not supported");
}
void unmap_fragment(size_t first, size_t last) {
GGML_UNUSED(first);
GGML_UNUSED(last);
throw std::runtime_error("mmap not supported");
}
#endif
};
using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
struct llama_mlock {
void * addr = NULL;
size_t size = 0;
bool failed_already = false;
llama_mlock() {}
llama_mlock(const llama_mlock &) = delete;
~llama_mlock() {
if (size) {
raw_unlock(addr, size);
}
}
void init(void * ptr) {
GGML_ASSERT(addr == NULL && size == 0); // NOLINT
addr = ptr;
}
void grow_to(size_t target_size) {
GGML_ASSERT(addr);
if (failed_already) {
return;
}
size_t granularity = lock_granularity();
target_size = (target_size + granularity - 1) & ~(granularity - 1);
if (target_size > size) {
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
size = target_size;
} else {
failed_already = true;
}
}
}
#ifdef _POSIX_MEMLOCK_RANGE
static constexpr bool SUPPORTED = true;
static size_t lock_granularity() {
return (size_t) sysconf(_SC_PAGESIZE);
}
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) const {
if (!mlock(addr, size)) {
return true;
}
char* errmsg = std::strerror(errno);
bool suggest = (errno == ENOMEM);
// Check if the resource limit is fine after all
struct rlimit lock_limit;
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
suggest = false;
}
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
suggest = false;
}
LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
return false;
}
#undef MLOCK_SUGGESTION
static void raw_unlock(void * addr, size_t size) {
if (munlock(addr, size)) {
LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
}
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
static size_t lock_granularity() {
SYSTEM_INFO si;
GetSystemInfo(&si);
return (size_t) si.dwPageSize;
}
bool raw_lock(void * ptr, size_t len) const {
for (int tries = 1; ; tries++) {
if (VirtualLock(ptr, len)) {
return true;
}
if (tries == 2) {
LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
len, size, llama_format_win_err(GetLastError()).c_str());
return false;
}
// It failed but this was only the first try; increase the working
// set size and try again.
SIZE_T min_ws_size, max_ws_size;
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
return false;
}
// Per MSDN: "The maximum number of pages that a process can lock
// is equal to the number of pages in its minimum working set minus
// a small overhead."
// Hopefully a megabyte is enough overhead:
size_t increment = len + 1048576;
// The minimum must be <= the maximum, so we need to increase both:
min_ws_size += increment;
max_ws_size += increment;
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
return false;
}
}
}
static void raw_unlock(void * ptr, size_t len) {
if (!VirtualUnlock(ptr, len)) {
LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
static size_t lock_granularity() {
return (size_t) 65536;
}
bool raw_lock(const void * addr, size_t len) const {
LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
return false;
}
static void raw_unlock(const void * addr, size_t len) {}
#endif
};
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

1
src/llama-model.cpp Normal file
View File

@ -0,0 +1 @@
#include "llama-model.h"

650
src/llama-model.h Normal file
View File

@ -0,0 +1,650 @@
#pragma once
#include "llama.h"
#include "llama-arch.h"
#include "llama-vocab.h"
#include "llama-mmap.h"
#include "ggml-cpp.h"
#include <array>
#include <vector>
#include <cassert>
// bump if necessary
#define LLAMA_MAX_LAYERS 512
#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
// available llama models
enum e_model {
MODEL_UNKNOWN,
MODEL_14M,
MODEL_17M,
MODEL_22M,
MODEL_33M,
MODEL_60M,
MODEL_70M,
MODEL_80M,
MODEL_109M,
MODEL_137M,
MODEL_160M,
MODEL_220M,
MODEL_250M,
MODEL_270M,
MODEL_335M,
MODEL_410M,
MODEL_450M,
MODEL_770M,
MODEL_780M,
MODEL_0_5B,
MODEL_1B,
MODEL_1_3B,
MODEL_1_4B,
MODEL_1_5B,
MODEL_1_6B,
MODEL_2B,
MODEL_2_8B,
MODEL_3B,
MODEL_4B,
MODEL_6B,
MODEL_6_9B,
MODEL_7B,
MODEL_8B,
MODEL_9B,
MODEL_11B,
MODEL_12B,
MODEL_13B,
MODEL_14B,
MODEL_15B,
MODEL_16B,
MODEL_20B,
MODEL_30B,
MODEL_32B,
MODEL_34B,
MODEL_35B,
MODEL_40B,
MODEL_65B,
MODEL_70B,
MODEL_236B,
MODEL_314B,
MODEL_SMALL,
MODEL_MEDIUM,
MODEL_LARGE,
MODEL_XL,
MODEL_A1_7B,
MODEL_A2_7B,
MODEL_8x7B,
MODEL_8x22B,
MODEL_16x12B,
MODEL_10B_128x3_66B,
MODEL_57B_A14B,
MODEL_27B,
};
static const char * llama_model_type_name(e_model type) {
switch (type) {
case MODEL_14M: return "14M";
case MODEL_17M: return "17M";
case MODEL_22M: return "22M";
case MODEL_33M: return "33M";
case MODEL_60M: return "60M";
case MODEL_70M: return "70M";
case MODEL_80M: return "80M";
case MODEL_109M: return "109M";
case MODEL_137M: return "137M";
case MODEL_160M: return "160M";
case MODEL_220M: return "220M";
case MODEL_250M: return "250M";
case MODEL_270M: return "270M";
case MODEL_335M: return "335M";
case MODEL_410M: return "410M";
case MODEL_450M: return "450M";
case MODEL_770M: return "770M";
case MODEL_780M: return "780M";
case MODEL_0_5B: return "0.5B";
case MODEL_1B: return "1B";
case MODEL_1_3B: return "1.3B";
case MODEL_1_4B: return "1.4B";
case MODEL_1_5B: return "1.5B";
case MODEL_1_6B: return "1.6B";
case MODEL_2B: return "2B";
case MODEL_2_8B: return "2.8B";
case MODEL_3B: return "3B";
case MODEL_4B: return "4B";
case MODEL_6B: return "6B";
case MODEL_6_9B: return "6.9B";
case MODEL_7B: return "7B";
case MODEL_8B: return "8B";
case MODEL_9B: return "9B";
case MODEL_11B: return "11B";
case MODEL_12B: return "12B";
case MODEL_13B: return "13B";
case MODEL_14B: return "14B";
case MODEL_15B: return "15B";
case MODEL_16B: return "16B";
case MODEL_20B: return "20B";
case MODEL_30B: return "30B";
case MODEL_32B: return "32B";
case MODEL_34B: return "34B";
case MODEL_35B: return "35B";
case MODEL_40B: return "40B";
case MODEL_65B: return "65B";
case MODEL_70B: return "70B";
case MODEL_236B: return "236B";
case MODEL_314B: return "314B";
case MODEL_SMALL: return "0.1B";
case MODEL_MEDIUM: return "0.4B";
case MODEL_LARGE: return "0.8B";
case MODEL_XL: return "1.5B";
case MODEL_A1_7B: return "A1.7B";
case MODEL_A2_7B: return "A2.7B";
case MODEL_8x7B: return "8x7B";
case MODEL_8x22B: return "8x22B";
case MODEL_16x12B: return "16x12B";
case MODEL_10B_128x3_66B: return "10B+128x3.66B";
case MODEL_57B_A14B: return "57B.A14B";
case MODEL_27B: return "27B";
default: return "?B";
}
}
struct llama_hparams_posnet {
uint32_t n_embd;
uint32_t n_layer;
};
struct llama_hparams_convnext {
uint32_t n_embd;
uint32_t n_layer;
};
struct llama_hparams {
bool vocab_only;
bool rope_finetuned;
bool use_par_res;
bool swin_norm;
uint32_t n_vocab = 0;
uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_embd;
uint32_t n_embd_features = 0;
uint32_t n_layer;
uint32_t n_rot;
uint32_t n_swa = 0; // sliding window attention (SWA)
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_vocab_type = 0; // for BERT-style token types
uint32_t n_rel_attn_bkts = 0;
// for WavTokenizer
struct llama_hparams_posnet posnet;
struct llama_hparams_convnext convnext;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
uint32_t n_ff_exp = 0;
uint32_t n_ff_shexp = 0;
uint32_t n_expert_shared = 0;
float expert_weights_scale = 0.0;
float f_norm_eps;
float f_norm_rms_eps;
float f_norm_group_eps;
uint32_t n_norm_groups;
float f_attn_logit_softcapping = 50.0f;
float f_final_logit_softcapping = 30.0f;
// for RWKV
uint32_t rescale_every_n_layers = 0;
uint32_t time_mix_extra_dim = 0;
uint32_t time_decay_extra_dim = 0;
uint32_t wkv_head_size = 0;
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul;
int rope_sections[4];
// for State Space Models
uint32_t ssm_d_conv = 0;
uint32_t ssm_d_inner = 0;
uint32_t ssm_d_state = 0;
uint32_t ssm_dt_rank = 0;
bool ssm_dt_b_c_rms = false;
float f_clamp_kqv = 0.0f;
float f_max_alibi_bias = 0.0f;
float f_logit_scale = 0.0f;
// Additional scale factors (Granite/Granite MoE)
float f_residual_scale = 0.0f;
float f_embedding_scale = 0.0f;
float f_attention_scale = 0.0f;
bool causal_attn = true;
bool use_alibi = false;
bool attn_soft_cap = false;
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
uint32_t n_head(uint32_t il = 0) const {
if (il < n_layer) {
return n_head_arr[il];
}
GGML_ABORT("fatal error");
}
uint32_t n_head_kv(uint32_t il = 0) const {
if (il < n_layer) {
return n_head_kv_arr[il];
}
GGML_ABORT("fatal error");
}
uint32_t n_ff(uint32_t il = 0) const {
if (il < n_layer) {
return n_ff_arr[il];
}
GGML_ABORT("fatal error");
}
uint32_t n_gqa(uint32_t il = 0) const {
const uint32_t n_head = this->n_head(il);
const uint32_t n_head_kv = this->n_head_kv(il);
if (n_head_kv == 0) {
return 0;
}
return n_head/n_head_kv;
}
uint32_t n_embd_k_gqa(uint32_t il = 0) const { // dimension of key embeddings across all k-v heads
const uint32_t n_head_kv = this->n_head_kv(il);
return n_embd_head_k * n_head_kv;
}
uint32_t n_embd_v_gqa(uint32_t il = 0) const { // dimension of value embeddings across all k-v heads
const uint32_t n_head_kv = this->n_head_kv(il);
return n_embd_head_v * n_head_kv;
}
uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings
// corresponds to Mamba's conv_states size or RWKV's token_shift states size
if (wkv_head_size != 0) {
// for RWKV models
return 2 * n_embd;
}
// TODO: maybe support other convolution strides than 1
// NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
}
uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
if (wkv_head_size != 0) {
// corresponds to RWKV's wkv_states size
return n_embd * wkv_head_size;
}
// corresponds to Mamba's ssm_states size
return ssm_d_state * ssm_d_inner;
}
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
struct llama_layer_posnet {
// resnet
struct ggml_tensor * norm1 = nullptr;
struct ggml_tensor * norm1_b = nullptr;
struct ggml_tensor * conv1 = nullptr;
struct ggml_tensor * conv1_b = nullptr;
struct ggml_tensor * norm2 = nullptr;
struct ggml_tensor * norm2_b = nullptr;
struct ggml_tensor * conv2 = nullptr;
struct ggml_tensor * conv2_b = nullptr;
// attention
struct ggml_tensor * attn_norm = nullptr;
struct ggml_tensor * attn_norm_b = nullptr;
struct ggml_tensor * attn_q = nullptr;
struct ggml_tensor * attn_q_b = nullptr;
struct ggml_tensor * attn_k = nullptr;
struct ggml_tensor * attn_k_b = nullptr;
struct ggml_tensor * attn_v = nullptr;
struct ggml_tensor * attn_v_b = nullptr;
struct ggml_tensor * attn_o = nullptr;
struct ggml_tensor * attn_o_b = nullptr;
// normalize
struct ggml_tensor * norm = nullptr;
struct ggml_tensor * norm_b = nullptr;
};
struct llama_layer_convnext {
struct ggml_tensor * dw = nullptr;
struct ggml_tensor * dw_b = nullptr;
struct ggml_tensor * norm = nullptr;
struct ggml_tensor * norm_b = nullptr;
struct ggml_tensor * pw1 = nullptr;
struct ggml_tensor * pw1_b = nullptr;
struct ggml_tensor * pw2 = nullptr;
struct ggml_tensor * pw2_b = nullptr;
struct ggml_tensor * gamma = nullptr;
};
struct llama_layer {
// normalization
struct ggml_tensor * attn_norm = nullptr;
struct ggml_tensor * attn_norm_b = nullptr;
struct ggml_tensor * attn_norm_2 = nullptr;
struct ggml_tensor * attn_norm_2_b = nullptr;
struct ggml_tensor * attn_q_norm = nullptr;
struct ggml_tensor * attn_q_norm_b = nullptr;
struct ggml_tensor * attn_k_norm = nullptr;
struct ggml_tensor * attn_k_norm_b = nullptr;
struct ggml_tensor * attn_out_norm = nullptr;
struct ggml_tensor * attn_out_norm_b = nullptr;
struct ggml_tensor * attn_q_a_norm = nullptr;
struct ggml_tensor * attn_kv_a_norm = nullptr;
struct ggml_tensor * attn_sub_norm = nullptr;
struct ggml_tensor * attn_post_norm = nullptr;
struct ggml_tensor * ffn_sub_norm = nullptr;
struct ggml_tensor * attn_norm_cross = nullptr;
struct ggml_tensor * attn_norm_enc = nullptr;
// attention
struct ggml_tensor * wq = nullptr;
struct ggml_tensor * wk = nullptr;
struct ggml_tensor * wv = nullptr;
struct ggml_tensor * wo = nullptr;
struct ggml_tensor * wqkv = nullptr;
struct ggml_tensor * wq_a = nullptr;
struct ggml_tensor * wq_b = nullptr;
struct ggml_tensor * wkv_a_mqa = nullptr;
struct ggml_tensor * wkv_b = nullptr;
struct ggml_tensor * wq_cross = nullptr;
struct ggml_tensor * wk_cross = nullptr;
struct ggml_tensor * wv_cross = nullptr;
struct ggml_tensor * wo_cross = nullptr;
struct ggml_tensor * wq_enc = nullptr;
struct ggml_tensor * wk_enc = nullptr;
struct ggml_tensor * wv_enc = nullptr;
struct ggml_tensor * wo_enc = nullptr;
// attention bias
struct ggml_tensor * bq = nullptr;
struct ggml_tensor * bk = nullptr;
struct ggml_tensor * bv = nullptr;
struct ggml_tensor * bo = nullptr;
struct ggml_tensor * bqkv = nullptr;
// relative position bias
struct ggml_tensor * attn_rel_b = nullptr;
struct ggml_tensor * attn_rel_b_enc = nullptr;
struct ggml_tensor * attn_rel_b_cross = nullptr;
// normalization
struct ggml_tensor * ffn_norm = nullptr;
struct ggml_tensor * ffn_norm_b = nullptr;
struct ggml_tensor * ffn_post_norm = nullptr;
struct ggml_tensor * layer_out_norm = nullptr;
struct ggml_tensor * layer_out_norm_b = nullptr;
struct ggml_tensor * ffn_norm_exps = nullptr;
struct ggml_tensor * ffn_norm_enc = nullptr;
// ff
struct ggml_tensor * ffn_gate = nullptr; // w1
struct ggml_tensor * ffn_down = nullptr; // w2
struct ggml_tensor * ffn_up = nullptr; // w3
struct ggml_tensor * ffn_gate_enc = nullptr;
struct ggml_tensor * ffn_down_enc = nullptr;
struct ggml_tensor * ffn_up_enc = nullptr;
// ff MoE
struct ggml_tensor * ffn_gate_inp = nullptr;
struct ggml_tensor * ffn_gate_exps = nullptr;
struct ggml_tensor * ffn_down_exps = nullptr;
struct ggml_tensor * ffn_up_exps = nullptr;
// ff shared expert (shexp)
struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
struct ggml_tensor * ffn_gate_shexp = nullptr;
struct ggml_tensor * ffn_down_shexp = nullptr;
struct ggml_tensor * ffn_up_shexp = nullptr;
// ff bias
struct ggml_tensor * ffn_gate_b = nullptr;
struct ggml_tensor * ffn_down_b = nullptr; // b2
struct ggml_tensor * ffn_up_b = nullptr; // b3
struct ggml_tensor * ffn_act = nullptr;
// mamba proj
struct ggml_tensor * ssm_in = nullptr;
struct ggml_tensor * ssm_x = nullptr;
struct ggml_tensor * ssm_dt = nullptr;
struct ggml_tensor * ssm_out = nullptr;
// mamba
struct ggml_tensor * ssm_conv1d = nullptr;
struct ggml_tensor * ssm_a = nullptr;
struct ggml_tensor * ssm_d = nullptr;
// mamba bias
struct ggml_tensor * ssm_conv1d_b = nullptr;
struct ggml_tensor * ssm_dt_b = nullptr;
// rwkv
struct ggml_tensor * time_mix_w1 = nullptr;
struct ggml_tensor * time_mix_w2 = nullptr;
struct ggml_tensor * time_mix_lerp_x = nullptr;
struct ggml_tensor * time_mix_lerp_w = nullptr;
struct ggml_tensor * time_mix_lerp_k = nullptr;
struct ggml_tensor * time_mix_lerp_v = nullptr;
struct ggml_tensor * time_mix_lerp_r = nullptr;
struct ggml_tensor * time_mix_lerp_g = nullptr;
struct ggml_tensor * time_mix_first = nullptr;
struct ggml_tensor * time_mix_decay = nullptr;
struct ggml_tensor * time_mix_decay_w1 = nullptr;
struct ggml_tensor * time_mix_decay_w2 = nullptr;
struct ggml_tensor * time_mix_key = nullptr;
struct ggml_tensor * time_mix_value = nullptr;
struct ggml_tensor * time_mix_receptance = nullptr;
struct ggml_tensor * time_mix_gate = nullptr;
struct ggml_tensor * time_mix_ln = nullptr;
struct ggml_tensor * time_mix_ln_b = nullptr;
struct ggml_tensor * time_mix_output = nullptr;
struct ggml_tensor * channel_mix_lerp_k = nullptr;
struct ggml_tensor * channel_mix_lerp_r = nullptr;
struct ggml_tensor * channel_mix_key = nullptr;
struct ggml_tensor * channel_mix_receptance = nullptr;
struct ggml_tensor * channel_mix_value = nullptr;
// long rope factors
struct ggml_tensor * rope_long = nullptr;
struct ggml_tensor * rope_short = nullptr;
struct ggml_tensor * rope_freqs = nullptr;
// bitnet scale
struct ggml_tensor * wq_scale = nullptr;
struct ggml_tensor * wk_scale = nullptr;
struct ggml_tensor * wv_scale = nullptr;
struct ggml_tensor * wo_scale = nullptr;
struct ggml_tensor * ffn_gate_scale = nullptr;
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
};
struct llama_model {
e_model type = MODEL_UNKNOWN;
llm_arch arch = LLM_ARCH_UNKNOWN;
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
std::string name = "n/a";
llama_hparams hparams = {};
llama_vocab vocab;
struct ggml_tensor * tok_embd = nullptr;
struct ggml_tensor * type_embd = nullptr;
struct ggml_tensor * pos_embd = nullptr;
struct ggml_tensor * tok_norm = nullptr;
struct ggml_tensor * tok_norm_b = nullptr;
struct ggml_tensor * output_norm = nullptr;
struct ggml_tensor * output_norm_b = nullptr;
struct ggml_tensor * output = nullptr;
struct ggml_tensor * output_b = nullptr;
struct ggml_tensor * output_norm_enc = nullptr;
// classifier
struct ggml_tensor * cls = nullptr;
struct ggml_tensor * cls_b = nullptr;
struct ggml_tensor * cls_out = nullptr;
struct ggml_tensor * cls_out_b = nullptr;
struct ggml_tensor * conv1d = nullptr;
struct ggml_tensor * conv1d_b = nullptr;
std::vector<llama_layer> layers;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
llama_split_mode split_mode;
int main_gpu;
int n_gpu_layers;
std::vector<std::string> rpc_servers;
// list of devices used in this model
std::vector<ggml_backend_dev_t> devices;
// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
buft_list_t cpu_buft_list;
std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
struct layer_dev {
ggml_backend_dev_t dev;
buft_list_t * buft_list;
};
layer_dev dev_input = {};
layer_dev dev_output = {};
std::vector<layer_dev> dev_layer;
// contexts where the model tensors metadata is stored
std::vector<ggml_context_ptr> ctxs;
// the model memory buffers for the tensor data
std::vector<ggml_backend_buffer_ptr> bufs;
// model memory mapped files
llama_mmaps mappings;
// objects representing data potentially being locked in memory
llama_mlocks mlock_bufs;
llama_mlocks mlock_mmaps;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
// total number of parameters in the model
uint64_t n_elements = 0;
// total size of all the tensors in the model in bytes
size_t n_bytes = 0;
// keep track of loaded lora adapters
std::set<struct llama_lora_adapter *> lora_adapters;
~llama_model() {
while (!lora_adapters.empty()) {
llama_lora_adapter_free(*lora_adapters.begin());
}
}
};
template<typename F>
static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
ggml_init_params params = {
/*.mem_size =*/ ggml_tensor_overhead()*8,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context_ptr ctx { ggml_init(params) };
if (!ctx) {
throw std::runtime_error(format("failed to create ggml context"));
}
ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
ggml_tensor * op_tensor = fn(ctx.get());
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (op_tensor->src[i] != nullptr) {
assert(op_tensor->src[i]->buffer == nullptr);
op_tensor->src[i]->buffer = buf.get();
}
}
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
return op_supported;
}
template<typename F>
static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
for (const auto & cur : buft_list) {
ggml_backend_dev_t cur_dev = cur.first;
ggml_backend_buffer_type_t cur_buft = cur.second;
if (buft_supported(cur_buft, cur_dev, fn)) {
return cur_buft;
}
}
throw std::runtime_error(format("no suitable buffer type found"));
}

View File

@ -1,5 +1,7 @@
#include "llama-vocab.h"
#include "llama-impl.h"
#include "unicode.h"
#include <algorithm>
@ -16,22 +18,6 @@
// helpers
//
LLAMA_ATTRIBUTE_FORMAT(1, 2)
static std::string format(const char * fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
struct naive_trie {
naive_trie() : has_value(false), value(0) {
}

View File

@ -8,6 +8,18 @@
#include <map>
#include <set>
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
switch (type) {
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
default: return "unknown";
}
}
struct llm_tokenizer;
struct llama_vocab {

File diff suppressed because it is too large Load Diff