mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-30 21:34:36 +00:00)
7eee341bee
common : use common_ prefix for common library functions

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
102 lines · 4.0 KiB · C++
#pragma once

#include "llama.h"

#include <unordered_map>
#include <string>
#include <vector>

#define LLAMA_NGRAM_MIN    1
#define LLAMA_NGRAM_MAX    4
#define LLAMA_NGRAM_STATIC 2

// Data structures to map n-grams to empirical token probabilities:

struct common_ngram {
    llama_token tokens[LLAMA_NGRAM_MAX];

    common_ngram() {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            tokens[i] = -1;
        }
    }

    common_ngram(const llama_token * input, const int ngram_size) {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            tokens[i] = i < ngram_size ? input[i] : -1;
        }
    }

    bool operator==(const common_ngram & other) const {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            if (tokens[i] != other.tokens[i]) {
                return false;
            }
        }
        return true;
    }
};
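
// Example (illustrative sketch, not part of the original header): constructing
// a 2-gram pads the unused trailing slots with -1, so comparisons over all
// LLAMA_NGRAM_MAX slots stay well defined. The token ids are hypothetical:
//
//   llama_token seq[2] = {15043, 3186};
//   common_ngram ng(seq, 2);   // ng.tokens == {15043, 3186, -1, -1}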

struct common_token_hash_function {
    size_t operator()(const llama_token token) const {
        // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
        return token * 11400714819323198485llu;
    }
};

struct common_ngram_hash_function {
    size_t operator()(const common_ngram & ngram) const {
        size_t hash = common_token_hash_function{}(ngram.tokens[0]);
        for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
            hash ^= common_token_hash_function{}(ngram.tokens[i]);
        }
        return hash;
    }
};

// token -> number of times token has been seen
typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;

// n-gram -> empirical distribution of following tokens
typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
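
// Example (illustrative sketch, not part of the original header): reading back
// the empirical counts of tokens observed after a given 2-gram. `cache`,
// `tok_a` and `tok_b` are hypothetical:
//
//   common_ngram_cache cache;
//   llama_token seq[2] = {tok_a, tok_b};
//   auto it = cache.find(common_ngram(seq, 2));
//   if (it != cache.end()) {
//       for (const auto & [token, count] : it->second) {
//           // `token` followed {tok_a, tok_b} exactly `count` times
//       }
//   }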


// Update an ngram cache with tokens.
// ngram_cache:         the cache to modify.
// ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data.
// inp_data:            the token sequence with which to update ngram_cache.
// nnew:                how many new tokens have been appended to inp_data since the last call to this function.
// print_progress:      whether to print progress to stderr.
//
// In order to get correct results inp_data can ONLY BE APPENDED TO.
// Changes in the middle need a complete rebuild.
void common_ngram_cache_update(
    common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
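
// Example (illustrative sketch): building a context cache once, then extending
// it incrementally as tokens are appended. `ctx_tokens` and `new_token` are
// hypothetical:
//
//   common_ngram_cache nc_context;
//   std::vector<llama_token> ctx_tokens = { /* prompt tokens */ };
//   common_ngram_cache_update(nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
//                             ctx_tokens, ctx_tokens.size(), false);
//
//   ctx_tokens.push_back(new_token);   // append-only, per the note above
//   common_ngram_cache_update(nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
//                             ctx_tokens, 1, false);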

// Try to draft tokens from ngram caches.
// inp:                 the tokens generated so far.
// draft:               the token sequence to draft. Expected to initially contain the previously sampled token.
// n_draft:             maximum number of tokens to add to draft.
// ngram_min/ngram_max: the min/max size of the ngrams in nc_context and nc_dynamic.
// nc_context:          ngram cache based on current context.
// nc_dynamic:          ngram cache based on previous user generations.
// nc_static:           ngram cache generated from a large text corpus, used for validation.
void common_ngram_cache_draft(
    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
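
// Example (illustrative sketch): drafting up to 8 tokens for speculative
// decoding. `draft` is seeded with the previously sampled token, as required
// above; the three caches are hypothetical:
//
//   std::vector<llama_token> draft = {last_sampled_token};
//   common_ngram_cache_draft(ctx_tokens, draft, 8, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
//                            nc_context, nc_dynamic, nc_static);
//   // draft now holds last_sampled_token plus up to 8 drafted continuations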

// Save an ngram cache to a file.
// ngram_cache: the ngram cache to save.
// filename:    the path under which to save the ngram cache.
void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);

// Load an ngram cache saved with common_ngram_cache_save.
// filename: the path from which to load the ngram cache.
// returns:  an ngram cache containing the information saved to filename.
common_ngram_cache common_ngram_cache_load(std::string & filename);
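
// Example (illustrative sketch): persisting a cache across runs. The file path
// is hypothetical:
//
//   std::string path = "ngram_cache.bin";
//   common_ngram_cache_save(nc_dynamic, path);
//   // ... in a later session:
//   common_ngram_cache nc_restored = common_ngram_cache_load(path);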

// Merge two ngram caches.
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
// ngram_cache_add:    the ngram cache to add to ngram_cache_target.
void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
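
// Example (illustrative sketch): folding the counts gathered during the latest
// generation into the persistent dynamic cache before saving it:
//
//   common_ngram_cache_merge(nc_dynamic, nc_context);
//   common_ngram_cache_save(nc_dynamic, path);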