llama.cpp/common/sampling.h

#pragma once

#include "llama.h"

#include "grammar-parser.h"

#include <string>
#include <vector>
#include <unordered_map>

// Sampling parameters.
// Plain value type: every scalar member has an in-class default, and the
// string / map members start out empty.
struct llama_sampling_params {
    // --- token history / reporting ---
    int32_t n_prev  = 64; // how many previous tokens are remembered
    int32_t n_probs = 0;  // when > 0, output the probabilities of the top n_probs tokens

    // --- sampler settings ---
    int32_t top_k     = 40;    // <= 0 means "use the vocab size"
    float   top_p     = 0.95f; // 1.0 = disabled
    float   tfs_z     = 1.0f;  // 1.0 = disabled
    float   typical_p = 1.0f;  // 1.0 = disabled
    float   temp      = 0.8f;  // 1.0 = disabled

    // --- repetition penalties ---
    int32_t penalty_last_n  = 64;   // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float   penalty_repeat  = 1.1f; // 1.0 = disabled
    float   penalty_freq    = 0.0f; // 0.0 = disabled
    float   penalty_present = 0.0f; // 0.0 = disabled

    // --- mirostat ---
    int32_t mirostat     = 0;    // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   mirostat_tau = 5.0f; // target entropy
    float   mirostat_eta = 0.1f; // learning rate

    // consider newlines as a repeatable token
    bool penalize_nl = true;

    // optional BNF-like grammar to constrain sampling
    std::string grammar;

    // Classifier-Free Guidance (https://arxiv.org/abs/2306.17806)
    std::string cfg_negative_prompt; // string to help guidance
    float       cfg_scale = 1.0f;    // how strong is guidance

    // logit bias for specific tokens
    std::unordered_map<llama_token, float> logit_bias;
};

// General sampler context: the sampling parameters together with the mutable
// state that is carried from one sampling call to the next.
// TODO: move to llama.h
struct llama_sampling_context {
    // parameters that will be used for sampling
    llama_sampling_params params;

    // mirostat sampler state
    // Given an explicit default so that a default-initialized context never
    // exposes an indeterminate value.
    float mirostat_mu = 0.0f;

    // Grammar handle. Null by default so that a context which was never given
    // a grammar does not carry a garbage pointer.
    llama_grammar * grammar = nullptr;

    // internal
    grammar_parser::parse_state parsed_grammar;

    // TODO: replace with ring-buffer
    std::vector<llama_token>      prev; // previous tokens
    std::vector<llama_token_data> cur;  // presumably the current candidate tokens - TODO confirm
};

#include "common.h"

// Create a new sampling context instance from the given sampling parameters.
// NOTE(review): the returned pointer is presumably owned by the caller and
//               released with llama_sampling_free - confirm in the implementation.
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);

// Free a sampling context.
// NOTE(review): presumably expects a context obtained from llama_sampling_init - confirm.
void llama_sampling_free(struct llama_sampling_context * ctx);

// Reset a sampling context (e.g. when a sequence ends):
//   - the remembered previous tokens are cleared
//   - the grammar is reset
void llama_sampling_reset(struct llama_sampling_context * ctx);

// Copy the sampler context `src` into `dst`.
void llama_sampling_cp(
        struct llama_sampling_context * src,
        struct llama_sampling_context * dst);

// Return the most recently sampled token of the given sampling context.
llama_token llama_sampling_last(struct llama_sampling_context * ctx);

// Build a string representation of the last sampled tokens.
// NOTE(review): `n` is presumably the number of trailing tokens to include and
//               `ctx_main` the context used to turn tokens into text - confirm.
std::string llama_sampling_prev_str(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
        int n);

// Render the sampling parameters as a string.
std::string llama_sampling_print(const struct llama_sampling_params & params);

// Common sampling function used across the examples for convenience.
// It can serve as a starting point for implementing your own sampling function.
// Note: When using multiple sequences, it is the caller's responsibility to call
//       llama_sampling_reset when a sequence ends
//
// required:
//  - ctx_sampling: sampling-specific context
//  - ctx_main:     context to use for sampling
//
// optional:
//  - ctx_cfg:      context to use for classifier-free guidance
//                  (presumably nullptr when guidance is not used - TODO confirm)
//  - idx:          sample from llama_get_logits_ith(ctx, idx); defaults to 0
//
// returns:
//  - the sampled token
//
// NOTE(review): this comment previously also listed a "candidates" vector as a
//               result, but the signature has no such output. Presumably the
//               candidates are kept inside ctx_sampling - confirm in the implementation.
//
llama_token llama_sampling_sample(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
        struct llama_context * ctx_cfg,
        int idx = 0);

// Accept a token into the sampling context.
// NOTE(review): the parameter meanings below are inferred from their names -
//               confirm against the implementation:
//  - ctx_sampling:  sampling-specific context to update
//  - ctx_main:      main llama context
//  - id:            the token being accepted (presumably added to the previous-token history)
//  - apply_grammar: presumably whether the grammar state is advanced with this token
void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
        llama_token id,
        bool apply_grammar);
common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00			`#pragma once`

			`#include "llama.h"`

speculative : add tree-based sampling example (#3624) * sampling : one sequence per sampling context ggml-ci * speculative : add tree-based sampling support ggml-ci * speculative : reuse the n_parallel CLI param * speculative : refactor sampling * examples : fix build after sampling refactoring ggml-ci * batched : fix n_seq_id * sampling : fix malloc ggml-ci * swift : fix build ggml-ci * swift : try to fix build ggml-ci * prompts : add assistant.txt * common : add llama_batch_add() and llama_batch_clear() helpers * speculative : minor refactor ggml-ci * minor : comments + rename ggml-ci * speculative : fix off-by-one for n_drafted * speculative : fix the n_drafted fix + p constants 2023-10-18 13:21:57 +00:00			`#include "grammar-parser.h"`

common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00			`#include <string>`
			`#include <vector>`
			`#include <unordered_map>`

			`// sampling parameters`
			`typedef struct llama_sampling_params {`
sampling : rename penalty params + reduce size of "prev" vector 2023-10-20 14:47:13 +00:00			`int32_t n_prev = 64; // number of previous tokens to remember`
llama : combine repetition, frequency and presence penalties in 1 call 2023-10-20 14:05:46 +00:00			`int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.`
common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00			`int32_t top_k = 40; // <= 0 to use vocab size`
			`float top_p = 0.95f; // 1.0 = disabled`
			`float tfs_z = 1.00f; // 1.0 = disabled`
			`float typical_p = 1.00f; // 1.0 = disabled`
			`float temp = 0.80f; // 1.0 = disabled`
sampling : rename penalty params + reduce size of "prev" vector 2023-10-20 14:47:13 +00:00			`int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)`
			`float penalty_repeat = 1.10f; // 1.0 = disabled`
			`float penalty_freq = 0.00f; // 0.0 = disabled`
			`float penalty_present = 0.00f; // 0.0 = disabled`
common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00			`int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0`
			`float mirostat_tau = 5.00f; // target entropy`
			`float mirostat_eta = 0.10f; // learning rate`
			`bool penalize_nl = true; // consider newlines as a repeatable token`

llama : combine repetition, frequency and presence penalties in 1 call 2023-10-20 14:05:46 +00:00			`std::string grammar; // optional BNF-like grammar to constrain sampling`
common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00
			`// Classifier-Free Guidance`
			`// https://arxiv.org/abs/2306.17806`
sampling : add llama_sampling_print helper 2023-10-20 15:02:50 +00:00			`std::string cfg_negative_prompt; // string to help guidance`
			`float cfg_scale = 1.f; // how strong is guidance`
common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00
			`std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens`
			`} llama_sampling_params;`

			`// general sampler context`
speculative : add tree-based sampling example (#3624) * sampling : one sequence per sampling context ggml-ci * speculative : add tree-based sampling support ggml-ci * speculative : reuse the n_parallel CLI param * speculative : refactor sampling * examples : fix build after sampling refactoring ggml-ci * batched : fix n_seq_id * sampling : fix malloc ggml-ci * swift : fix build ggml-ci * swift : try to fix build ggml-ci * prompts : add assistant.txt * common : add llama_batch_add() and llama_batch_clear() helpers * speculative : minor refactor ggml-ci * minor : comments + rename ggml-ci * speculative : fix off-by-one for n_drafted * speculative : fix the n_drafted fix + p constants 2023-10-18 13:21:57 +00:00			`// TODO: move to llama.h`
			`struct llama_sampling_context {`
			`// parameters that will be used for sampling`
common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00			`llama_sampling_params params;`

speculative : add tree-based sampling example (#3624) * sampling : one sequence per sampling context ggml-ci * speculative : add tree-based sampling support ggml-ci * speculative : reuse the n_parallel CLI param * speculative : refactor sampling * examples : fix build after sampling refactoring ggml-ci * batched : fix n_seq_id * sampling : fix malloc ggml-ci * swift : fix build ggml-ci * swift : try to fix build ggml-ci * prompts : add assistant.txt * common : add llama_batch_add() and llama_batch_clear() helpers * speculative : minor refactor ggml-ci * minor : comments + rename ggml-ci * speculative : fix off-by-one for n_drafted * speculative : fix the n_drafted fix + p constants 2023-10-18 13:21:57 +00:00			`// mirostat sampler state`
			`float mirostat_mu;`
common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00
			`llama_grammar * grammar;`
speculative : add tree-based sampling example (#3624) * sampling : one sequence per sampling context ggml-ci * speculative : add tree-based sampling support ggml-ci * speculative : reuse the n_parallel CLI param * speculative : refactor sampling * examples : fix build after sampling refactoring ggml-ci * batched : fix n_seq_id * sampling : fix malloc ggml-ci * swift : fix build ggml-ci * swift : try to fix build ggml-ci * prompts : add assistant.txt * common : add llama_batch_add() and llama_batch_clear() helpers * speculative : minor refactor ggml-ci * minor : comments + rename ggml-ci * speculative : fix off-by-one for n_drafted * speculative : fix the n_drafted fix + p constants 2023-10-18 13:21:57 +00:00
			`// internal`
			`grammar_parser::parse_state parsed_grammar;`

			`// TODO: replace with ring-buffer`
			`std::vector<llama_token> prev;`
			`std::vector<llama_token_data> cur;`
			`};`
common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00
			`#include "common.h"`

			`// Create a new sampling context instance.`
sampling : refactor init to use llama_sampling_params 2023-10-20 11:58:20 +00:00			`struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);`
speculative : add tree-based sampling example (#3624) * sampling : one sequence per sampling context ggml-ci * speculative : add tree-based sampling support ggml-ci * speculative : reuse the n_parallel CLI param * speculative : refactor sampling * examples : fix build after sampling refactoring ggml-ci * batched : fix n_seq_id * sampling : fix malloc ggml-ci * swift : fix build ggml-ci * swift : try to fix build ggml-ci * prompts : add assistant.txt * common : add llama_batch_add() and llama_batch_clear() helpers * speculative : minor refactor ggml-ci * minor : comments + rename ggml-ci * speculative : fix off-by-one for n_drafted * speculative : fix the n_drafted fix + p constants 2023-10-18 13:21:57 +00:00
			`void llama_sampling_free(struct llama_sampling_context * ctx);`

			`// Reset the sampler context`
			`// - clear prev tokens`
			`// - reset grammar`
			`void llama_sampling_reset(llama_sampling_context * ctx);`

			`// Copy the sampler context`
			`void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);`
common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00
sampling : hide prev behind API and apply #3661 ggml-ci 2023-10-20 15:26:20 +00:00			`// Get the last sampled token`
			`llama_token llama_sampling_last(llama_sampling_context * ctx);`

			`// Get a string representation of the last sampled tokens`
			`std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);`

sampling : add llama_sampling_print helper 2023-10-20 15:02:50 +00:00			`// Print sampling parameters into a string`
			`std::string llama_sampling_print(const llama_sampling_params & params);`

common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00			`// this is a common sampling function used across the examples for convenience`
			`// it can serve as a starting point for implementing your own sampling function`
			`// Note: When using multiple sequences, it is the caller's responsibility to call`
speculative : add tree-based sampling example (#3624) * sampling : one sequence per sampling context ggml-ci * speculative : add tree-based sampling support ggml-ci * speculative : reuse the n_parallel CLI param * speculative : refactor sampling * examples : fix build after sampling refactoring ggml-ci * batched : fix n_seq_id * sampling : fix malloc ggml-ci * swift : fix build ggml-ci * swift : try to fix build ggml-ci * prompts : add assistant.txt * common : add llama_batch_add() and llama_batch_clear() helpers * speculative : minor refactor ggml-ci * minor : comments + rename ggml-ci * speculative : fix off-by-one for n_drafted * speculative : fix the n_drafted fix + p constants 2023-10-18 13:21:57 +00:00			`// llama_sampling_reset when a sequence ends`
common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00			`//`
			`// required:`
speculative : add tree-based sampling example (#3624) * sampling : one sequence per sampling context ggml-ci * speculative : add tree-based sampling support ggml-ci * speculative : reuse the n_parallel CLI param * speculative : refactor sampling * examples : fix build after sampling refactoring ggml-ci * batched : fix n_seq_id * sampling : fix malloc ggml-ci * swift : fix build ggml-ci * swift : try to fix build ggml-ci * prompts : add assistant.txt * common : add llama_batch_add() and llama_batch_clear() helpers * speculative : minor refactor ggml-ci * minor : comments + rename ggml-ci * speculative : fix off-by-one for n_drafted * speculative : fix the n_drafted fix + p constants 2023-10-18 13:21:57 +00:00			`// - ctx_main: context to use for sampling`
common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00			`// - ctx_sampling: sampling-specific context`
			`//`
			`// optional:`
speculative : add tree-based sampling example (#3624) * sampling : one sequence per sampling context ggml-ci * speculative : add tree-based sampling support ggml-ci * speculative : reuse the n_parallel CLI param * speculative : refactor sampling * examples : fix build after sampling refactoring ggml-ci * batched : fix n_seq_id * sampling : fix malloc ggml-ci * swift : fix build ggml-ci * swift : try to fix build ggml-ci * prompts : add assistant.txt * common : add llama_batch_add() and llama_batch_clear() helpers * speculative : minor refactor ggml-ci * minor : comments + rename ggml-ci * speculative : fix off-by-one for n_drafted * speculative : fix the n_drafted fix + p constants 2023-10-18 13:21:57 +00:00			`// - ctx_cfg: context to use for classifier-free guidance`
			`// - idx: sample from llama_get_logits_ith(ctx, idx)`
common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts 2023-10-11 19:35:46 +00:00			`//`
			`// returns:`
			`// - token: sampled token`
			`// - candidates: vector of candidate tokens`
			`//`
			`llama_token llama_sampling_sample(`
speculative : add tree-based sampling example (#3624) * sampling : one sequence per sampling context ggml-ci * speculative : add tree-based sampling support ggml-ci * speculative : reuse the n_parallel CLI param * speculative : refactor sampling * examples : fix build after sampling refactoring ggml-ci * batched : fix n_seq_id * sampling : fix malloc ggml-ci * swift : fix build ggml-ci * swift : try to fix build ggml-ci * prompts : add assistant.txt * common : add llama_batch_add() and llama_batch_clear() helpers * speculative : minor refactor ggml-ci * minor : comments + rename ggml-ci * speculative : fix off-by-one for n_drafted * speculative : fix the n_drafted fix + p constants 2023-10-18 13:21:57 +00:00			`struct llama_sampling_context * ctx_sampling,`
			`struct llama_context * ctx_main,`
			`struct llama_context * ctx_cfg,`
			`int idx = 0);`

			`void llama_sampling_accept(`
			`struct llama_sampling_context * ctx_sampling,`
			`struct llama_context * ctx_main,`
sampling : hide prev behind API and apply #3661 ggml-ci 2023-10-20 15:26:20 +00:00			`llama_token id,`
			`bool apply_grammar);`