Merge a0bd8f0343 into 1b28061400

llama : skip token bounds check when evaluating embeddings (#9437 )
py : support converting local models (#7547 )
2024-09-22 21:16:20 +00:00 · 2024-09-11 21:27:30 +02:00 · 2024-09-11 17:52:13 +02:00 · 2024-09-11 15:29:51 +03:00 · 2024-09-11 12:59:13 +02:00 · 2024-09-11 13:02:09 +03:00
56 changed files with 2843 additions and 2524 deletions
--- a/7
+++ b/7
@ -925,6 +925,7 @@ OBJ_LLAMA = \

 OBJ_COMMON = \
 	common/common.o \
+	common/arg.o \
 	common/console.o \
 	common/ngram-cache.o \
 	common/sampling.o \
@ -1157,6 +1158,11 @@ common/common.o: \
 	include/llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+common/arg.o: \
+	common/arg.cpp \
+	common/arg.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 common/sampling.o: \
 	common/sampling.cpp \
 	common/sampling.h \
@ -1448,7 +1454,6 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-	./llama-gen-docs

 libllava.a: examples/llava/llava.cpp \
 	examples/llava/llava.h \
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -54,6 +54,8 @@ add_library(${TARGET} STATIC
    base64.hpp
    common.h
    common.cpp
+    arg.h
+    arg.cpp
    sampling.h
    sampling.cpp
    console.h
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@ -0,0 +1,77 @@
+#pragma once
+
+#include "common.h"
+
+#include <set>
+#include <string>
+#include <vector>
+
+//
+// CLI argument parsing
+//
+
+struct llama_arg {
+    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::vector<const char *> args;
+    const char * value_hint   = nullptr; // help text or example for arg value
+    const char * value_hint_2 = nullptr; // for second arg value
+    const char * env          = nullptr;
+    std::string help;
+    bool is_sparam = false; // is current arg a sampling param?
+    void (*handler_void)   (gpt_params & params) = nullptr;
+    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
+    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
+    void (*handler_int)    (gpt_params & params, int) = nullptr;
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &)
+    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, int)
+    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const std::string & help,
+        void (*handler)(gpt_params & params)
+    ) : args(args), help(help), handler_void(handler) {}
+
+    // support 2 values for arg
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const char * value_hint_2,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &, const std::string &)
+    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
+
+    llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
+    llama_arg & set_env(const char * env);
+    llama_arg & set_sparam();
+    bool in_example(enum llama_example ex);
+    bool get_value_from_env(std::string & output);
+    bool has_value_from_env();
+    std::string to_string();
+};
+
+struct gpt_params_context {
+    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
+    gpt_params & params;
+    std::vector<llama_arg> options;
+    void(*print_usage)(int, char **) = nullptr;
+    gpt_params_context(gpt_params & params) : params(params) {}
+};
+
+// parse input arguments from CLI
+// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+
+// function to be used by test-arg-parser
+gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@ -4,20 +4,11 @@

 #include "llama.h"

-#include "sampling.h"
-
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"

-#include <cmath>
 #include <string>
 #include <vector>
-#include <random>
-#include <thread>
-#include <set>
-#include <unordered_map>
-#include <tuple>
-#include <functional>

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@ -56,11 +47,20 @@ struct llama_control_vector_load_info;
 // CPU utils
 //

+struct cpu_params {
+    int      n_threads                   = -1;
+    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool     mask_valid                  = false;   // Default: any CPU
+    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool     strict_cpu                  = false;   // Use strict CPU placement
+    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();

 //
-// CLI argument parsing
+// Common params
 //

 enum llama_example {
@ -78,28 +78,71 @@ enum llama_example {
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_LOOKUP,
+    LLAMA_EXAMPLE_PARALLEL,

    LLAMA_EXAMPLE_COUNT,
 };

+enum gpt_sampler_type {
+    GPT_SAMPLER_TYPE_NONE        = 0,
+    GPT_SAMPLER_TYPE_TOP_K       = 1,
+    GPT_SAMPLER_TYPE_TOP_P       = 2,
+    GPT_SAMPLER_TYPE_MIN_P       = 3,
+    GPT_SAMPLER_TYPE_TFS_Z       = 4,
+    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
+    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+};
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
    DIMRE_METHOD_PCA,
    DIMRE_METHOD_MEAN,
 };

-struct cpu_params {
-    int      n_threads                   = -1;
-    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
-    bool     mask_valid                  = false;   // Default: any CPU
-    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
-    bool     strict_cpu                  = false;   // Use strict CPU placement
-    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+// sampler parameters
+struct gpt_sampler_params {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
+
+    int32_t n_prev            = 64;    // number of previous tokens to remember
+    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k             = 40;    // <= 0 to use vocab size
+    float   top_p             = 0.95f; // 1.0 = disabled
+    float   min_p             = 0.05f; // 0.0 = disabled
+    float   tfs_z             = 1.00f; // 1.0 = disabled
+    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
+    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float   dynatemp_range    = 0.00f; // 0.0 = disabled
+    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   penalty_repeat    = 1.00f; // 1.0 = disabled
+    float   penalty_freq      = 0.00f; // 0.0 = disabled
+    float   penalty_present   = 0.00f; // 0.0 = disabled
+    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau      = 5.00f; // target entropy
+    float   mirostat_eta      = 0.10f; // learning rate
+    bool    penalize_nl       = false; // consider newlines as a repeatable token
+    bool    ignore_eos        = false;
+
+    std::vector<enum gpt_sampler_type> samplers = {
+        GPT_SAMPLER_TYPE_TOP_K,
+        GPT_SAMPLER_TYPE_TFS_Z,
+        GPT_SAMPLER_TYPE_TYPICAL_P,
+        GPT_SAMPLER_TYPE_TOP_P,
+        GPT_SAMPLER_TYPE_MIN_P,
+        GPT_SAMPLER_TYPE_TEMPERATURE
+    };
+
+    std::string grammar; // optional BNF-like grammar to constrain sampling
+
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+    // print the parameters into a string
+    std::string print() const;
 };

 struct gpt_params {
-    enum llama_example curr_ex    = LLAMA_EXAMPLE_COMMON;
-
    int32_t n_predict             =    -1; // new tokens to predict
    int32_t n_ctx                 =     0; // context size
    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@ -143,23 +186,23 @@ struct gpt_params {

    struct gpt_sampler_params sparams;

-    std::string model                = ""; // model path
-    std::string model_draft          = ""; // draft model for speculative decoding
-    std::string model_alias          = "unknown"; // model alias
-    std::string model_url            = ""; // model url to download
-    std::string hf_token             = ""; // HF token
-    std::string hf_repo              = ""; // HF repo
-    std::string hf_file              = ""; // HF file
-    std::string prompt               = "";
-    std::string prompt_file          = ""; // store the external prompt file name
-    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state
-    std::string input_prefix         = ""; // string to prefix user inputs with
-    std::string input_suffix         = ""; // string to suffix user inputs with
-    std::string logdir               = ""; // directory in which to save YAML log files
-    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding
-    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file          = ""; // file for saving *all* logits
-    std::string rpc_servers          = ""; // comma separated list of RPC servers
+    std::string model                = ""; // model path                                                    // NOLINT
+    std::string model_draft          = ""; // draft model for speculative decoding                          // NOLINT
+    std::string model_alias          = "unknown"; // model alias                                            // NOLINT
+    std::string model_url            = ""; // model url to download                                         // NOLINT
+    std::string hf_token             = ""; // HF token                                                      // NOLINT
+    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
+    std::string hf_file              = ""; // HF file                                                       // NOLINT
+    std::string prompt               = "";                                                                  // NOLINT
+    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
+    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
+    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
+    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
+    std::string logdir               = ""; // directory in which to save YAML log files                     // NOLINT
+    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
+    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
+    std::string rpc_servers          = ""; // comma separated list of RPC servers                           // NOLINT

    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@ -189,7 +232,6 @@ struct gpt_params {

    bool   kl_divergence    = false; // compute KL divergence

-    std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
    bool usage             = false; // print usage
    bool use_color         = false; // use color to distinguish generations and inputs
    bool special           = false; // enable special token output
@ -220,7 +262,7 @@ struct gpt_params {
    std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
-    std::string mmproj = "";        // path to multimodal projector
+    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
    std::vector<std::string> image; // path to image file(s)

    // embedding
@ -236,15 +278,15 @@ struct gpt_params {
    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)

    std::string hostname      = "127.0.0.1";
-    std::string public_path   = "";
-    std::string chat_template = "";
-    std::string system_prompt = "";
+    std::string public_path   = "";                                                                         // NOLINT
+    std::string chat_template = "";                                                                         // NOLINT
+    std::string system_prompt = "";                                                                         // NOLINT
    bool enable_chat_template = true;

    std::vector<std::string> api_keys;

-    std::string ssl_file_key  = "";
-    std::string ssl_file_cert = "";
+    std::string ssl_file_key  = "";                                                                         // NOLINT
+    std::string ssl_file_cert = "";                                                                         // NOLINT

    bool endpoint_slots   = true;
    bool endpoint_metrics = false;
@ -299,92 +341,6 @@ struct gpt_params {
    bool batched_bench_output_jsonl = false;
 };

-struct llama_arg {
-    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
-    std::vector<const char *> args;
-    const char * value_hint   = nullptr; // help text or example for arg value
-    const char * value_hint_2 = nullptr; // for second arg value
-    const char * env          = nullptr;
-    std::string help;
-    void (*handler_void)   (gpt_params & params) = nullptr;
-    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
-    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
-    void (*handler_int)    (gpt_params & params, int) = nullptr;
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &)
-    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, int)
-    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const std::string & help,
-        void (*handler)(gpt_params & params)
-    ) : args(args), help(help), handler_void(handler) {}
-
-    // support 2 values for arg
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const char * value_hint_2,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &, const std::string &)
-    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
-
-    llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
-        this->examples = std::move(examples);
-        return *this;
-    }
-
-    llama_arg & set_env(const char * env) {
-        help = help + "\n(env: " + env + ")";
-        this->env = env;
-        return *this;
-    }
-
-    bool in_example(enum llama_example ex) {
-        return examples.find(ex) != examples.end();
-    }
-
-    bool get_value_from_env(std::string & output) const {
-        if (env == nullptr) return false;
-        char * value = std::getenv(env);
-        if (value) {
-            output = value;
-            return true;
-        }
-        return false;
-    }
-
-    bool has_value_from_env() const {
-        return env != nullptr && std::getenv(env);
-    }
-
-    std::string to_string();
-};
-
-// initialize list of options (arguments) that can be used by the current example
-std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
-// optionally, we can provide "print_usage" to print example usage
-std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
-
-// parse input arguments from CLI
-// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool gpt_params_parse   (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
-
-// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
-void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);
-
 std::string gpt_params_get_system_info(const gpt_params & params);

 bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -2,6 +2,9 @@

 #include "common.h"

+#include <cmath>
+#include <unordered_map>
+
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
 template<typename T>
@ -307,6 +310,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
    return cur_p.data[cur_p.selected].id;
 }

+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+    return llama_sampler_get_seed(gsmpl->chain);
+}
+
 // helpers

 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
@ -420,7 +427,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
 }

 std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_sampler_type> sampler_name_map {
+    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),       GPT_SAMPLER_TYPE_TOP_K },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),       GPT_SAMPLER_TYPE_TFS_Z },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P),   GPT_SAMPLER_TYPE_TYPICAL_P },
--- a/common/sampling.h
+++ b/common/sampling.h
@ -2,61 +2,11 @@

 #include "llama.h"

+#include "common.h"
+
 #include <string>
 #include <vector>

-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE        = 0,
-    GPT_SAMPLER_TYPE_TOP_K       = 1,
-    GPT_SAMPLER_TYPE_TOP_P       = 2,
-    GPT_SAMPLER_TYPE_MIN_P       = 3,
-    GPT_SAMPLER_TYPE_TFS_Z       = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
-};
-
-// sampling parameters
-struct gpt_sampler_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
-    int32_t n_prev            = 64;    // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   min_p             = 0.05f; // 0.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
-    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float   dynatemp_range    = 0.00f; // 0.0 = disabled
-    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.00f; // 1.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   penalty_present   = 0.00f; // 0.0 = disabled
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = false; // consider newlines as a repeatable token
-    bool    ignore_eos        = false;
-
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_TEMPERATURE
-    };
-
-    std::string grammar; // optional BNF-like grammar to constrain sampling
-
-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
-    // print the parameters into a string
-    std::string print() const;
-};
-
 // gpt_sampler extends llama_sampler with additional functionality:
 //
 //  - grammar support
@ -110,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+
 // helpers

 // access the internal list of current candidate tokens
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -302,6 +302,8 @@ class Model:
                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
                            gguf.MODEL_TENSOR.TIME_MIX_W1,
                            gguf.MODEL_TENSOR.TIME_MIX_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
                        )
                    )
                    or not new_name.endswith(".weight")
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@ -31,6 +31,7 @@ import re
 import requests
 import sys
 import json
+import shutil

 from hashlib import sha256
 from enum import IntEnum, auto
@ -125,6 +126,21 @@ def download_model(model):
    if tokt == TOKENIZER_TYPE.UGM:
        files.append("spiece.model")

+    if os.path.isdir(repo):
+        # If repo is a path on the file system, copy the directory
+        for file in files:
+            src_path = os.path.join(repo, file)
+            dst_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(dst_path):
+                logger.info(f"{name}: File {dst_path} already exists - skipping")
+                continue
+            if os.path.isfile(src_path):
+                shutil.copy2(src_path, dst_path)
+                logger.info(f"{name}: Copied {src_path} to {dst_path}")
+            else:
+                logger.warning(f"{name}: Source file {src_path} does not exist")
+    else:
+        # If repo is a URL, download the files
        for file in files:
            save_path = f"models/tokenizers/{name}/{file}"
            if os.path.isfile(save_path):
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@ -1,33 +1,12 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"

 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>

-// mutates the input string
-static std::vector<int> parse_list(char * p) {
-    std::vector<int> ret;
-
-    char * q = p;
-
-    while (*p) {
-        if (*p == ',') {
-            *p = '\0';
-            ret.push_back(std::atoi(q));
-            q = p + 1;
-        }
-
-        ++p;
-    }
-
-    ret.push_back(std::atoi(q));
-
-    return ret;
-}
-
 static void print_usage(int, char ** argv) {
    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
@ -37,8 +16,7 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
        return 1;
    }

--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@ -140,8 +140,6 @@ while n_cur <= n_len {

        let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])

-        llama_sampler_accept(smpl, new_token_id)
-
        // is it an end of stream? -> mark the stream as finished
        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            i_batch[i] = -1
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"

@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
    params.prompt = "Hello my name is";
    params.n_predict = 32;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

@ -172,8 +172,6 @@ int main(int argc, char ** argv) {

            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);

-            llama_sampler_accept(smpl, new_token_id);
-
            // is it an end of generation? -> mark the stream as finished
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                i_batch[i] = -1;
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"
@ -388,8 +389,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
        return 1;
    }

--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@ -12,12 +12,9 @@

 #include <cstdio>
 #include <ctime>
+#include <random>
 #include <string>
-#include <tuple>
 #include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>

 #define DEBUG_POS 5

--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"

@ -79,8 +80,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
        return 1;
    }

@ -90,8 +90,6 @@ int main(int argc, char ** argv) {

    print_build_info();

-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
    llama_backend_init();
    llama_numa_init(params.numa);

--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"
@ -144,8 +145,7 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@ -401,8 +402,7 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
        return 1;
    }

--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"

 #include <fstream>
@ -9,11 +10,11 @@ static void export_md(std::string fname, llama_example ex) {
    std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);

    gpt_params params;
-    auto options = gpt_params_parser_init(params, ex);
+    auto ctx_arg = gpt_params_parser_init(params, ex);

    file << "| Argument | Explanation |\n";
    file << "| -------- | ----------- |\n";
-    for (auto & opt : options) {
+    for (auto & opt : ctx_arg.options) {
        file << "| `";
        // args
        for (const auto & arg : opt.args) {
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"

@ -121,7 +122,6 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
        llama_decode(ctx, bat);

        llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
-        llama_sampler_accept(smpl, token);

        if (token == eos_token) {
            break;
@ -154,8 +154,7 @@ static std::string gritlm_instruction(const std::string & instruction) {
 int main(int argc, char * argv[]) {
    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"

@ -577,8 +578,7 @@ int main(int argc, char ** argv) {
    params.logits_all = true;
    params.verbosity = 1;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
        return 1;
    }

--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@ -1,6 +1,7 @@
+#include "arg.h"
 #include "common.h"
-
 #include "console.h"
+#include "sampling.h"
 #include "llama.h"

 #include <cassert>
@ -105,8 +106,7 @@ int main(int argc, char ** argv) {
    gpt_params params;
    g_params = &params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_INFILL);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
        return 1;
    }

@ -159,8 +159,6 @@ int main(int argc, char ** argv) {

    print_build_info();

-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
    LOG("%s: llama backend init\n", __func__);
    llama_backend_init();
    llama_numa_init(params.numa);
@ -301,6 +299,9 @@ int main(int argc, char ** argv) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
        }
    }
+    smpl = gpt_sampler_init(model, sparams);
+
+    LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
    LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");
@ -340,8 +341,6 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd;

-    smpl = gpt_sampler_init(model, sparams);
-
    while (n_remain != 0 || params.interactive) {
        // predict
        if (!embd.empty()) {
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@ -414,8 +414,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
    // sample the most likely token
    const auto new_token_id = llama_sampler_sample(sampler, context, -1);

-    llama_sampler_accept(sampler, new_token_id);
-
    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
    if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
        return nullptr;
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@ -152,8 +152,6 @@ actor LlamaContext {

        new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)

-        llama_sampler_accept(sampling, new_token_id)
-
        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            print("\n")
            is_done = true
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@ -1,11 +1,12 @@
-#include "ggml.h"
+#include "arg.h"
+#include "base64.hpp"
 #include "log.h"
 #include "common.h"
+#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
-
-#include "base64.hpp"
+#include "ggml.h"

 #include <cstdio>
 #include <cstdlib>
@ -278,8 +279,7 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
        return 1;
    }

--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@ -1,9 +1,11 @@
-#include "ggml.h"
+#include "arg.h"
 #include "log.h"
 #include "common.h"
+#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
+#include "ggml.h"

 #include <cstdio>
 #include <cstdlib>
@ -16,8 +18,8 @@ struct llava_context {
 };

 static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG_TEE("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+    LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

 static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
@ -253,8 +255,7 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
        return 1;
    }

--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
 #include "llama.h"

 #include <cstdio>
@ -36,8 +38,7 @@ struct ngram_container {
 int main(int argc, char ** argv) {
    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@ -1,7 +1,8 @@
-#include "ggml.h"
-#include "llama.h"
+#include "arg.h"
 #include "common.h"
 #include "ngram-cache.h"
+#include "ggml.h"
+#include "llama.h"

 #include <cstdint>
 #include <fstream>
@ -13,8 +14,7 @@
 int main(int argc, char ** argv){
    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
        return 1;
    }

@ -40,4 +40,6 @@ int main(int argc, char ** argv){
    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());

    llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+
+    return 0;
 }
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@ -1,8 +1,9 @@
-#include "ggml.h"
+#include "arg.h"
 #include "common.h"
-#include "llama.h"
 #include "log.h"
 #include "ngram-cache.h"
+#include "llama.h"
+#include "ggml.h"

 #include <cmath>
 #include <cstdint>
@ -15,8 +16,7 @@
 int main(int argc, char ** argv){
    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
        return 1;
    }

--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@ -1,7 +1,9 @@
+#include "arg.h"
 #include "ggml.h"
-#include "llama.h"
 #include "common.h"
 #include "ngram-cache.h"
+#include "sampling.h"
+#include "llama.h"

 #include <cstdint>
 #include <cstdio>
@ -12,8 +14,7 @@
 int main(int argc, char ** argv){
    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
        return 1;
    }

--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -1,6 +1,7 @@
+#include "arg.h"
 #include "common.h"
-
 #include "console.h"
+#include "sampling.h"
 #include "llama.h"

 #include <cassert>
@ -138,9 +139,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
 int main(int argc, char ** argv) {
    gpt_params params;
    g_params = &params;
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
-
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
        return 1;
    }

@ -192,8 +191,6 @@ int main(int argc, char ** argv) {

    print_build_info();

-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
    LOG("%s: llama backend init\n", __func__);
    llama_backend_init();
    llama_numa_init(params.numa);
@ -471,8 +468,10 @@ int main(int argc, char ** argv) {
        exit(1);
    }

+    LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
    LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
    LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
+
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);

    // group-attention state
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@ -1,7 +1,9 @@
 // A basic application simulating a server with multiple clients.
 // The clients submit requests to the server and they are processed in parallel.

+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
 #include "llama.h"

 #include <cmath>
@ -100,8 +102,7 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
        return 1;
    }

--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"

@ -19,8 +20,7 @@ int main(int argc, char ** argv) {
    params.n_keep = 32;
    params.i_pos  = -1;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
        return 1;
    }

@ -220,8 +220,6 @@ int main(int argc, char ** argv) {
        {
            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);

-            llama_sampler_accept(smpl, new_token_id);
-
            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                LOG_TEE("\n");
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -1,18 +1,19 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"

+#include <array>
+#include <atomic>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <fstream>
+#include <mutex>
+#include <random>
 #include <sstream>
 #include <thread>
-#include <mutex>
-#include <atomic>
 #include <vector>
-#include <array>
-#include <fstream>
-#include <sstream>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@ -1967,8 +1968,7 @@ int main(int argc, char ** argv) {
    params.n_ctx = 512;
    params.logits_all = true;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
        return 1;
    }

@ -2007,8 +2007,6 @@ int main(int argc, char ** argv) {

    print_build_info();

-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
    llama_backend_init();
    llama_numa_init(params.numa);

--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"

@ -111,8 +112,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
        return 1;
    }

--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"

@ -10,8 +11,7 @@ int main(int argc, char ** argv) {
    params.prompt = "The quick brown fox";
    params.sparams.seed = 1234;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

@ -74,8 +74,6 @@ int main(int argc, char ** argv) {
        auto next_token     = llama_sampler_sample(smpl, ctx, -1);
        auto next_token_str = llama_token_to_piece(ctx, next_token);

-        llama_sampler_accept(smpl, next_token);
-
        printf("%s", next_token_str.c_str());
        result0 += next_token_str;

@ -132,8 +130,6 @@ int main(int argc, char ** argv) {
        auto next_token     = llama_sampler_sample(smpl2, ctx2, -1);
        auto next_token_str = llama_token_to_piece(ctx2, next_token);

-        llama_sampler_accept(smpl2, next_token);
-
        printf("%s", next_token_str.c_str());
        result1 += next_token_str;

@ -222,8 +218,6 @@ int main(int argc, char ** argv) {
        auto next_token     = llama_sampler_sample(smpl3, ctx3, -1);
        auto next_token_str = llama_token_to_piece(ctx3, next_token);

-        llama_sampler_accept(smpl3, next_token);
-
        printf("%s", next_token_str.c_str());
        result2 += next_token_str;

--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -23,36 +23,32 @@ The project is under active development, and we are [looking for feedback and co
 | `--version` | show version and build info |
 | `-v, --verbose` | print verbose information |
 | `--verbosity N` | set specific verbosity level (default: 0) |
-| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
-| `--no-display-prompt` | don't print prompt at generation (default: false) |
-| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
 | `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
 | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
 | `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
 | `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
 | `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
+| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
 | `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
 | `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
 | `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
 | `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
+| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
 | `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
-| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
-| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
 | `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
 | `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
 | `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
 | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
 | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
-| `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
 | `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
 | `-p, --prompt PROMPT` | prompt to start generation with |
 | `-f, --file FNAME` | a file containing the prompt (default: none) |
-| `--in-file FNAME` | an input file (repeat to specify multiple files) |
 | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
 | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
 | `--no-escape` | do not process escape sequences |
 | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
-| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) |
+| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
+| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
 | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
 | `--penalize-nl` | penalize newline tokens (default: false) |
@ -92,13 +88,12 @@ The project is under active development, and we are [looking for feedback and co
 | `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1) |
-| `-ns, --sequences N` | number of sequences to decode (default: 1) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing |
 | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
-| `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
+| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
@ -109,7 +104,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
 | `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
 | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
-| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_MODEL) |
+| `-a, --alias STRING` | set alias for model name (to be used by REST API) |
 | `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
 | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
@ -123,7 +118,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate |
-| `--timeout N` | server read/write timeout in seconds (default: 600) |
+| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
 | `--log-format {text, json}` | log output format: json or text (default: json) |
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1,6 +1,8 @@
 #include "utils.hpp"

+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
 #include "json-schema-to-grammar.h"
 #include "llama.h"

@ -613,7 +615,7 @@ struct server_context {

    gpt_params params;

-    llama_batch batch;
+    llama_batch batch = {};

    bool clean_kv_cache = true;
    bool add_bos_token  = true;
@ -1264,6 +1266,7 @@ struct server_context {
            {"n_predict",                 slot.n_predict},     // Server configured n_predict
            {"model",                     params.model_alias},
            {"seed",                      slot.sparams.seed},
+            {"seed_cur",                  slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0},
            {"temperature",               slot.sparams.temp},
            {"dynatemp_range",            slot.sparams.dynatemp_range},
            {"dynatemp_exponent",         slot.sparams.dynatemp_exponent},
@ -2423,8 +2426,7 @@ int main(int argc, char ** argv) {
    // own arguments required by this example
    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
        return 1;
    }

--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"

@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
    params.prompt = "Hello my name is";
    params.n_predict = 32;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

@ -118,8 +118,6 @@ int main(int argc, char ** argv) {
        {
            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);

-            llama_sampler_accept(smpl, new_token_id);
-
            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                LOG_TEE("\n");
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -1,11 +1,13 @@
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
 #include "llama.h"

-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 #include <set>
+#include <random>

 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  100
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@ -27,8 +29,7 @@ struct seq_draft {
 int main(int argc, char ** argv) {
    gpt_params params;

-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
        return 1;
    }

--- a/flake.lock
+++ b/flake.lock
@ -5,11 +5,11 @@
        "nixpkgs-lib": "nixpkgs-lib"
      },
      "locked": {
-        "lastModified": 1725024810,
-        "narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
+        "lastModified": 1725234343,
+        "narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=",
        "owner": "hercules-ci",
        "repo": "flake-parts",
-        "rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
+        "rev": "567b938d64d4b4112ee253b9274472dc3a346eb6",
        "type": "github"
      },
      "original": {
@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1724819573,
-        "narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
+        "lastModified": 1725634671,
+        "narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
+        "rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c",
        "type": "github"
      },
      "original": {
@ -36,14 +36,14 @@
    },
    "nixpkgs-lib": {
      "locked": {
-        "lastModified": 1722555339,
-        "narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
+        "lastModified": 1725233747,
+        "narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
      },
      "original": {
        "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
      }
    },
    "root": {
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@ -1065,6 +1065,11 @@ if (NOT MSVC)
    endif()
 endif()

+list(LENGTH CMAKE_OSX_ARCHITECTURES OSX_ARCHITECTURES_LENGTH)
+if (OSX_ARCHITECTURES_LENGTH GREATER 1)
+    message(FATAL_ERROR "There's no support for multiple architectures with CMAKE_OSX_ARCHITECTURES.\nPlease use 'lipo -create ...' to create a universal library/binary.")
+endif()
+
 set(ARCH_FLAGS "")

 if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@ -2552,7 +2552,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
        for (int i = 0; i < cgraph->n_nodes; i++) {
            ggml_tensor * node = cgraph->nodes[i];

-            if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
+            if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+                continue;
+            }
+
+            if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q(
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

-    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
+    // The stream-k decomposition is only faster for recent NVIDIA GPUs.
+    // Also its fixup needs to allocate a temporary buffer in the memory pool.
+    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
+    const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
+    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@ -2742,6 +2742,7 @@ struct mmq_args {
    int64_t ne00; int64_t ne01; int64_t stride01;
    int64_t ne10; int64_t ne11; int64_t stride11;
    int64_t ne0;
+    bool use_stream_k;
 };

 template<ggml_type type>
@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
    const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
    const dim3 block_nums_xy_tiling(nty, ntx, 1);

-    const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
-    if (!use_stream_k) {
+    if (!args.use_stream_k) {
        if (args.ne01 % mmq_y == 0) {
            constexpr bool need_check = false;
            mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
--- a/ggml/src/ggml-cuda/vendors/musa.h
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@ -130,42 +130,3 @@
 #define cudaKernelNodeParams musaKernelNodeParams
 #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
 #define cudaStreamEndCapture musaStreamEndCapture
-
-// XXX: Clang builtins mapping
-#define __vsub4   __vsub4_musa
-#define __vcmpeq4 __vcmpeq4_musa
-#define __vcmpne4 __vcmpne4_musa
-
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
-
-static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
-    return __vsubss4(a, b);
-}
-
-static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
-    }
-    return c;
-}
-
-static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
-    }
-    return c;
-}
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@ -3039,8 +3039,7 @@ static enum ggml_status ggml_metal_graph_compute(
        if (status != MTLCommandBufferStatusCompleted) {
            GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
            if (status == MTLCommandBufferStatusError) {
-                NSString * error_code = [command_buffer error].localizedDescription;
-                GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
+                GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
            }

            return GGML_STATUS_FAILED;
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@ -4003,13 +4003,18 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
    float sumf = 0;

 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
-        const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
-        const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
-
    svfloat32_t sumv0 = svdup_n_f32(0.0f);
    svfloat32_t sumv1 = svdup_n_f32(0.0f);

+    const int vector_length = ggml_sve_cnt_b*8;
+
+    // VLA Implementation using switch case
+    switch (vector_length) {
+        case 128:
+            {
+                // predicate for activating higher lanes for 4 float32 elements
+                const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
+
                for (; ib + 1 < nb; ib += 2) {
                    const block_q4_0 * restrict x0 = &x[ib + 0];
                    const block_q4_0 * restrict x1 = &x[ib + 1];
@ -4021,8 +4026,54 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
                    const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);

                    // 4-bit -> 8-bit
-            const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
-            const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
+                    const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F));
+                    const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04));
+                    const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F));
+                    const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04));
+
+                    // sub 8
+                    const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8);
+                    const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8);
+                    const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8);
+                    const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8);
+
+                    // load y
+                    const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs);
+                    const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16);
+                    const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs);
+                    const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16);
+
+                    // dot product
+                    sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4,
+                                    svdot_s32(svdup_n_s32(0), qx0ls, qy0l),
+                                    svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4,
+                                    svdot_s32(svdup_n_s32(0), qx1ls, qy1l),
+                                    svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+            } break;
+        case 256:
+            {
+                // predicate for activating higher lanes for 16 int8 elements
+                const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
+                // predicate for activating lower lanes for  16 int8 elements
+                const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q4_0 * restrict x0 = &x[ib + 0];
+                    const block_q4_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
+                    const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
+
+                    // 4-bit -> 8-bit
+                    const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
+                    const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));

                    // sub 8
                    const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
@ -4033,12 +4084,60 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
                    const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);

                    // dot product
-            sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-            sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                    sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
                }

                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+            } break;
+        case 512:
+            {
+                // predicate for activating higher lanes for 32 int8 elements
+                const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
+
+                // predicate for activating higher lanes for 16 int8 elements
+                const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
+                // predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes
+                const svbool_t pl16 = svnot_b_z(ph32, ph16);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q4_0 * restrict x0 = &x[ib + 0];
+                    const block_q4_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
+                    const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs);
+
+                    // 4-bit -> 8-bit
+                    const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
+                    const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
+
+                    // sub 8
+                    const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8);
+                    const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8);
+
+                    // load y
+                    const svint8_t qy0 = svld1_s8(ph32, y0->qs);
+                    const svint8_t qy1 = svld1_s8(ph32, y1->qs);
+
+                    // dot product
+                    sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32,
+                                svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32,
+                                svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
                }
+
+                sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1));
+            } break;
+        default:
+            assert(false && "Unsupported vector length");
+            break;
+    }
+
 #elif defined(__ARM_NEON)
    float32x4_t sumv0 = vdupq_n_f32(0.0f);
    float32x4_t sumv1 = vdupq_n_f32(0.0f);
@ -5488,10 +5587,50 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
    float sumf = 0;

 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
    svfloat32_t sumv0 = svdup_n_f32(0.0f);
    svfloat32_t sumv1 = svdup_n_f32(0.0f);

+    const int vector_length = ggml_sve_cnt_b*8;
+
+    //VLA Implemenation for SVE
+    switch (vector_length) {
+        case 128:
+            {
+                // predicate for activating lanes for 16 Int8 elements
+                const svbool_t ph16 = svptrue_pat_b8 (SV_VL16);
+                const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
+                    const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16);
+                    const svint8_t qx1_0 = svld1_s8(ph16, x1->qs);
+                    const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16);
+
+                    // load y
+                    const svint8_t qy0_0 = svld1_s8(ph16, y0->qs);
+                    const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16);
+                    const svint8_t qy1_0 = svld1_s8(ph16, y1->qs);
+                    const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16);
+
+                    sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16,
+                                    svdot_s32(svdup_n_s32(0), qx0_0, qy0_0),
+                                    svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16,
+                                    svdot_s32(svdup_n_s32(0), qx1_0, qy1_0),
+                                    svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1));
+            } break;
+        case 256:
+            {
+                //printf("sve256");
                for (; ib + 1 < nb; ib += 2) {
                    const block_q8_0 * restrict x0 = &x[ib + 0];
                    const block_q8_0 * restrict x1 = &x[ib + 1];
@ -5506,11 +5645,66 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
                    const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
                    const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);

-            sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-            sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                    sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
                }

                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+            } break;
+        case 512:
+            {
+                // predicate for activating high 256 bit
+                const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
+                // predicate for activating low 256 bit
+                const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32);
+
+                // predicate for activating high lanes for 8 float32 elements
+                const svbool_t ph8 = svptrue_pat_b32(SV_VL8);
+                // predicate for activating low lanes for 8 float32 elements
+                const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8);
+
+                svfloat32_t sumv00 = svdup_n_f32(0.0f);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
+                    // and add them to make one 64 element vector
+                    // load x
+                    const svint8_t qx_32 = svld1_s8(ph32, x0->qs);
+                          svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2);
+
+                    qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64);
+
+                    // load y
+                    const svint8_t qy_32 = svld1_s8(ph32, y0->qs);
+                          svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2);
+
+                    qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64);
+
+                    // scale creation
+                    const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d);
+                    const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d);
+
+                    // duplicate deq1 in first half of vector and deq2 in second half of vector
+                    const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2);
+
+                    const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64));
+
+                    sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp);
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), sumv00);
+                break;
+            }
+        default:
+            assert(false && "Unsupported vector length");
+            break;
    }
 #elif defined(__ARM_NEON)
    float32x4_t sumv0 = vdupq_n_f32(0.0f);
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@ -883,15 +883,17 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
    }
    result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
    if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
-        return nullptr;
+        result->buffer = nullptr;
    }

+    if (result->buffer) {
        // require that the tensor data does not go beyond the buffer end
        uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
        uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
        uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
        GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
        GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
+    }

    result->op = (ggml_op) tensor->op;
    for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
@ -1060,7 +1062,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
    const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
    GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);

-    static size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
+    size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ NULL,
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@ -5137,13 +5137,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_CLAMP:
+            return true;
        case GGML_OP_CONT:
+            return op->src[0]->type != GGML_TYPE_BF16;
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
            return true;
        case GGML_OP_ROPE:
            return ggml_is_contiguous(op->src[0]);
        case GGML_OP_IM2COL:
+            // TODO: add support for the new F32 operations
+            return op->src[0]->type == GGML_TYPE_F16;
        case GGML_OP_POOL_2D:
        case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@ -3847,7 +3847,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml

    if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
        GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                __func__, cur_end + size_needed, ctx->mem_size);
+                __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
        assert(false);
        return NULL;
    }
--- a/include/llama.h
+++ b/include/llama.h
@ -1127,15 +1127,20 @@ extern "C" {
                             int32_t   n_logit_bias,
              const llama_logit_bias * logit_bias);

-    // Shorthand for:
+
+    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
+    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
+
+    /// @details Sample and accept a token from the idx-th output of the last evaluation
    //
+    // Shorthand for:
    //    const auto * logits = llama_get_logits_ith(ctx, idx);
    //    llama_token_data_array cur_p = { ... init from logits ... };
    //    llama_sampler_apply(smpl, &cur_p);
-    //    return cur_p.data[cur_p.selected].id;
-    //
-    // At this point, this is mostly a convenience function.
-    //
+    //    auto token = cur_p.data[cur_p.selected].id;
+    //    llama_sampler_accept(smpl, token);
+    //    return token;
+    // Returns the sampled token
    LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);

    // TODO: extend in the future
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@ -8,49 +8,45 @@
 #include <cstring>
 #include <ctime>
 #include <cfloat>
+#include <chrono>
+#include <cmath>
 #include <numeric>
 #include <random>
 #include <unordered_map>

-static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng, std::vector<float> & probs) {
-#if 1
-    probs.resize(cur_p->size);
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        probs[i] = cur_p->data[i].p;
-    }
-
-    std::discrete_distribution<size_t> dist(probs.begin(), probs.end());
-#else
-    // avoid the copy with a custom iterator
+static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
+    // iterator for the probabilities
+#ifdef __GNUC__
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wunused-local-typedefs"
+#endif

    struct probs_iterator {
        typedef std::input_iterator_tag iterator_category;
        typedef float value_type;
        typedef float * pointer;
        typedef float & reference;
-        typedef size_t difference_type;
+        typedef ptrdiff_t difference_type;

-        const llama_token_data_array * data;
-        size_t i;
+        const llama_token_data * data;

-        bool operator==(const probs_iterator & other) const { return data + i == other.data + other.i; }
-        bool operator!=(const probs_iterator & other) const { return data + i != other.data + other.i; }
-        float operator*() const { return data->data[i].p; }
-        probs_iterator & operator++() { ++i; return *this; }
-        probs_iterator operator++(int) { probs_iterator tmp = *this; ++i; return tmp; }
+        bool operator==(const probs_iterator & other) const { return data == other.data; }
+        bool operator!=(const probs_iterator & other) const { return data != other.data; }
+        const float & operator*() const { return data->p; }
+        probs_iterator & operator++() { ++data; return *this; }
+        probs_iterator operator++(int) { probs_iterator tmp = *this; ++data; return tmp; }
    };
+
+#ifdef __GNUC__
    #pragma GCC diagnostic pop
-
-    std::discrete_distribution<size_t> dist(probs_iterator{cur_p, 0}, probs_iterator{cur_p, cur_p->size});
-
-    GGML_UNUSED(probs);
 #endif

+    std::discrete_distribution<int> dist(probs_iterator{cur_p->data}, probs_iterator{cur_p->data + cur_p->size});
+
    return dist(rng);
 }

+/*
 static void llama_log_softmax(float * array, size_t size) {
    float max_l = *std::max_element(array, array + size);
    float sum = 0.f;
@ -64,6 +60,7 @@ static void llama_log_softmax(float * array, size_t size) {
        array[i] = logf(array[i] / sum);
    }
 }
+*/

 static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
    GGML_ASSERT(cur_p->size > 0);
@ -166,6 +163,19 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
    cur_p->size = k;
 }

+static uint32_t get_rng_seed(uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        // use system clock if std::random_device is not a true RNG
+        static bool is_rd_prng = std::random_device().entropy() == 0;
+        if (is_rd_prng) {
+            return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count();
+        }
+        std::random_device rd;
+        return rd();
+    }
+    return seed;
+}
+
 // llama_sampler API

 const char * llama_sampler_name(const struct llama_sampler * smpl) {
@ -231,18 +241,31 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte
        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
    }

-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    llama_token_data_array cur_p = {
+        /* .data       = */ cur.data(),
+        /* .size       = */ cur.size(),
+        /* .selected   = */ -1,
+        /* .sorted     = */ false,
+    };

    llama_sampler_apply(smpl, &cur_p);

-    return cur_p.data[cur_p.selected].id;
+    GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
+
+    auto token = cur_p.data[cur_p.selected].id;
+
+    llama_sampler_accept(smpl, token);
+
+    return token;
 }

 // sampler chain

-static struct llama_sampler_i llama_sampler_chain_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "chain"; },
-    /* .accept = */ [](struct llama_sampler * smpl, llama_token token) {
+static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) {
+    return "chain";
+}
+
+static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token token) {
    auto * chain = (llama_sampler_chain *) smpl->ctx;

    time_meas tm(chain->t_sample_us, chain->params.no_perf);
@ -252,8 +275,9 @@ static struct llama_sampler_i llama_sampler_chain_i = {
    }

    chain->n_sample++;
-    },
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+}
+
+static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    auto * chain = (llama_sampler_chain *) smpl->ctx;

    time_meas tm(chain->t_sample_us, chain->params.no_perf);
@ -261,8 +285,9 @@ static struct llama_sampler_i llama_sampler_chain_i = {
    for (auto * smpl : chain->samplers) {
        llama_sampler_apply(smpl, cur_p);
    }
-    },
-    /* .reset  = */ [](struct llama_sampler * smpl) {
+}
+
+static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
    auto * chain = (llama_sampler_chain *) smpl->ctx;

    for (auto * smpl : chain->samplers) {
@ -271,8 +296,9 @@ static struct llama_sampler_i llama_sampler_chain_i = {

    chain->t_sample_us = 0;
    chain->n_sample    = 0;
-    },
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
+}
+
+static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
    const auto * chain_src = (const llama_sampler_chain *) smpl->ctx;

    auto * result = llama_sampler_chain_init(chain_src->params);
@ -282,8 +308,9 @@ static struct llama_sampler_i llama_sampler_chain_i = {
    }

    return result;
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
+}
+
+static void llama_sampler_chain_free(struct llama_sampler * smpl) {
    auto * chain = (llama_sampler_chain *) smpl->ctx;

    for (auto * smpl : chain->samplers) {
@ -291,7 +318,15 @@ static struct llama_sampler_i llama_sampler_chain_i = {
    }

    delete chain;
-    },
+}
+
+static struct llama_sampler_i llama_sampler_chain_i = {
+    /* .name   = */ llama_sampler_chain_name,
+    /* .accept = */ llama_sampler_chain_accept,
+    /* .apply  = */ llama_sampler_chain_apply,
+    /* .reset  = */ llama_sampler_chain_reset,
+    /* .clone  = */ llama_sampler_chain_clone,
+    /* .free   = */ llama_sampler_chain_free,
 };

 struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
@ -366,10 +401,9 @@ struct llama_sampler * llama_sampler_init_greedy() {

 struct llama_sampler_dist {
    const uint32_t seed;
+          uint32_t seed_cur;

    std::mt19937 rng;
-
-    std::vector<float> probs; // work array
 };

 static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) {
@ -378,7 +412,7 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*

 static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    auto * ctx = (llama_sampler_dist *) smpl->ctx;
-    cur_p->selected = llama_sample_dist(cur_p, ctx->rng, ctx->probs);
+    cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
 }

 static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) {
@ -397,7 +431,8 @@ static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sample

 static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
    auto * ctx = (llama_sampler_dist *) smpl->ctx;
-    ctx->rng = std::mt19937(ctx->seed);
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->rng.seed(ctx->seed_cur);
 }

 static void llama_sampler_dist_free(struct llama_sampler * smpl) {
@ -414,12 +449,13 @@ static struct llama_sampler_i llama_sampler_dist_i = {
 };

 struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
+    auto seed_cur = get_rng_seed(seed);
    return new llama_sampler {
        /* .iface = */ &llama_sampler_dist_i,
        /* .ctx   = */ new llama_sampler_dist {
            /* .seed     = */ seed,
-            /* .rng  = */ std::mt19937(seed),
-            /* .probs = */ {},
+            /* .seed_cur = */ seed_cur,
+            /* .rng      = */ std::mt19937(seed_cur),
        },
    };
 }
@ -1014,6 +1050,7 @@ struct llama_sampler_mirostat {
    const int32_t n_vocab;

    const uint32_t seed;
+          uint32_t seed_cur;

    const float tau;
    const float eta;
@ -1023,8 +1060,6 @@ struct llama_sampler_mirostat {
    float mu;

    std::mt19937 rng;
-
-    std::vector<float> probs;
 };

 static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) {
@ -1055,7 +1090,7 @@ static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_toke
    llama_sampler_top_k_impl(cur_p, std::max(int(k), 1));
    llama_sampler_softmax_impl(cur_p);

-    const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs);
+    const int idx = llama_sample_dist(cur_p, ctx->rng);

    cur_p->selected = idx;

@ -1084,7 +1119,8 @@ static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sa
 static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) {
    auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
    ctx->mu = 2.0f*ctx->tau;
-    ctx->rng = std::mt19937(ctx->seed);
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->rng.seed(ctx->seed_cur);
 }

 static void llama_sampler_mirostat_free(struct llama_sampler * smpl) {
@ -1101,17 +1137,18 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
 };

 struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
+    auto seed_cur = get_rng_seed(seed);
    return new llama_sampler {
        /* .iface = */ &llama_sampler_mirostat_i,
        /* .ctx   = */ new llama_sampler_mirostat {
            /* .n_vocab  = */ n_vocab,
            /* .seed     = */ seed,
+            /* .seed_cur = */ seed_cur,
            /* .tau      = */ tau,
            /* .eta      = */ eta,
            /* .m        = */ m,
            /* .mu       = */ 2.0f*tau,
-            /* .rng     = */ std::mt19937(seed),
-            /* .probs   = */ {},
+            /* .rng      = */ std::mt19937(seed_cur),
        },
    };
 }
@ -1120,6 +1157,7 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see

 struct llama_sampler_mirostat_v2 {
    const uint32_t seed;
+          uint32_t seed_cur;

    const float tau;
    const float eta;
@ -1127,8 +1165,6 @@ struct llama_sampler_mirostat_v2 {
    float mu;

    std::mt19937 rng;
-
-    std::vector<float> probs;
 };

 static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) {
@ -1152,7 +1188,7 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
    // Normalize the probabilities of the remaining words
    llama_sampler_softmax_impl(cur_p);

-    const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs);
+    const int idx = llama_sample_dist(cur_p, ctx->rng);

    cur_p->selected = idx;

@ -1166,7 +1202,8 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
 static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) {
    auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
    ctx->mu = 2.0f*ctx->tau;
-    ctx->rng = std::mt19937(ctx->seed);
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->rng.seed(ctx->seed_cur);
 }

 static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) {
@ -1199,15 +1236,16 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
 };

 struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
+    auto seed_cur = get_rng_seed(seed);
    return new llama_sampler {
        /* .iface = */ &llama_sampler_mirostat_v2_i,
        /* .ctx   = */ new llama_sampler_mirostat_v2 {
            /* .seed     = */ seed,
+            /* .seed_cur = */ seed_cur,
            /* .tau      = */ tau,
            /* .eta      = */ eta,
            /* .mu       = */ 2.0f*tau,
-            /* .rng   = */ std::mt19937(seed),
-            /* .probs = */ {},
+            /* .rng      = */ std::mt19937(seed_cur),
        },
    };
 }
@ -1493,6 +1531,8 @@ struct llama_sampler * llama_sampler_init_penalties(
        ignore_eos = false;
    }

+    penalty_last_n = std::max(penalty_last_n, 0);
+
    return new llama_sampler {
        /* .iface = */ &llama_sampler_penalties_i,
        /* .ctx   = */ new llama_sampler_penalties {
@ -1527,6 +1567,10 @@ static const char * llama_sampler_logit_bias_name(const struct llama_sampler * /
 static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    auto * ctx = (llama_sampler_logit_bias *) smpl->ctx;

+    if (ctx->logit_bias.empty()) {
+        return;
+    }
+
    ctx->to_search.clear();

    // update the candidates that have not been shuffled in the vocabulary (i.e. idx == id)
@ -1538,6 +1582,10 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
        }
    }

+    if (ctx->to_search.empty()) {
+        return;
+    }
+
    // search for the remaining candidates that were not found in the previous step
    for (size_t i = 0; i < cur_p->size; ++i) {
        for (const auto & lb : ctx->to_search) {
@ -1548,6 +1596,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
        }
    }
 }
+
 static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
    const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx;
    return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data());
@ -1579,3 +1628,31 @@ struct llama_sampler * llama_sampler_init_logit_bias(
        },
    };
 }
+
+// utils
+
+uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
+    if (smpl->iface == &llama_sampler_dist_i) {
+        return ((const llama_sampler_dist *) smpl->ctx)->seed_cur;
+    }
+
+    if (smpl->iface == &llama_sampler_mirostat_i) {
+        return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur;
+    }
+
+    if (smpl->iface == &llama_sampler_mirostat_v2_i) {
+        return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur;
+    }
+
+    if (smpl->iface == &llama_sampler_chain_i) {
+        const auto * ctx = (const llama_sampler_chain *) smpl->ctx;
+        for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) {
+            const uint32_t seed = llama_sampler_get_seed(*it);
+            if (seed != LLAMA_DEFAULT_SEED) {
+                return seed;
+            }
+        }
+    }
+
+    return LLAMA_DEFAULT_SEED;
+}
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -9258,7 +9258,7 @@ static struct ggml_tensor * llm_build_copy_mask_state(
    // FIXME: zero-out NANs?
    states = ggml_mul(ctx, states, state_mask);

-    // copy states which won't be changed further (between n_seqs and n_rs)
+    // copy states which won't be changed further (between n_seqs and n_kv)
    ggml_build_forward_expand(graph,
        ggml_cpy(ctx,
            ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)),
@ -16076,19 +16076,21 @@ static int llama_decode_internal(
        return -1;
    }

-    for (uint32_t i = 0; i < n_tokens_all; ++i) {
-        if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
-            return -1;
-        }
-    }
-
    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

    GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT

+    if (batch_all.token) {
+        for (uint32_t i = 0; i < n_tokens_all; ++i) {
+            if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+                return -1;
+            }
+        }
+    }
+
    GGML_ASSERT(n_tokens_all <= cparams.n_batch);

    GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@ -16375,19 +16377,21 @@ static int llama_encode_internal(
        return -1;
    }

-    for (uint32_t i = 0; i < n_tokens; ++i) {
-        if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
-            return -1;
-        }
-    }
-
    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

+    if (batch.token) {
+        for (uint32_t i = 0; i < n_tokens; ++i) {
+            if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+                return -1;
+            }
+        }
+    }
+
    // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
    GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");

@ -17530,6 +17534,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        quantize &= name.find("time_mix_first.weight") == std::string::npos;
        quantize &= name.find("time_mix_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;

        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@ -1,3 +1,6 @@
+#include "arg.h"
+#include "common.h"
+
 #include <string>
 #include <vector>
 #include <sstream>
@ -6,18 +9,16 @@
 #undef NDEBUG
 #include <cassert>

-#include "common.h"
-
 int main(void) {
    gpt_params params;

    printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
    for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
        try {
-            auto options = gpt_params_parser_init(params, (enum llama_example)ex);
+            auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
            std::unordered_set<std::string> seen_args;
            std::unordered_set<std::string> seen_env_vars;
-            for (const auto & opt : options) {
+            for (const auto & opt : ctx_arg.options) {
                // check for args duplications
                for (const auto & arg : opt.args) {
                    if (seen_args.find(arg) == seen_args.end()) {
@ -52,40 +53,51 @@ int main(void) {
    };

    std::vector<std::string> argv;
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);

    printf("test-arg-parser: test invalid usage\n\n");

+    // missing value
    argv = {"binary_name", "-m"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));

+    // wrong value (int)
    argv = {"binary_name", "-ngl", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));

+    // wrong value (enum)
    argv = {"binary_name", "-sm", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
+    // non-existence arg in specific example (--draft cannot be used outside llama-speculative)
+    argv = {"binary_name", "--draft", "123"};
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));


    printf("test-arg-parser: test valid usage\n\n");

    argv = {"binary_name", "-m", "model_file.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
    assert(params.model == "model_file.gguf");

    argv = {"binary_name", "-t", "1234"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
    assert(params.cpuparams.n_threads == 1234);

    argv = {"binary_name", "--verbose"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
    assert(params.verbosity == 1);

    argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
    assert(params.model == "abc.gguf");
    assert(params.n_predict == 6789);
    assert(params.n_batch == 9090);

+    // --draft cannot be used outside llama-speculative
+    argv = {"binary_name", "--draft", "123"};
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
+    assert(params.n_draft == 123);
+
 // skip this part on windows, because setenv is not supported
 #ifdef _WIN32
    printf("test-arg-parser: skip on windows build\n");
@ -94,12 +106,12 @@ int main(void) {

    setenv("LLAMA_ARG_THREADS", "blah", true);
    argv = {"binary_name"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));

    setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
    setenv("LLAMA_ARG_THREADS", "1010", true);
    argv = {"binary_name"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
    assert(params.model == "blah.gguf");
    assert(params.cpuparams.n_threads == 1010);

@ -109,7 +121,7 @@ int main(void) {
    setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
    setenv("LLAMA_ARG_THREADS", "1010", true);
    argv = {"binary_name", "-m", "overwritten.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
    assert(params.model == "overwritten.gguf");
    assert(params.cpuparams.n_threads == 1010);
 #endif // _WIN32
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@ -245,7 +245,7 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
        }
    }

-    printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n",
+    printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n",
           samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
 }
Author	SHA1	Message	Date
Michael Podvitskiy	0bc8f72979	Merge `a0bd8f0343` into `1b28061400`	2024-09-11 21:27:30 +02:00
slaren	1b28061400	llama : skip token bounds check when evaluating embeddings (#9437 ) Some checks are pending Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run Details Nix CI / nix-eval (macos-latest) (push) Waiting to run Details Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run Details Nix CI / nix-build (macos-latest) (push) Waiting to run Details Nix CI / nix-build (ubuntu-latest) (push) Waiting to run Details Python check requirements.txt / check-requirements (push) Waiting to run Details flake8 Lint / Lint (push) Waiting to run Details Python Type-Check / pyright type-check (push) Waiting to run Details	2024-09-11 17:52:13 +02:00
Pavel Zloi	8db003a19d	py : support converting local models (#7547 ) * Support of converting local models added to convert-hf-to-gguf-update.py * Description fixed * shutil added to imports	2024-09-11 15:29:51 +03:00
Xuan Son Nguyen	0996c5597f	llava : correct args for minicpmv-cli (#9429 )	2024-09-11 12:59:13 +02:00
Xuan Son Nguyen	5bb2c5dbd2	files : remove accidentally added `lora_test` submodule (#9430 )	2024-09-11 13:02:09 +03:00
Farbod Bijary	67155ab7f5	feat: Implements retrying logic for downloading models using --model-url flag (#9255 ) * feat: Implements retrying logic for downloading models using --model-url flag * Update common/common.cpp Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> * Update common/common.cpp Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> * apply comments * implements a retry function to avoid duplication * fix editorconfig * change function name --------- Co-authored-by: farbod <farbod.bjary82@gmail.com> Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> Co-authored-by: slaren <slarengh@gmail.com> Co-authored-by: Xuan Son Nguyen <son@huggingface.co>	2024-09-11 11:22:37 +02:00
Johannes Gäßler	5af118efda	CUDA: fix --split-mode row race condition (#9413 )	2024-09-11 10:22:40 +02:00
Georgi Gerganov	d2b496bff4	batched-bench : remove unused code (#9305 )	2024-09-11 10:03:54 +03:00
R0CKSTAR	b34e023480	musa: remove Clang builtins mapping (#9421 ) Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>	2024-09-11 03:46:55 +02:00
Alberto Cabrera Pérez	51b6038636	sycl : update support conditions (#9394 ) * sycl : update support condition to im2col Signed-off-by: Alberto Cabrera <alberto.cabrera@codeplay.com> * Added TODO to remind supporting FP32 im2col --------- Signed-off-by: Alberto Cabrera <alberto.cabrera@codeplay.com>	2024-09-11 08:53:42 +08:00
Georgi Gerganov	cb9c933eb2	flake.lock: Update (#9360 ) Flake lock file updates: • Updated input 'flake-parts': 'github:hercules-ci/flake-parts/af510d4a62d071ea13925ce41c95e3dec816c01d?narHash=sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E%3D' (2024-08-30) → 'github:hercules-ci/flake-parts/567b938d64d4b4112ee253b9274472dc3a346eb6?narHash=sha256-%2Bebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y%3D' (2024-09-01) • Updated input 'flake-parts/nixpkgs-lib': '`a5d394176e`.tar.gz?narHash=sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q%3D' (2024-08-01) → '`356624c120`.tar.gz?narHash=sha256-Ss8QWLXdr2JCBPcYChJhz4xJm%2Bh/xjl4G0c0XlP6a74%3D' (2024-09-01) • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/71e91c409d1e654808b2621f28a327acfdad8dc2?narHash=sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w%3D' (2024-08-28) → 'github:NixOS/nixpkgs/574d1eac1c200690e27b8eb4e24887f8df7ac27c?narHash=sha256-v3rIhsJBOMLR8e/RNWxr828tB%2BWywYIoajrZKFM%2B0Gg%3D' (2024-09-06) Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>	2024-09-10 15:46:59 -07:00
Xuan Son Nguyen	6cd4e03444	arg : bring back missing ifdef (#9411 ) * arg : bring back missing ifdef * replace with llama_supports_gpu_offload	2024-09-10 22:41:29 +02:00
matteo	8d300bd35f	enable --special arg for llama-server (#9419 ) Co-authored-by: matteo serva <matteo.serva@gmail.com>	2024-09-10 22:40:59 +02:00
slaren	49006c67b4	llama : move random seed generation to the samplers (#9398 ) * llama_sampler_penalties : clamp penalty_last_n to zero	2024-09-10 18:04:25 +02:00
Georgi Gerganov	00ba2ff781	metal : fix compile warning with GGML_METAL_NDEBUG (#0 )	2024-09-10 10:17:43 +03:00
Daniel Bevenius	83008b7cfe	llama : update llm_build_copy_mask_state comment [no ci] (#9385 ) This commit updates the comment, which seems to contain a typo or be an outdated comment, in the copy_mask_state function changing the variable n_rs to n_kv. I believe this change is correct and what the comment wants to convey is to copy the states that are not going to be used in the upcoming processing, which are the tokens states from n_seqs up to the number of possible token states n_kv.	2024-09-10 10:03:21 +03:00
Molly Sophia	0b4ac75772	RWKV v6: Add time_mix_decay_w1/w2 in quant exclusion list (#9387 ) Signed-off-by: Molly Sophia <mollysophia379@gmail.com>	2024-09-10 10:02:30 +03:00
slaren	fb3f249815	make : do not run llama-gen-docs when building (#9399 )	2024-09-10 09:23:33 +03:00
Xuan Son Nguyen	bfe76d4a17	common : move arg parser code to `arg.cpp` (#9388 ) * common : move arg parser to arg.cpp * better categorize args * add cmake * missing climits * missing cstdarg * common : more explicit includes * fix build * refactor gpt_params_parse * update server readme * fix test --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-09-09 23:36:09 +02:00
Radoslav Gerganov	293bebe077	rpc : fix segfault with nkvo (#9389 ) * rpc : fix nkvo * rpc : buf_size must not be static ref: #9337 --------- Co-authored-by: slaren <slarengh@gmail.com>	2024-09-09 18:40:10 +03:00
Prashant Vithule	5fac4d5764	ggml : vector length agnostic SVE support (#9290 ) * Implemented vector length agnostic SVE using switch case for 512-bit, 256-bit, 128-bit vector lengths * Implemented vector length agnostic SVE using switch case for 512-bit, 256-bit, 128-bit vector lengths * Removed WhiteSpaces * ggml : style changes + fix 512-bit nb loop check - fix local scope in switch cases - consistent predicate names - empty lines when necessary - opening braces, spaces - const-correctness - add asserts * Update ggml/src/ggml-quants.c Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-09-09 18:37:18 +03:00
slaren	5fb5e24811	llama : minor sampling refactor (2) (#9386 )	2024-09-09 17:10:46 +02:00
Michael Podvitskiy	a0bd8f0343	a way to process CMAKE_OSX_ARCHITECTURES as a list	2024-09-05 22:49:18 +02:00