Compare commits

...

13 Commits

Author SHA1 Message Date
Georgi Gerganov
133fe809f7
Merge a97b3621cf into 924518e2e5
2025-01-12 23:45:46 +05:00
Eric Curtin
924518e2e5
Reset color before we exit (#11205)
We don't want colors to leak post termination of llama-run.

Signed-off-by: Eric Curtin <ecurtin@redhat.com>
2025-01-12 18:23:10 +00:00
Georgi Gerganov
a97b3621cf
ggml : ggml_backend_graph_copy -> ggml_backend_graph_copy_state
ggml-ci
2025-01-12 17:57:51 +02:00
Georgi Gerganov
afd40ea206
minor : better names
ggml-ci
2025-01-12 17:22:16 +02:00
Georgi Gerganov
36803b1902
common : cont
ggml-ci
2025-01-12 16:53:44 +02:00
Georgi Gerganov
a59ee7c4eb
common : cont
ggml-ci
2025-01-12 16:20:24 +02:00
Georgi Gerganov
10eb87409e
shadow : cont gcc
ggml-ci
2025-01-12 16:09:49 +02:00
Georgi Gerganov
f65e3d324d
ggml : ggml_backend_graph_copy -> ggml_backend_graph_copy_init
2025-01-12 15:34:48 +02:00
Georgi Gerganov
439e68c1e5
cmake : re-enable GCC -Wshadow
ggml-ci
2025-01-12 15:29:33 +02:00
Georgi Gerganov
34889bf810
cmake : cont
ggml-ci
2025-01-12 15:11:59 +02:00
Xuan Son Nguyen
9a483999a6
llama : fix chat template gguf key (#11201)
2025-01-12 13:45:14 +01:00
Georgi Gerganov
e159e7751c
cmake : disable -Wshadow for GCC
ggml-ci
2025-01-12 14:35:29 +02:00
Georgi Gerganov
9a735ae6d8
examplse : de-shadow
ggml-ci
2025-01-12 14:25:32 +02:00
38 changed files with 763 additions and 763 deletions

View File

@ -13,7 +13,15 @@ function(llama_add_compile_flags)
list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-Werror=implicit-int -Werror=implicit-function-declaration)
list(APPEND CXX_FLAGS -Wshadow -Wmissing-declarations -Wmissing-noreturn)
list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
list(APPEND CXX_FLAGS -Wshadow)
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
list(APPEND CXX_FLAGS -Wshadow-field-in-constructor)
endif()
endif()
list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
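The hunk above re-enables -Wshadow for both GCC and Clang, and additionally turns on -Wshadow-field-in-constructor for Clang so that both compilers flag the same constructor pattern. A minimal sketch of what these flags report, using hypothetical names, to give the renames in the rest of this compare some context:

```cpp
// Build with:  g++ -Wshadow -c shadow_demo.cpp
//         or:  clang++ -Wshadow -Wshadow-field-in-constructor -c shadow_demo.cpp
struct widget {
    int size;

    // GCC reports this with plain -Wshadow; Clang only reports the
    // constructor case with the extra -Wshadow-field-in-constructor flag
    widget(int size) : size(size) {}

    int scaled(int factor) const {
        int size = this->size * factor;   // local shadows the member
        return size;
    }
};

int grow(int n) {
    int total = 0;
    for (int i = 0; i < n; ++i) {
        int n = i + 1;                    // local shadows the parameter 'n'
        total += n;
    }
    return total;
}
```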

File diff suppressed because it is too large

View File

@ -25,43 +25,43 @@ struct common_arg {
void (*handler_int) (common_params & params, int) = nullptr;
common_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
const std::initializer_list<const char *> & args_,
const char * value_hint_,
const std::string & help_,
void (*handler)(common_params & params, const std::string &)
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
) : args(args_), value_hint(value_hint_), help(help_), handler_string(handler) {}
common_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
const std::initializer_list<const char *> & args_,
const char * value_hint_,
const std::string & help_,
void (*handler)(common_params & params, int)
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
) : args(args_), value_hint(value_hint_), help(help_), handler_int(handler) {}
common_arg(
const std::initializer_list<const char *> & args,
const std::string & help,
const std::initializer_list<const char *> & args_,
const std::string & help_,
void (*handler)(common_params & params)
) : args(args), help(help), handler_void(handler) {}
) : args(args_), help(help_), handler_void(handler) {}
// support 2 values for arg
common_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const char * value_hint_2,
const std::string & help,
const std::initializer_list<const char *> & args_,
const char * value_hint_,
const char * value_hint_2_,
const std::string & help_,
void (*handler)(common_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
) : args(args_), value_hint(value_hint_), value_hint_2(value_hint_2_), help(help_), handler_str_str(handler) {}
common_arg & set_examples(std::initializer_list<enum llama_example> examples);
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
common_arg & set_env(const char * env);
common_arg & set_examples(std::initializer_list<enum llama_example> vals);
common_arg & set_excludes(std::initializer_list<enum llama_example> vals);
common_arg & set_env(const char * val);
common_arg & set_sparam();
bool in_example(enum llama_example ex);
bool is_exclude(enum llama_example ex);
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();
bool get_value_from_env(std::string & output) const;
bool has_value_from_env() const;
std::string to_string() const;
};
struct common_params_context {
@ -69,7 +69,7 @@ struct common_params_context {
common_params & params;
std::vector<common_arg> options;
void(*print_usage)(int, char **) = nullptr;
common_params_context(common_params & params) : params(params) {}
common_params_context(common_params & params_) : params(params_) {}
};
// parse input arguments from CLI
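The constructor changes above follow the convention used throughout this compare: the parameter gets a trailing underscore while the member keeps its original name, so the member-initializer list no longer trips -Wshadow. A small before/after sketch with hypothetical names:

```cpp
#include <string>

// Before: parameters share the members' names, so GCC's -Wshadow fires even
// though the member-initializer list itself is unambiguous.
struct option_before {
    std::string name;
    std::string help;
    option_before(const std::string & name, const std::string & help)
        : name(name), help(help) {}
};

// After: the PR's convention, a trailing underscore on the parameter, keeps
// the public member names unchanged and silences the warning without 'this->'.
struct option_after {
    std::string name;
    std::string help;
    option_after(const std::string & name_, const std::string & help_)
        : name(name_), help(help_) {}
};
```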

View File

@ -763,9 +763,11 @@ bool fs_create_directory_with_parents(const std::string & path) {
return true;
#else
// if the path already exists, check whether it's a directory
struct stat info;
if (stat(path.c_str(), &info) == 0) {
return S_ISDIR(info.st_mode);
{
struct stat info;
if (stat(path.c_str(), &info) == 0) {
return S_ISDIR(info.st_mode);
}
}
size_t pos_slash = 1; // skip leading slashes for directory creation
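The extra braces above limit the first struct stat to its own block, so the second struct stat info declared later in the same function (per path component) is no longer a shadow. A sketch of the pattern in isolation, with a hypothetical function name, covering only the POSIX branch:

```cpp
#include <sys/stat.h>
#include <string>

// Sketch of the scoping pattern from fs_create_directory_with_parents():
// the first 'info' lives only inside its own block, so a later declaration
// of the same name further down the function is not a shadow.
static bool is_dir_or_absent(const std::string & path) {
    {
        struct stat info;
        if (stat(path.c_str(), &info) == 0) {
            return S_ISDIR(info.st_mode);   // path exists: is it a directory?
        }
    }
    // ... the real function goes on to declare another 'struct stat info'
    // for each path component; with the block above the names no longer collide.
    return true;
}
```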
@ -796,7 +798,7 @@ bool fs_create_directory_with_parents(const std::string & path) {
}
std::string fs_get_cache_directory() {
std::string cache_directory = "";
std::string cache_directory;
auto ensure_trailing_slash = [](std::string p) {
// Make sure to add trailing slash
if (p.back() != DIRECTORY_SEPARATOR) {
@ -1206,7 +1208,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
{
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
common_load_model_from_url_headers * cur = (common_load_model_from_url_headers *) userdata;
static std::regex header_regex("([^:]+): (.*)\r\n");
static std::regex etag_regex("ETag", std::regex_constants::icase);
@ -1218,9 +1220,9 @@ static bool common_download_file(const std::string & url, const std::string & pa
const std::string & key = match[1];
const std::string & value = match[2];
if (std::regex_match(key, match, etag_regex)) {
headers->etag = value;
cur->etag = value;
} else if (std::regex_match(key, match, last_modified_regex)) {
headers->last_modified = value;
cur->last_modified = value;
}
}
return n_items;
@ -1292,18 +1294,18 @@ static bool common_download_file(const std::string & url, const std::string & pa
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
// helper function to hide password in URL
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
std::size_t protocol_pos = url.find("://");
auto llama_download_hide_password_in_url = [](const std::string & url_full) -> std::string {
std::size_t protocol_pos = url_full.find("://");
if (protocol_pos == std::string::npos) {
return url; // Malformed URL
return url_full; // Malformed URL
}
std::size_t at_pos = url.find('@', protocol_pos + 3);
std::size_t at_pos = url_full.find('@', protocol_pos + 3);
if (at_pos == std::string::npos) {
return url; // No password in URL
return url_full; // No password in URL
}
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
return url_full.substr(0, protocol_pos + 3) + "********" + url_full.substr(at_pos);
};
// start the download
@ -1636,15 +1638,8 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
//
std::string common_get_builtin_chat_template(const struct llama_model * model) {
static const char * template_key = "tokenizer.chat_template";
// call with NULL buffer to get the total size of the string
int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
if (res > 0) {
std::vector<char> model_template(res + 1, 0);
llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
return std::string(model_template.data(), model_template.size() - 1);
}
return "";
const char * ptr_tmpl = llama_model_chat_template(model);
return ptr_tmpl == nullptr ? "" : ptr_tmpl;
}
bool common_chat_verify_template(const std::string & tmpl) {
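common_get_builtin_chat_template() now asks the dedicated accessor instead of the generic metadata API, which required the usual two-step size-then-fill call. A hedged sketch of both patterns, assuming the llama.h declarations visible in this diff:

```cpp
#include <string>
#include <vector>
#include "llama.h"

// Old pattern: generic GGUF metadata lookup, sized in two steps.
static std::string chat_template_via_metadata(const llama_model * model) {
    static const char * key = "tokenizer.chat_template";
    const int32_t n = llama_model_meta_val_str(model, key, nullptr, 0); // length only
    if (n <= 0) {
        return "";
    }
    std::vector<char> buf(n + 1, 0);
    llama_model_meta_val_str(model, key, buf.data(), buf.size());
    return std::string(buf.data(), buf.size() - 1);
}

// New pattern: dedicated accessor, may return nullptr when no template is set.
static std::string chat_template_direct(const llama_model * model) {
    const char * tmpl = llama_model_chat_template(model);
    return tmpl == nullptr ? "" : tmpl;
}
```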

View File

@ -43,7 +43,7 @@ namespace console {
static bool simple_io = true;
static display_t current_display = reset;
static FILE* out = stdout;
static FILE* fout = stdout;
#if defined (_WIN32)
static void* hConsole;
@ -110,7 +110,7 @@ namespace console {
tty = fopen("/dev/tty", "w+");
if (tty != nullptr) {
out = tty;
fout = tty;
}
}
@ -126,7 +126,7 @@ namespace console {
// Restore settings on POSIX systems
if (!simple_io) {
if (tty != nullptr) {
out = stdout;
fout = stdout;
fclose(tty);
tty = nullptr;
}
@ -145,19 +145,19 @@ namespace console {
fflush(stdout);
switch(display) {
case reset:
fprintf(out, ANSI_COLOR_RESET);
fprintf(fout, ANSI_COLOR_RESET);
break;
case prompt:
fprintf(out, ANSI_COLOR_YELLOW);
fprintf(fout, ANSI_COLOR_YELLOW);
break;
case user_input:
fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
fprintf(fout, ANSI_BOLD ANSI_COLOR_GREEN);
break;
case error:
fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
fprintf(fout, ANSI_BOLD ANSI_COLOR_RED);
}
current_display = display;
fflush(out);
fflush(fout);
}
}
@ -233,7 +233,7 @@ namespace console {
return;
}
#endif
putc('\b', out);
putc('\b', fout);
}
static int estimateWidth(char32_t codepoint) {
@ -274,7 +274,7 @@ namespace console {
#else
// We can trust expectedWidth if we've got one
if (expectedWidth >= 0 || tty == nullptr) {
fwrite(utf8_codepoint, length, 1, out);
fwrite(utf8_codepoint, length, 1, fout);
return expectedWidth;
}
@ -311,7 +311,7 @@ namespace console {
pop_cursor();
put_codepoint(&ch, 1, 1);
#else
fprintf(out, "\b%c", ch);
fprintf(fout, "\b%c", ch);
#endif
}
@ -353,7 +353,7 @@ namespace console {
}
static bool readline_advanced(std::string & line, bool multiline_input) {
if (out != stdout) {
if (fout != stdout) {
fflush(stdout);
}
@ -364,7 +364,7 @@ namespace console {
char32_t input_char;
while (true) {
fflush(out); // Ensure all output is displayed before waiting for input
fflush(fout); // Ensure all output is displayed before waiting for input
input_char = getchar32();
if (input_char == '\r' || input_char == '\n') {
@ -432,7 +432,7 @@ namespace console {
line.pop_back();
if (last == '\\') {
line += '\n';
fputc('\n', out);
fputc('\n', fout);
has_more = !has_more;
} else {
// llama will just eat the single space, it won't act as a space
@ -447,11 +447,11 @@ namespace console {
has_more = false;
} else {
line += '\n';
fputc('\n', out);
fputc('\n', fout);
}
}
fflush(out);
fflush(fout);
return has_more;
}

View File

@ -579,8 +579,8 @@ private:
seq.back().second = false;
} else {
std::string literal;
auto is_non_literal = [&](char c) {
return NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end();
auto is_non_literal = [&](char ch) {
return NON_LITERAL_SET.find(ch) != NON_LITERAL_SET.end();
};
while (i < length) {
if (sub_pattern[i] == '\\' && i < length - 1) {

View File

@ -255,8 +255,8 @@ public:
thrd = std::thread([this]() {
while (true) {
{
std::unique_lock<std::mutex> lock(mtx);
cv.wait(lock, [this]() { return head != tail; });
std::unique_lock<std::mutex> lock_thrd(mtx);
cv.wait(lock_thrd, [this]() { return head != tail; });
cur = entries[head];
@ -338,16 +338,16 @@ public:
resume();
}
void set_prefix(bool prefix) {
void set_prefix(bool val) {
std::lock_guard<std::mutex> lock(mtx);
this->prefix = prefix;
prefix = val;
}
void set_timestamps(bool timestamps) {
void set_timestamps(bool val) {
std::lock_guard<std::mutex> lock(mtx);
this->timestamps = timestamps;
timestamps = val;
}
};
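The setters above used to take a parameter named after the member and disambiguate with this->; the diff renames the parameter to a neutral val instead. A sketch of the resulting shape, with hypothetical names:

```cpp
#include <mutex>

// Sketch of the setter change in the log helper: the parameter no longer
// shares the member's name, so no shadow warning and no 'this->' needed.
class logger_state {
    std::mutex mtx;
    bool prefix = false;
public:
    void set_prefix(bool val) {
        std::lock_guard<std::mutex> lock(mtx);
        prefix = val;
    }
};
```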

View File

@ -62,7 +62,7 @@ int main(int argc, char ** argv) {
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
// decode in batches of ctx_params.n_batch tokens
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
auto decode_helper = [&ctx, &batch](int32_t n_batch) {
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
@ -94,7 +94,7 @@ int main(int argc, char ** argv) {
common_batch_add(batch, 0, i, { 0 }, false);
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
if (!decode_helper(ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
@ -134,7 +134,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
if (!decode_helper(ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
@ -156,7 +156,7 @@ int main(int argc, char ** argv) {
common_batch_add(batch, 0, pp + i, { j }, true);
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
if (!decode_helper(ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
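decode_helper previously took ctx and batch as parameters, which shadowed the outer locals of the same names; the diff switches it to capture them by reference and pass only the per-call batch size. A generic sketch of the same refactor, with hypothetical stand-in types:

```cpp
#include <cstdint>
#include <vector>

struct fake_ctx   { int id; };                 // hypothetical stand-ins
struct fake_batch { std::vector<int> tokens; };

int process_all(fake_ctx & ctx, fake_batch & batch) {
    // capture the outer 'ctx' and 'batch' by reference instead of re-declaring
    // parameters with the same names (which -Wshadow would flag), and only take
    // the value that actually varies per call
    auto decode_helper = [&ctx, &batch](int32_t n_batch) -> bool {
        for (int32_t i = 0; i < (int32_t) batch.tokens.size(); i += n_batch) {
            (void) ctx;   // the real helper calls llama_decode(ctx, ...) here
        }
        return true;
    };
    return decode_helper(32) ? 0 : 1;
}
```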

View File

@ -471,12 +471,12 @@ struct my_llama_file {
GGML_ASSERT(ret == 0); // same
}
void read_raw(void * ptr, size_t size) {
if (size == 0) {
void read_raw(void * raw_addr, size_t raw_size) {
if (raw_size == 0) {
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, size, 1, fp);
std::size_t ret = std::fread(raw_addr, raw_size, 1, fp);
if (ferror(fp)) {
die_fmt("fread failed: %s", strerror(errno));
}

View File

@ -66,7 +66,7 @@ struct file_input {
float alpha;
float scale;
file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
file_input(std::string & fname, float scale_): f_in(fname, std::ios::binary), scale(scale_) {
if (!f_in.is_open()) {
throw std::runtime_error("failed to open input gguf from " + fname);
}
@ -131,7 +131,7 @@ struct lora_merge_ctx {
std::string & base_fname,
std::vector<common_adapter_lora_info> & lora_files,
std::string & outfile,
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
int n_threads_) : base_model(base_fname, 0), n_threads(n_threads_), fout(outfile, std::ios::binary) {
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
@ -157,7 +157,7 @@ struct lora_merge_ctx {
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
}
void check_metadata_lora(file_input * adapter) {
void check_metadata_lora(const file_input * adapter) const {
auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
if (general_type != "adapter") {
throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
@ -175,7 +175,7 @@ struct lora_merge_ctx {
}
}
ggml_type get_out_tensor_type(struct ggml_tensor * t) {
static ggml_type get_out_tensor_type(struct ggml_tensor * t) {
if (t->type == GGML_TYPE_F32) {
return GGML_TYPE_F32;
} else {

View File

@ -60,13 +60,6 @@ int main(int argc, char** argv) {
const std::string grammar_filename = argv[1];
const std::string input_filename = argv[2];
// Read the GBNF grammar file
FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
if (!grammar_file) {
fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
return 1;
}
std::string grammar_str;
{
std::ifstream grammar_file(grammar_filename);

View File

@ -204,14 +204,14 @@ struct split_strategy {
// temporary buffer for reading in tensor data
std::vector<uint8_t> read_buf;
split_strategy(const split_params & params,
std::ifstream & f_input,
struct gguf_context * ctx_gguf,
struct ggml_context * ctx_meta) :
params(params),
f_input(f_input),
ctx_gguf(ctx_gguf),
ctx_meta(ctx_meta),
split_strategy(const split_params & params_,
std::ifstream & f_input_,
struct gguf_context * ctx_gguf_,
struct ggml_context * ctx_meta_) :
params(params_),
f_input(f_input_),
ctx_gguf(ctx_gguf_),
ctx_meta(ctx_meta_),
n_tensors(gguf_get_n_tensors(ctx_gguf)) {
// because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits

View File

@ -294,7 +294,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
bool IMatrixCollector::load_imatrix(const char * fname) {
std::ifstream in(fname, std::ios::binary);
if (!in) {
LOG_ERR("%s: failed to open %s\n",__func__, fname);
LOG_ERR("%s: failed to open %s\n", __func__, fname);
return false;
}
int n_entries;
@ -308,7 +308,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname);
return false;
}
name_as_vec[len] = 0;
@ -319,7 +319,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i);
m_stats = {};
return false;
}
@ -332,15 +332,15 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
std::vector<float> tmp(nval);
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
LOG_ERR("%s: failed reading data for entry %d\n", __func__, i);
m_stats = {};
return false;
}
// Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
for (int i = 0; i < nval; i++) {
e.values[i] += tmp[i];
e.counts[i] += ncall;
for (int j = 0; j < nval; j++) {
e.values[j] += tmp[j];
e.counts[j] += ncall;
}
e.ncall += ncall;
@ -488,12 +488,10 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
logits.reserve((size_t)n_ctx * n_vocab);
}
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
for (int ich = 0; ich < n_chunk; ++ich) {
const int start = ich * n_ctx;
const int end = start + n_ctx;
std::vector<float> logits;
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
@ -537,7 +535,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
if (ich == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
@ -555,7 +553,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
LOG("[%d]%.4lf,", ich + 1, std::exp(nll / count));
fflush(stdout);
logits.clear();

View File

@ -462,14 +462,14 @@ int main(int argc, char ** argv) {
}
// tokenize new prefix and suffix
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
std::vector<llama_token> inp_pfx_cur = common_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx_cur = common_tokenize(ctx, params.input_suffix, false);
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
inp_pfx_cur.insert(inp_pfx_cur.begin(), llama_vocab_fim_pre(vocab));
inp_sfx_cur.insert(inp_sfx_cur.begin(), llama_vocab_fim_suf(vocab));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
embd_inp = params.spm_infill ? inp_sfx_cur : inp_pfx_cur;
embd_end = params.spm_infill ? inp_pfx_cur : inp_sfx_cur;
if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
}

View File

@ -548,11 +548,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
GGML_ASSERT(split_arg.size() <= llama_max_devices());
std::vector<float> tensor_split(llama_max_devices());
for (size_t i = 0; i < llama_max_devices(); ++i) {
if (i < split_arg.size()) {
tensor_split[i] = std::stof(split_arg[i]);
for (size_t is = 0; is < llama_max_devices(); ++is) {
if (is < split_arg.size()) {
tensor_split[is] = std::stof(split_arg[is]);
} else {
tensor_split[i] = 0.0f;
tensor_split[is] = 0.0f;
}
}
params.tensor_split.push_back(tensor_split);

View File

@ -1039,41 +1039,40 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}
{ // attention
int hidden_size = 4096;
const int d_head = 128;
int n_head = hidden_size/d_head;
int hidden_size_cur = 4096;
int num_query = 96;
if (ctx->minicpmv_version == 2) {
hidden_size = 4096;
n_head = hidden_size/d_head;
hidden_size_cur = 4096;
num_query = 96;
}
else if (ctx->minicpmv_version == 3) {
hidden_size = 3584;
n_head = hidden_size/d_head;
hidden_size_cur = 3584;
num_query = 64;
}
const int d_head_cur = 128;
const int n_head_cur = hidden_size_cur/d_head_cur;
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head_cur));
struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
// permute
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
Q = ggml_reshape_4d(ctx0, Q, d_head_cur, n_head_cur, num_query, batch_size);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
Q = ggml_reshape_3d(ctx0, Q, d_head_cur, num_query, n_head_cur * batch_size);
K = ggml_reshape_4d(ctx0, K, d_head_cur, n_head_cur, num_positions, batch_size);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
K = ggml_reshape_3d(ctx0, K, d_head_cur, num_positions, n_head_cur * batch_size);
V = ggml_reshape_4d(ctx0, V, d_head_cur, n_head_cur, num_positions, batch_size);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
V = ggml_reshape_3d(ctx0, V, num_positions, d_head_cur, n_head_cur * batch_size);
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_soft_max_inplace(ctx0, KQ);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
KQV = ggml_reshape_4d(ctx0, KQV, d_head_cur, num_query, n_head_cur, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
KQV = ggml_cont_3d(ctx0, KQV, hidden_size_cur, num_query, batch_size);
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
}
@ -1113,12 +1112,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
struct ggml_context * meta = NULL;
struct gguf_init_params params = {
struct gguf_init_params params_meta = {
/*.no_alloc = */ true,
/*.ctx = */ &meta,
};
struct gguf_context * ctx = gguf_init_from_file(fname, params);
struct gguf_context * ctx = gguf_init_from_file(fname, params_meta);
if (!ctx) {
throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
}
@ -1310,13 +1309,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
// load tensors
{
std::vector<uint8_t> read_buf;
struct ggml_init_params params = {
struct ggml_init_params params_data = {
/*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
new_clip->ctx_data = ggml_init(params);
new_clip->ctx_data = ggml_init(params_data);
if (!new_clip->ctx_data) {
LOG_ERR("%s: ggml_init() failed\n", __func__);
clip_free(new_clip);
@ -2083,7 +2082,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
}
else if (ctx->has_qwen2vl_merger) {
clip_image_u8 * resized = clip_image_u8_init();
auto patch_size = clip_patch_size(ctx) * 2;
auto patch_size = clip_get_patch_size(ctx) * 2;
int nx = ceil((float)img->nx / patch_size) * patch_size;
int ny = ceil((float)img->ny / patch_size) * patch_size;
bicubic_resize(*img, *resized, nx, ny);
@ -2294,15 +2293,15 @@ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w
return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
int32_t clip_image_size(const struct clip_ctx * ctx) {
int32_t clip_get_image_size(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.image_size;
}
int32_t clip_patch_size(const struct clip_ctx * ctx) {
int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.patch_size;
}
int32_t clip_hidden_size(const struct clip_ctx * ctx) {
int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.hidden_size;
}

View File

@ -47,9 +47,9 @@ CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
// TODO: should be enum, not string
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);

View File

@ -105,8 +105,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
struct ggml_context * ctx;
} model;
const int32_t image_size = clip_image_size(ctx_clip);
const int32_t patch_size = clip_patch_size(ctx_clip);
const int32_t image_size = clip_get_image_size(ctx_clip);
const int32_t patch_size = clip_get_patch_size(ctx_clip);
int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
@ -353,7 +353,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
img_res_v.size = 0;
img_res_v.data = nullptr;
const int32_t image_size = clip_image_size(ctx_clip);
const int32_t image_size = clip_get_image_size(ctx_clip);
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);

View File

@ -348,8 +348,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.ppl_stride;
for (int ich = 0; ich < n_chunk; ++ich) {
const int start = ich * params.ppl_stride;
const int end = start + calc_chunk;
const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
@ -400,7 +400,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
if (ich == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
@ -427,9 +427,9 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
}
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
LOG("[%d]%.4lf,", ich + 1, std::exp(nll / count));
} else {
LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
LOG("%8d %.4lf\n", ich*params.ppl_stride, std::exp(nll / count));
}
}
LOG("\n");
@ -659,7 +659,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
int prev_outputs = 0;
for (int i = 0; i < (int) batch.n_tokens; i += n_batch) {
for (int i = 0; i < batch.n_tokens; i += n_batch) {
const int n_tokens = std::min<int>(n_batch, batch.n_tokens - i);
llama_batch batch_view = {
@ -679,8 +679,8 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
}
int n_outputs = 0;
for (int i = 0; i < n_tokens; ++i) {
n_outputs += batch_view.logits[i] != 0;
for (int iv = 0; iv < n_tokens; ++iv) {
n_outputs += batch_view.logits[iv] != 0;
}
memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float));
@ -1752,14 +1752,14 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
auto kld_ptr = kld_values.data();
auto p_diff_ptr = p_diff_values.data();
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
for (int ich = 0; ich < n_chunk; ++ich) {
const int start = ich * n_ctx;
const int end = start + n_ctx;
const auto t_start = std::chrono::high_resolution_clock::now();
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, ich);
return;
}
@ -1804,7 +1804,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
if (ich == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
@ -1824,7 +1824,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
p_diff_ptr += n_ctx - 1 - first;
kld_ptr += n_ctx - 1 - first;
LOG("%4d", i+1);
LOG("%4d", ich + 1);
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first);

View File

@ -3,3 +3,12 @@ add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
# TMP
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
target_compile_options(${TARGET} PRIVATE -Wno-shadow)
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
target_compile_options(${TARGET} PRIVATE -Wno-shadow-field-in-constructor)
endif()
endif()

View File

@ -29,7 +29,7 @@
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
[[noreturn]] static void sigint_handler(int) {
printf("\n");
printf("\n\033[0m");
exit(0); // not ideal, but it's the only way to guarantee exit in all cases
}
#endif
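This is the llama-run change from commit 924518e2e5: the SIGINT handler now emits the ANSI reset sequence so colors do not leak into the shell after termination. A small stand-alone sketch of the same pattern (a hypothetical program, not the real llama-run code):

```cpp
#include <csignal>
#include <cstdio>
#include <cstdlib>

// Always emit the ANSI reset before exiting so an interrupted run does not
// leave the terminal stuck in whatever color was last printed.
[[noreturn]] static void sigint_handler(int) {
    std::printf("\n\033[0m");   // newline + "reset all attributes"
    std::exit(0);               // as in the original: not ideal, but guarantees exit
}

int main() {
    std::signal(SIGINT, sigint_handler);
    std::printf("\033[32mcolored output until Ctrl-C ...\033[0m\n");
    while (std::getchar() != EOF) { }   // idle; Ctrl-C triggers the handler
    return 0;
}
```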

View File

@ -122,9 +122,9 @@ struct slot_params {
samplers.emplace_back(common_sampler_type_to_str(sampler));
}
json lora = json::array();
for (size_t i = 0; i < this->lora.size(); ++i) {
lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
json json_lora = json::array();
for (size_t i = 0; i < lora.size(); ++i) {
json_lora.push_back({{"id", i}, {"scale", lora[i].scale}});
}
return json {
@ -167,7 +167,7 @@ struct slot_params {
{"speculative.p_min", speculative.p_min},
{"timings_per_token", timings_per_token},
{"post_sampling_probs", post_sampling_probs},
{"lora", lora},
{"lora", json_lora},
};
}
};
@ -200,7 +200,7 @@ struct server_task {
// used by SERVER_TASK_TYPE_SET_LORA
std::vector<common_adapter_lora_info> set_lora;
server_task(server_task_type type) : type(type) {}
server_task(server_task_type type_) : type(type_) {}
static slot_params params_from_json_cmpl(
const llama_context * ctx,
@ -1641,7 +1641,7 @@ struct server_context {
llama_context_params cparams_dft;
llama_batch batch = {};
llama_batch batch_main = {};
bool clean_kv_cache = true;
bool add_bos_token = true;
@ -1676,7 +1676,7 @@ struct server_context {
llama_batch_free(slot.batch_spec);
}
llama_batch_free(batch);
llama_batch_free(batch_main);
}
bool load_model(const common_params & params) {
@ -1797,7 +1797,7 @@ struct server_context {
const int32_t n_batch = llama_n_batch(ctx);
// only a single seq_id per token is needed
batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
batch_main = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
}
metrics.init();
@ -2655,7 +2655,7 @@ struct server_context {
}
// start populating the batch for this iteration
common_batch_clear(batch);
common_batch_clear(batch_main);
// track if given slot can be batched with slots already in the batch
server_slot * slot_batched = nullptr;
@ -2673,9 +2673,9 @@ struct server_context {
continue;
}
slot.i_batch = batch.n_tokens;
slot.i_batch = batch_main.n_tokens;
common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
common_batch_add(batch_main, slot.sampled, slot.n_past, { slot.id }, true);
slot.n_past += 1;
@ -2692,7 +2692,7 @@ struct server_context {
int32_t n_ubatch = llama_n_ubatch(ctx);
// next, batch any pending prompts without exceeding n_batch
if (params_base.cont_batching || batch.n_tokens == 0) {
if (params_base.cont_batching || batch_main.n_tokens == 0) {
for (auto & slot : slots) {
// check if we can batch this slot with the previous one
if (slot.is_processing()) {
@ -2858,7 +2858,7 @@ struct server_context {
// non-causal tasks require to fit the entire prompt in the physical batch
if (slot.is_non_causal()) {
// cannot fit the prompt in the current batch - will try next iter
if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
if (batch_main.n_tokens + slot.n_prompt_tokens > n_batch) {
continue;
}
}
@ -2878,11 +2878,11 @@ struct server_context {
slot.cache_tokens.resize(slot.n_past);
// add prompt tokens for processing in the current batch
while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
while (slot.n_past < slot.n_prompt_tokens && batch_main.n_tokens < n_batch) {
// without pooling, we want to output the embeddings for all the tokens in the batch
const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd);
common_batch_add(batch_main, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd);
if (slot.params.cache_prompt) {
slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@ -2892,13 +2892,13 @@ struct server_context {
slot.n_past++;
}
SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch_main.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
// entire prompt has been processed
if (slot.n_past == slot.n_prompt_tokens) {
slot.state = SLOT_STATE_DONE_PROMPT;
GGML_ASSERT(batch.n_tokens > 0);
GGML_ASSERT(batch_main.n_tokens > 0);
common_sampler_reset(slot.smpl);
@ -2908,27 +2908,27 @@ struct server_context {
}
// extract the logits only for the last token
batch.logits[batch.n_tokens - 1] = true;
batch_main.logits[batch_main.n_tokens - 1] = true;
slot.n_decoded = 0;
slot.i_batch = batch.n_tokens - 1;
slot.i_batch = batch_main.n_tokens - 1;
SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch_main.n_tokens);
}
}
if (batch.n_tokens >= n_batch) {
if (batch_main.n_tokens >= n_batch) {
break;
}
}
}
if (batch.n_tokens == 0) {
if (batch_main.n_tokens == 0) {
SRV_WRN("%s", "no tokens to decode\n");
return;
}
SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
SRV_DBG("decoding batch, n_tokens = %d\n", batch_main.n_tokens);
if (slot_batched) {
// make sure we're in the right embedding mode
@ -2938,17 +2938,17 @@ struct server_context {
}
// process the created batch of tokens
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
for (int32_t i_batch = 0; i_batch < batch_main.n_tokens; i_batch += n_batch) {
const int32_t n_tokens = std::min(n_batch, batch_main.n_tokens - i_batch);
llama_batch batch_view = {
n_tokens,
batch.token + i,
batch_main.token + i_batch,
nullptr,
batch.pos + i,
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
batch_main.pos + i_batch,
batch_main.n_seq_id + i_batch,
batch_main.seq_id + i_batch,
batch_main.logits + i_batch,
};
const int ret = llama_decode(ctx, batch_view);
@ -2957,7 +2957,7 @@ struct server_context {
if (ret != 0) {
if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size
SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i_batch = %d, n_batch = %d, ret = %d\n", i_batch, n_batch, ret);
for (auto & slot : slots) {
slot.release();
send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
@ -2967,15 +2967,15 @@ struct server_context {
// retry with half the batch size to try to find a free slot in the KV cache
n_batch /= 2;
i -= n_batch;
i_batch -= n_batch;
SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i_batch = %d, n_batch = %d, ret = %d\n", i_batch, n_batch, ret);
continue; // continue loop of n_batch
}
for (auto & slot : slots) {
if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
if (slot.i_batch < (int) i_batch || slot.i_batch >= (int) (i_batch + n_tokens)) {
continue; // continue loop of slots
}
@ -3001,7 +3001,7 @@ struct server_context {
continue; // continue loop of slots
}
const int tok_idx = slot.i_batch - i;
const int tok_idx = slot.i_batch - i_batch;
llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
@ -3687,8 +3687,8 @@ int main(int argc, char ** argv) {
} else {
// multiple results (multitask)
json arr = json::array();
for (auto & res : results) {
arr.push_back(res->to_json());
for (auto & result : results) {
arr.push_back(result->to_json());
}
res_ok(res, arr);
}
@ -3702,8 +3702,8 @@ int main(int argc, char ** argv) {
ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool {
json res_json = result->to_json();
if (res_json.is_array()) {
for (const auto & res : res_json) {
if (!server_sent_event(sink, "data", res)) {
for (const auto & item : res_json) {
if (!server_sent_event(sink, "data", item)) {
return false;
}
}
@ -3973,9 +3973,9 @@ int main(int argc, char ** argv) {
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
for (auto & res : results) {
GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
responses.push_back(res->to_json());
for (auto & result : results) {
GGML_ASSERT(dynamic_cast<server_task_result_embd*>(result.get()) != nullptr);
responses.push_back(result->to_json());
}
}, [&](const json & error_data) {
res_error(res, error_data);
@ -4063,9 +4063,9 @@ int main(int argc, char ** argv) {
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
for (auto & res : results) {
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
responses.push_back(res->to_json());
for (auto & result : results) {
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(result.get()) != nullptr);
responses.push_back(result->to_json());
}
}, [&](const json & error_data) {
res_error(res, error_data);
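Several loops in server.cpp rename their range-for variable from res to result (or item) because an HTTP response object named res is already in scope. A sketch of the pattern with hypothetical stand-in types:

```cpp
#include <string>
#include <vector>

struct http_response { std::string body; };                 // hypothetical stand-ins
struct task_result   { std::string to_json() const { return "{}"; } };

static void respond(http_response & res, const std::vector<task_result> & results) {
    std::string arr = "[";
    for (const auto & result : results) {    // was 'res', shadowing the response above
        arr += result.to_json();
        arr += ",";
    }
    if (arr.size() > 1) {
        arr.pop_back();                      // drop the trailing comma
    }
    arr += "]";
    res.body = arr;
}
```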

View File

@ -129,15 +129,15 @@ static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_
if (p.is_string()) {
auto s = p.template get<std::string>();
llama_tokens p;
llama_tokens ids;
if (first) {
p = common_tokenize(vocab, s, add_special, parse_special);
ids = common_tokenize(vocab, s, add_special, parse_special);
first = false;
} else {
p = common_tokenize(vocab, s, false, parse_special);
ids = common_tokenize(vocab, s, false, parse_special);
}
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
prompt_tokens.insert(prompt_tokens.end(), ids.begin(), ids.end());
} else {
if (first) {
first = false;

View File

@ -110,9 +110,8 @@ int main(int argc, char ** argv) {
llama_token new_token_id;
while (true) {
// check if we have enough space in the context to evaluate this batch
int n_ctx = llama_n_ctx(ctx);
int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
if (n_ctx_used + batch.n_tokens > n_ctx) {
if (n_ctx_used + batch.n_tokens > (int) llama_n_ctx(ctx)) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");
exit(0);

View File

@ -544,26 +544,26 @@ int main(int argc, char ** argv) {
for (int is = 0; is < (int) sa.size(); ++is) {
const llama_token id = cur_p->data[is].id;
const int s = sa[is];
const int sd = sa[is];
common_sampler_accept(drafts[s].smpl, id, true);
common_sampler_accept(drafts[sd].smpl, id, true);
drafts[s].tokens.push_back(id);
// save cur_p.data into drafts[s].dists
drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size});
drafts[sd].tokens.push_back(id);
// save cur_p.data into drafts[sd].dists
drafts[sd].dists.push_back({cur_p->data, cur_p->data + cur_p->size});
// add unique drafted tokens to the target batch
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
drafts[sd].i_batch_tgt.push_back(batch_tgt.n_tokens);
common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { sd }, true);
// add the token to the batch for batched decoding with the draft model
drafts[s].i_batch_dft = batch_dft.n_tokens;
drafts[sd].i_batch_dft = batch_dft.n_tokens;
common_batch_add(batch_dft, id, n_past_cur, { s }, true);
common_batch_add(batch_dft, id, n_past_cur, { sd }, true);
if (batch_tgt.n_tokens > n_draft) {
drafts[s].drafting = false;
drafts[sd].drafting = false;
}
}
}

View File

@ -323,7 +323,7 @@ extern "C" {
// Utils
//
struct ggml_backend_graph_copy {
struct ggml_backend_graph_copy_state {
ggml_backend_buffer_t buffer;
struct ggml_context * ctx_allocated;
struct ggml_context * ctx_unallocated;
@ -331,8 +331,8 @@ extern "C" {
};
// Copy a graph to a different backend
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
GGML_API struct ggml_backend_graph_copy_state ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy_state copy);
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

View File

@ -1724,7 +1724,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
}
}
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
struct ggml_backend_graph_copy_state ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
@ -1805,14 +1805,14 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
};
}
void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy_state copy) {
ggml_backend_buffer_free(copy.buffer);
ggml_free(copy.ctx_allocated);
ggml_free(copy.ctx_unallocated);
}
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
struct ggml_backend_graph_copy_state copy = ggml_backend_graph_copy(backend2, graph);
if (copy.buffer == NULL) {
return false;
}
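Background for the rename: the struct and the function were both called ggml_backend_graph_copy, which is legal (the header used the elaborated struct specifier throughout) but confusing, so the struct becomes ggml_backend_graph_copy_state. A tiny illustration of the original collision, with hypothetical names:

```cpp
#include <cstdio>

struct graph_copy { int n_nodes; };           // hypothetical stand-in for the ggml struct
struct graph_copy graph_copy(int n) {         // function with the same name: legal, but
    return { n };                             // every later use of the type needs 'struct'
}

int main() {
    struct graph_copy c = graph_copy(4);      // 'graph_copy c' alone would name the function
    std::printf("%d\n", c.n_nodes);
    return 0;
}
```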

View File

@ -55,7 +55,7 @@ struct llama_adapter_lora_weight {
}
llama_adapter_lora_weight() = default;
llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
llama_adapter_lora_weight(struct ggml_tensor * a_, struct ggml_tensor * b_) : a(a_), b(b_) {}
};
struct llama_adapter_lora {

View File

@ -178,7 +178,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat.template" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
@ -1443,7 +1443,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
};
LLM_KV::LLM_KV(llm_arch arch) : arch(arch) {}
LLM_KV::LLM_KV(llm_arch arch_) : arch(arch_) {}
std::string LLM_KV::operator()(llm_kv kv) const {
return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));

View File

@ -374,7 +374,7 @@ struct LLM_TN_IMPL {
};
struct LLM_TN {
LLM_TN(llm_arch arch) : arch(arch) {}
LLM_TN(llm_arch arch_) : arch(arch_) {}
llm_arch arch;

View File

@ -15,8 +15,8 @@
#include <set>
struct llama_context {
llama_context(const llama_model & model)
: model(model)
llama_context(const llama_model & model_)
: model(model_)
, t_start_us(model.t_start_us)
, t_load_us(model.t_load_us) {}

View File

@ -17,7 +17,7 @@ struct llama_logger_state {
static llama_logger_state g_logger_state;
time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
time_meas::time_meas(int64_t & t_acc_, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc_) {}
time_meas::~time_meas() {
if (t_start_us >= 0) {

View File

@ -454,8 +454,8 @@ struct llama_mlock::impl {
return (size_t) sysconf(_SC_PAGESIZE);
}
bool raw_lock(const void * addr_cur, size_t size_cur) const {
if (!mlock(addr_cur, size_cur)) {
bool raw_lock(const void * lock_addr, size_t lock_len) const {
if (!mlock(lock_addr, lock_len)) {
return true;
}
@ -475,12 +475,12 @@ struct llama_mlock::impl {
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
suggest = false;
}
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size_cur)) {
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + lock_len)) {
suggest = false;
}
LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
size_cur, size, errmsg, suggest ? MLOCK_SUGGESTION : "");
lock_len, size, errmsg, suggest ? MLOCK_SUGGESTION : "");
return false;
}
@ -535,7 +535,7 @@ struct llama_mlock::impl {
return (size_t) 65536;
}
bool raw_lock(const void * addr_cur, size_t size_cur) const {
bool raw_lock(const void * lock_addr, size_t lock_len) const {
LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
return false;
}

View File

@ -31,7 +31,7 @@ struct llama_model_loader {
ggml_tensor * tensor;
llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
llama_tensor_weight(const llama_file * file, uint16_t idx_, const struct gguf_context * gguf_ctx, ggml_tensor * tensor_) : idx(idx_), tensor(tensor_) {
const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
if (tensor_idx < 0) {
throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));

View File

@ -311,9 +311,9 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_m
ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
if (ggml_backend_split_buffer_type_fn) {
size_t dev_index = [&]() {
auto * reg = ggml_backend_dev_backend_reg(dev);
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
if (ggml_backend_reg_dev_get(reg, i) == dev) {
ggml_backend_reg_t reg_dev = ggml_backend_dev_backend_reg(dev);
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg_dev); ++i) {
if (ggml_backend_reg_dev_get(reg_dev, i) == dev) {
return i;
}
}
@ -369,7 +369,7 @@ struct llama_model::impl {
std::vector<layer_dev> dev_layer;
};
llama_model::llama_model(const struct llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
llama_model::llama_model(const struct llama_model_params & params_) : params(params_), pimpl(std::make_unique<impl>()) {
}
llama_model::~llama_model() {}
@ -1304,7 +1304,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
return {cpu_dev, &pimpl->cpu_buft_list};
return { cpu_dev, &pimpl->cpu_buft_list };
}
const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
auto * dev = devices.at(layer_gpu);
@ -1453,7 +1453,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// avoid using a host buffer when using mmap
auto * buft_dev = ggml_backend_buft_get_device(buft);
if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
buft = ggml_backend_dev_buffer_type(cpu_dev);
}
@ -3697,8 +3696,8 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
[name](const std::pair<std::string, struct ggml_tensor *> & it) {
return it.first == name;
[name](const std::pair<std::string, struct ggml_tensor *> & entry) {
return entry.first == name;
});
if (it == tensors_by_name.end()) {
return nullptr;

View File

@ -41,9 +41,9 @@ struct quantize_state_impl {
// used to figure out if a model shares tok_embd with the output weight
bool has_output = false;
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
quantize_state_impl(const llama_model & model_, const llama_model_quantize_params * params_)
: model(model_)
, params(params_)
{}
};
@ -130,17 +130,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
};
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name_layer) {
if (n_expert > 1) {
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
// for getting the current layer as I initially thought, and we need to resort to parsing the
// tensor name.
if (sscanf(name, "blk.%d.", &i_layer) != 1) {
throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
if (sscanf(name_layer, "blk.%d.", &i_layer) != 1) {
throw std::runtime_error(format("Failed to determine layer for tensor %s", name_layer));
}
if (i_layer < 0 || i_layer >= n_layer) {
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name_layer, n_layer));
}
}
return std::make_pair(i_layer, n_layer);

View File

@ -115,7 +115,7 @@ struct llm_tokenizer_spm : llm_tokenizer {
};
struct llm_tokenizer_spm_session {
llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
llm_tokenizer_spm_session(const llama_vocab & vocab_) : vocab(vocab_) {}
void tokenize(const std::string & text, std::vector<llama_token> & output) {
// split string into utf8 chars
@ -415,7 +415,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
};
struct llm_tokenizer_bpe_session {
llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
llm_tokenizer_bpe_session(const llama_vocab & vocab_, const llm_tokenizer_bpe & tokenizer_) : vocab(vocab_), tokenizer(tokenizer_) {}
static void append(const llama_token token_id, std::vector<llama_token> & output) {
output.push_back(token_id);
@ -603,7 +603,7 @@ struct llm_tokenizer_wpm : llm_tokenizer {
};
struct llm_tokenizer_wpm_session {
llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
llm_tokenizer_wpm_session(const llama_vocab & vocab_) : vocab(vocab_) {}
void tokenize(const std::string & text, std::vector<llama_token> & output) {
// normalize and split by whitespace
@ -782,7 +782,7 @@ struct llm_tokenizer_ugm : llm_tokenizer {
};
struct llm_tokenizer_ugm_session {
llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
llm_tokenizer_ugm_session(const llama_vocab & vocab_, const llm_tokenizer_ugm & tokenizer_) : vocab(vocab_), tokenizer(tokenizer_) {}
/* This implementation is based on SentencePiece optimized Viterbi algorithm for
* unigram language models. The general idea is to:
@ -949,7 +949,7 @@ private:
*/
struct xcda_array_view {
public:
xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
xcda_array_view(const uint32_t * xcda_array_, size_t xcda_array_size_) : xcda_array(xcda_array_), xcda_array_size(xcda_array_size_) {
}
uint32_t get_base(size_t index) {
uint32_t packed_node = get_node(index);
@ -1135,7 +1135,7 @@ struct llm_tokenizer_rwkv : llm_tokenizer {
};
struct llm_tokenizer_rwkv_session {
llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
llm_tokenizer_rwkv_session(const llama_vocab & vocab_, const llm_tokenizer_rwkv & tokenizer_) : vocab(vocab_), tokenizer(tokenizer_) {}
void tokenize(const std::string & text, std::vector<llama_token> & output) {
uint32_t position = 0;
@ -1262,7 +1262,7 @@ struct llama_vocab::impl {
std::vector<char> precompiled_charsmap;
impl(const llama_vocab & vocab) : vocab(vocab) {
impl(const llama_vocab & vocab_) : vocab(vocab_) {
}
~impl() = default;
@ -2496,15 +2496,15 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
// copy piece chars to output text buffer
// skip up to 'lstrip' leading spaces before copying
auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
token++;
auto _try_copy = [=] (const char * text, size_t size) -> int32_t {
for (int32_t i = 0; i < lstrip && size && *text == ' '; ++i) {
text++;
size--;
}
if (length < (int32_t)size) {
return -(int32_t) size;
}
memcpy(buf, token, size);
memcpy(buf, text, size);
return (int32_t) size;
};

View File

@ -1089,16 +1089,16 @@ struct llm_build_context {
// TODO: consider making the entire interface noexcept
llm_build_context(
llama_context & lctx,
const llama_ubatch & ubatch,
const llm_build_cb & cb,
llama_context & lctx_,
const llama_ubatch & ubatch_,
const llm_build_cb & cb_,
bool worst_case) :
model (lctx.model),
lctx (lctx),
model (lctx_.model),
lctx (lctx_),
hparams (model.hparams),
cparams (lctx.cparams),
ubatch (ubatch),
kv_self (lctx.kv_self),
cparams (lctx_.cparams),
ubatch (ubatch_),
kv_self (lctx_.kv_self),
n_embd (hparams.n_embd),
n_layer (hparams.n_layer),
n_rot (hparams.n_rot),
@ -1119,17 +1119,17 @@ struct llm_build_context {
beta_slow (cparams.yarn_beta_slow),
norm_eps (hparams.f_norm_eps),
norm_rms_eps (hparams.f_norm_rms_eps),
n_tokens (ubatch.n_tokens),
n_tokens (ubatch_.n_tokens),
n_kv (worst_case ? kv_self.size : kv_self.n),
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd),
n_outputs (worst_case ? n_tokens : lctx_.n_outputs),
n_outputs_enc (worst_case ? n_tokens : lctx_.embd_enc.size() / hparams.n_embd),
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
n_ctx_orig (cparams.n_ctx_orig_yarn),
flash_attn (cparams.flash_attn),
pooling_type (cparams.pooling_type),
rope_type (hparams.rope_type),
cb (cb),
buf_compute_meta (lctx.buf_compute_meta) {
cb (cb_),
buf_compute_meta (lctx_.buf_compute_meta) {
// all initializations should be done in init()
}