Compare commits


34 Commits

Author SHA1 Message Date
piDack
917dd369a0
Merge 63b6e73500 into 67155ab7f5 2024-09-11 11:22:41 +02:00
Farbod Bijary
67155ab7f5
feat: Implements retrying logic for downloading models using --model-url flag (#9255)
* feat: Implements retrying logic for downloading models using --model-url flag

* Update common/common.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* Update common/common.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* apply comments

* implements a retry function to avoid duplication

* fix editorconfig

* change function name

---------

Co-authored-by: farbod <farbod.bjary82@gmail.com>
Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2024-09-11 11:22:37 +02:00
Johannes Gäßler
5af118efda
CUDA: fix --split-mode row race condition (#9413) 2024-09-11 10:22:40 +02:00
Georgi Gerganov
d2b496bff4
batched-bench : remove unused code (#9305) 2024-09-11 10:03:54 +03:00
R0CKSTAR
b34e023480
musa: remove Clang builtins mapping (#9421)
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
2024-09-11 03:46:55 +02:00
Alberto Cabrera Pérez
51b6038636
sycl : update support conditions (#9394)
* sycl : update support condition for im2col

Signed-off-by: Alberto Cabrera <alberto.cabrera@codeplay.com>

* Added TODO to remind supporting FP32 im2col

---------

Signed-off-by: Alberto Cabrera <alberto.cabrera@codeplay.com>
2024-09-11 08:53:42 +08:00
Georgi Gerganov
cb9c933eb2
flake.lock: Update (#9360)
Flake lock file updates:

• Updated input 'flake-parts':
    'github:hercules-ci/flake-parts/af510d4a62d071ea13925ce41c95e3dec816c01d?narHash=sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E%3D' (2024-08-30)
  → 'github:hercules-ci/flake-parts/567b938d64d4b4112ee253b9274472dc3a346eb6?narHash=sha256-%2Bebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y%3D' (2024-09-01)
• Updated input 'flake-parts/nixpkgs-lib':
    'a5d394176e.tar.gz?narHash=sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q%3D' (2024-08-01)
  → '356624c120.tar.gz?narHash=sha256-Ss8QWLXdr2JCBPcYChJhz4xJm%2Bh/xjl4G0c0XlP6a74%3D' (2024-09-01)
• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/71e91c409d1e654808b2621f28a327acfdad8dc2?narHash=sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w%3D' (2024-08-28)
  → 'github:NixOS/nixpkgs/574d1eac1c200690e27b8eb4e24887f8df7ac27c?narHash=sha256-v3rIhsJBOMLR8e/RNWxr828tB%2BWywYIoajrZKFM%2B0Gg%3D' (2024-09-06)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
2024-09-10 15:46:59 -07:00
Xuan Son Nguyen
6cd4e03444
arg : bring back missing ifdef (#9411)
* arg : bring back missing ifdef

* replace with llama_supports_gpu_offload
2024-09-10 22:41:29 +02:00
matteo
8d300bd35f
enable --special arg for llama-server (#9419)
Co-authored-by: matteo serva <matteo.serva@gmail.com>
2024-09-10 22:40:59 +02:00
slaren
49006c67b4
llama : move random seed generation to the samplers (#9398)
* llama_sampler_penalties : clamp penalty_last_n to zero
2024-09-10 18:04:25 +02:00
Georgi Gerganov
00ba2ff781
metal : fix compile warning with GGML_METAL_NDEBUG (#0) 2024-09-10 10:17:43 +03:00
Daniel Bevenius
83008b7cfe
llama : update llm_build_copy_mask_state comment [no ci] (#9385)
This commit updates a comment in the copy_mask_state function that seems to
contain a typo or to be outdated, changing the variable n_rs to n_kv.

I believe this change is correct: what the comment wants to convey is that
the states which are not going to be used in the upcoming processing are
copied, i.e. the token states from n_seqs up to the number of possible
token states n_kv.
2024-09-10 10:03:21 +03:00
Molly Sophia
0b4ac75772
RWKV v6: Add time_mix_decay_w1/w2 in quant exclusion list (#9387)
Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
2024-09-10 10:02:30 +03:00
slaren
fb3f249815
make : do not run llama-gen-docs when building (#9399) 2024-09-10 09:23:33 +03:00
pidack
63b6e73500 recommit for ci pass 2024-08-29 11:17:12 +08:00
pidack
99f2ac1a9d Merge branch 'master' of github.com:ggerganov/llama.cpp into mfalcon_mamba_cuda 2024-08-29 10:36:51 +08:00
pidack
316a049533 add restrict for dst 2024-08-29 10:36:33 +08:00
pidack
5999d6d06e fix conflicts 2024-08-28 09:49:17 +08:00
pidack
0e682ced5e add restrict 2024-08-27 20:54:39 +08:00
pidack
eec0e8ca81 memory access pattern 2024-08-27 20:51:26 +08:00
pidack
e53b14f152 del debug info 2024-08-27 19:33:28 +08:00
pidack
21c16fa5ed fix trailing whitespace 2024-08-27 19:10:57 +08:00
pidack
1928967874 resolve test-backend-ops conflicts 2024-08-27 17:31:40 +08:00
pidack
40f47872b3 Merge branch 'master' of github.com:ggerganov/llama.cpp into mfalcon_mamba_cuda 2024-08-27 17:08:23 +08:00
pidack
b423a6df5e fix ssm_scan numerical error & other updates 2024-08-27 16:51:21 +08:00
pidack
8dd323b496 Merge branch 'master' of github.com:ggerganov/llama.cpp into mfalcon_mamba_cuda 2024-08-27 09:44:18 +08:00
pidack
20d390bea4 10x performance improvement for CUDA ssm conv & scan 2024-08-26 17:33:23 +08:00
Jan Ploski
fae826fb56 Fix failed assertions while running Falcon Mamba 2024-08-25 14:57:47 +02:00
Jan Ploski
061e520075 Update CUDA ops and tests to match implementation from commit 8fb57ac0 (llama : use im2col and mul_mat to perform convolution for Mamba); GPU version breaks with assert because of unsupported MUL_MAT 2024-08-25 00:19:37 +02:00
Jan Ploski
12c913c52c Fix backend test for ssm_conv CUDA op not working 2024-08-24 23:43:42 +02:00
Jan Ploski
64fbd320ef Add patch to test cases provided by @compilade; test for ssm_conv fails 2024-08-24 23:43:36 +02:00
Jan Ploski
25f9e65d3a Update CUDA ops ssm_conv and ssm_scan to match CPU implementation from PR #7531 (as per eb589d5e) 2024-08-24 23:43:30 +02:00
Jan Ploski
cc365b045b Add GGML_OP_SSM_CONV, GGML_OP_SSM_SCAN to supported ops for CUDA backend + test case for each op 2024-08-24 23:43:24 +02:00
Jan Ploski
f809568fa1 Add initial/naive CUDA kernels for the GGML_OP_SSM_CONV and GGML_OP_SSM_SCAN ops 2024-08-24 23:43:10 +02:00
28 changed files with 432 additions and 152 deletions

View File

@ -1454,7 +1454,6 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
./llama-gen-docs
libllava.a: examples/llava/llava.cpp \
examples/llava/llava.h \

View File

@ -173,7 +173,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
std::string arg;
const std::string arg_prefix = "--";
gpt_params & params = ctx_arg.params;
gpt_sampler_params & sparams = params.sparams;
std::unordered_map<std::string, llama_arg *> arg_to_options;
for (auto & opt : ctx_arg.options) {
@ -283,10 +282,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
params.kv_overrides.back().key[0] = 0;
}
if (sparams.seed == LLAMA_DEFAULT_SEED) {
sparams.seed = time(NULL);
}
return true;
}
@ -823,7 +818,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
[](gpt_params & params) {
params.special = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
add_opt(llama_arg(
{"-cnv", "--conversation"},
format(
@ -909,7 +904,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
).set_sparam());
add_opt(llama_arg(
{"-s", "--seed"}, "SEED",
format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed),
format("RNG seed (default: %u, use random seed for %u)", params.sparams.seed, LLAMA_DEFAULT_SEED),
[](gpt_params & params, const std::string & value) {
params.sparams.seed = std::stoul(value);
}
@ -1422,20 +1417,18 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.split_mode = LLAMA_SPLIT_MODE_NONE;
} else if (arg_next == "layer") {
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
}
else if (arg_next == "row") {
} else if (arg_next == "row") {
#ifdef GGML_USE_SYCL
fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
exit(1);
#endif // GGML_USE_SYCL
params.split_mode = LLAMA_SPLIT_MODE_ROW;
}
else {
} else {
throw std::invalid_argument("invalid value");
}
#ifndef GGML_USE_CUDA_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUDA_SYCL_VULKAN
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
}
}
));
add_opt(llama_arg(
@ -1455,14 +1448,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
}
for (size_t i = 0; i < llama_max_devices(); ++i) {
if (i < split_arg.size()) {
params.tensor_split[i] = std::stof(split_arg[i]);
params.tensor_split[i] = std::stof(split_arg[i]);
} else {
params.tensor_split[i] = 0.0f;
params.tensor_split[i] = 0.0f;
}
}
#ifndef GGML_USE_CUDA_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUDA_SYCL_VULKAN
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
}
}
));
add_opt(llama_arg(
@ -1470,9 +1463,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
[](gpt_params & params, int value) {
params.main_gpu = value;
#ifndef GGML_USE_CUDA_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUDA_SYCL_VULKAN
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
}
}
));
add_opt(llama_arg(

View File

@ -56,14 +56,6 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
#define GGML_USE_CUDA_SYCL
#endif
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
#define GGML_USE_CUDA_SYCL_VULKAN
#endif
#if defined(LLAMA_USE_CURL)
#ifdef __linux__
#include <linux/limits.h>
@ -949,11 +941,37 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#ifdef LLAMA_USE_CURL
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2
static bool starts_with(const std::string & str, const std::string & prefix) {
// While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts;
while (remaining_attempts > 0) {
fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
CURLcode res = curl_easy_perform(curl);
if (res == CURLE_OK) {
return true;
}
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
remaining_attempts--;
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
}
fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
return false;
}
static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
@ -1057,9 +1075,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
return false;
}
@ -1134,11 +1151,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
};
// start the download
fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
auto res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
return false;
}
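As an aside on the schedule above: with the constants defined in this diff (CURL_MAX_RETRY = 3, CURL_RETRY_DELAY_SECONDS = 2), curl_perform_with_retry sleeps retry_delay_seconds^attempt seconds after each failure, i.e. 1 s, 2 s, 4 s. A minimal standalone sketch of that schedule (illustration only, not part of the patch; main() is hypothetical):

#include <cmath>
#include <cstdio>

int main() {
    const int max_attempts        = 3; // CURL_MAX_RETRY in the diff above
    const int retry_delay_seconds = 2; // CURL_RETRY_DELAY_SECONDS
    for (int attempt = 0; attempt < max_attempts; ++attempt) {
        // same formula as curl_perform_with_retry: 2^attempt seconds, in ms
        const int delay_ms = (int) std::pow(retry_delay_seconds, attempt) * 1000;
        printf("failure %d -> sleep %d ms before retrying\n", attempt + 1, delay_ms);
    }
    return 0; // prints 1000, 2000, 4000
}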

View File

@ -310,6 +310,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
return cur_p.data[cur_p.selected].id;
}
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
return llama_sampler_get_seed(gsmpl->chain);
}
// helpers
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {

View File

@ -60,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
//
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
// helpers
// access the internal list of current candidate tokens

View File

@ -302,6 +302,8 @@ class Model:
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
gguf.MODEL_TENSOR.TIME_MIX_W1,
gguf.MODEL_TENSOR.TIME_MIX_W2,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
)
)
or not new_name.endswith(".weight")

View File

@ -3,32 +3,10 @@
#include "llama.h"
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
// mutates the input string
static std::vector<int> parse_list(char * p) {
std::vector<int> ret;
char * q = p;
while (*p) {
if (*p == ',') {
*p = '\0';
ret.push_back(std::atoi(q));
q = p + 1;
}
++p;
}
ret.push_back(std::atoi(q));
return ret;
}
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);

View File

@ -90,8 +90,6 @@ int main(int argc, char ** argv) {
print_build_info();
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
llama_backend_init();
llama_numa_init(params.numa);

View File

@ -159,8 +159,6 @@ int main(int argc, char ** argv) {
print_build_info();
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
LOG("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
@ -301,6 +299,9 @@ int main(int argc, char ** argv) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
}
}
smpl = gpt_sampler_init(model, sparams);
LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
@ -340,8 +341,6 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
smpl = gpt_sampler_init(model, sparams);
while (n_remain != 0 || params.interactive) {
// predict
if (!embd.empty()) {

View File

@ -191,8 +191,6 @@ int main(int argc, char ** argv) {
print_build_info();
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
LOG("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
@ -470,8 +468,10 @@ int main(int argc, char ** argv) {
exit(1);
}
LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
// group-attention state

View File

@ -2007,8 +2007,6 @@ int main(int argc, char ** argv) {
print_build_info();
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
llama_backend_init();
llama_numa_init(params.numa);

View File

@ -1266,6 +1266,7 @@ struct server_context {
{"n_predict", slot.n_predict}, // Server configured n_predict
{"model", params.model_alias},
{"seed", slot.sparams.seed},
{"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0},
{"temperature", slot.sparams.temp},
{"dynatemp_range", slot.sparams.dynatemp_range},
{"dynatemp_exponent", slot.sparams.dynatemp_exponent},

View File

@ -5,11 +5,11 @@
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1725024810,
"narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
"lastModified": 1725234343,
"narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
"rev": "567b938d64d4b4112ee253b9274472dc3a346eb6",
"type": "github"
},
"original": {
@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1724819573,
"narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
"lastModified": 1725634671,
"narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
"rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c",
"type": "github"
},
"original": {
@ -36,14 +36,14 @@
},
"nixpkgs-lib": {
"locked": {
"lastModified": 1722555339,
"narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
"lastModified": 1725233747,
"narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
},
"original": {
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
}
},
"root": {

View File

@ -32,7 +32,8 @@
#include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh"
#include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/ssm_conv.cuh"
#include "ggml-cuda/ssm_scan.cuh"
#include <algorithm>
#include <array>
#include <atomic>
@ -2321,6 +2322,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_FLASH_ATTN_EXT:
ggml_cuda_flash_attn_ext(ctx, dst);
break;
case GGML_OP_SSM_CONV:
ggml_cuda_op_ssm_conv(ctx, dst);
break;
case GGML_OP_SSM_SCAN:
ggml_cuda_op_ssm_scan(ctx, dst);
break;
case GGML_OP_CROSS_ENTROPY_LOSS:
ggml_cuda_cross_entropy_loss(ctx, dst);
break;
@ -2922,6 +2928,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
case GGML_OP_SSM_CONV:
case GGML_OP_SSM_SCAN:
return true;
case GGML_OP_FLASH_ATTN_EXT:
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

View File

@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q(
// nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
// Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
switch (src0->type) {
case GGML_TYPE_Q4_0:

View File

@ -2742,6 +2742,7 @@ struct mmq_args {
int64_t ne00; int64_t ne01; int64_t stride01;
int64_t ne10; int64_t ne11; int64_t stride11;
int64_t ne0;
bool use_stream_k;
};
template<ggml_type type>
@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
const dim3 block_nums_xy_tiling(nty, ntx, 1);
const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
if (!use_stream_k) {
if (!args.use_stream_k) {
if (args.ne01 % mmq_y == 0) {
constexpr bool need_check = false;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>

View File

@ -153,9 +153,9 @@ static void group_norm_f32_cuda(const float * x, float * dst, const int num_grou
}
static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
GGML_ASSERT(ncols % WARP_SIZE == 0 || ncols < WARP_SIZE);
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_dims(min(ncols, WARP_SIZE), 1, 1);
rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
} else {
const dim3 block_dims(1024, 1, 1);

View File

@ -0,0 +1,100 @@
#include "ssm_conv.cuh"
template <int block_size>
static __global__ void ssm_conv_f32(
const float * __restrict__ src0, const float * __restrict__ src1,
const int src0_nb0, const int src0_nb1, const int src0_nb2,
const int src1_nb1,
float * __restrict__ dst,
const int dst_nb0, const int dst_nb1, const int dst_nb2,
const int nc, const int ncs, const int nr, const int n_t, const int n_s) {
const int tid = blockIdx.y;
const int i3 = blockIdx.x;
const int i2 = threadIdx.x;
const int ith = tid;
const int nth = WARP_SIZE;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = min(ir0 + dr, nr);
const int ir = ir1 - ir0;
// {d_conv - 1 + n_t, d_inner, n_seqs}
// sliding window
const float * s = (const float *) ((const char *) src0 + ir0*src0_nb1 + i2*src0_nb0 + i3*src0_nb2); // {d_conv, d_inner, n_s}
const float * c = (const float *) ((const char *) src1 + ir0*src1_nb1); // {d_conv, d_inner}
float * x = (float *) ((char *) dst + ir0*dst_nb0 + i2*dst_nb1 + i3*dst_nb2); // {d_inner, n_t, n_s}
// TODO: transpose the output for smaller strides for big batches?
// d_inner
for (int i1 = 0; i1 < ir; ++i1) {
// rowwise dot product
// NOTE: not using ggml_vec_dot_f32, because its sum is in double precision
float sumf = 0.0f;
// d_conv
#pragma unroll
for (int i0 = 0; i0 < nc; ++i0) {
sumf += s[i0 + i1*ncs] * c[i0 + i1*nc];
}
x[i1] = sumf;
}
}
static void ssm_conv_f32_cuda(
const float * src0, const float * src1,
const int src0_nb0, const int src0_nb1, const int src0_nb2,
const int src1_nb1,
float * dst,
const int dst_nb0, const int dst_nb1, const int dst_nb2,
const int nc, const int ncs, const int nr, const int n_t, const int n_s,
cudaStream_t stream) {
const dim3 block_dims(n_t, 1, 1);
//const int nblocks = n_s; // TODO
const dim3 grid_dims(n_s, WARP_SIZE, 1);
ssm_conv_f32<WARP_SIZE><<<grid_dims, block_dims, 0, stream>>>(
src0, src1,
src0_nb0, src0_nb1, src0_nb2,
src1_nb1,
dst,
dst_nb0, dst_nb1, dst_nb2,
nc, ncs, nr, n_t, n_s);
}
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0]; // conv_x
const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight
const int nc = src1->ne[0]; // d_conv
const int ncs = src0->ne[0]; // d_conv - 1 + n_t
const int nr = src0->ne[1]; // d_inner
const int n_t = dst->ne[1]; // tokens per sequence
const int n_s = dst->ne[2]; // number of sequences in the batch
GGML_ASSERT( dst->ne[0] == nr);
GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT(src1->nb[0] == sizeof(float));
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
const float * src0_d = (const float *)src0->data;
const float * src1_d = (const float *)src1->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
ssm_conv_f32_cuda(src0_d, src1_d,
src0->nb[0], src0->nb[1], src0->nb[2],
src1->nb[1],
dst_d,
dst->nb[0], dst->nb[1], dst->nb[2],
nc, ncs, nr, n_t, n_s,
stream);
}
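For orientation: per sequence (blockIdx.x) the kernel splits the d_inner rows across blockIdx.y and the tokens across threadIdx.x, and each output element is a sliding-window dot product over the d_conv taps. A contiguous-layout CPU sketch of the same computation (illustration only; the real ggml tensors are strided via nb[*], and this is not the actual CPU implementation):

// nc = d_conv, nr = d_inner, n_t = tokens per sequence, n_s = sequences
static void ssm_conv_ref(const float * src0, // {d_conv - 1 + n_t, d_inner, n_s}
                         const float * src1, // {d_conv, d_inner} conv weights
                         float * dst,        // {d_inner, n_t, n_s}
                         int nc, int nr, int n_t, int n_s) {
    const int ncs = nc - 1 + n_t; // row stride of the sliding-window input
    for (int i3 = 0; i3 < n_s; ++i3) {        // sequences
        for (int i2 = 0; i2 < n_t; ++i2) {    // tokens (window offset)
            for (int i1 = 0; i1 < nr; ++i1) { // d_inner rows
                float sumf = 0.0f;
                for (int i0 = 0; i0 < nc; ++i0) { // d_conv taps
                    sumf += src0[(i3*nr + i1)*ncs + i2 + i0] * src1[i1*nc + i0];
                }
                dst[(i3*n_t + i2)*nr + i1] = sumf;
            }
        }
    }
}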

View File

@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@ -0,0 +1,144 @@
#include "ssm_scan.cuh"
template <int block_size>
static __global__ void ssm_scan_f32(
const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2, const float * __restrict__ src3,
const float * __restrict__ src4, const float * __restrict__ src5,
const int src0_nb1, const int src0_nb2,
const int src1_nb0, const int src1_nb1, const int src1_nb2, const int src1_nb3,
const int src2_nb0, const int src2_nb1, const int src2_nb2,
const int src3_nb1,
const int src4_nb1, const int src4_nb2,
const int src5_nb1, const int src5_nb2,
float * __restrict__ dst,
const int nc, const int nr, const int n_t, const int n_s) {
// const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
const int i3 = threadIdx.y;
const int ith = tid;
const int nth = WARP_SIZE;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = min(ir0 + dr, nr);
const int ir = ir1 - ir0;
for (int i2 = 0; i2 < n_t; ++i2) {
const float * s0 = (const float *) ((const char *) src0 + ir0*src0_nb1 + i3*src0_nb2); // {d_state, d_inner, n_s}
const float * x = (const float *) ((const char *) src1 + ir0*src1_nb0 + i2*src1_nb1 + i3*src1_nb2); // {d_inner, n_t, n_s}
const float * dt = (const float *) ((const char *) src2 + ir0*src2_nb0 + i2*src2_nb1 + i3*src2_nb2); // {d_inner, n_t, n_s}
const float * A = (const float *) ((const char *) src3 + ir0*src3_nb1); // {d_state, d_inner}
const float * B = (const float *) ((const char *) src4 + i2*src4_nb1 + i3*src4_nb2); // {d_state, n_t, n_s}
const float * C = (const float *) ((const char *) src5 + i2*src5_nb1 + i3*src5_nb2); // {d_state, n_t, n_s}
float * y = (float *) ((char *) dst + ir0*src1_nb0 + i2*src1_nb1 + i3*src1_nb2); // {d_inner, n_t, n_s}
float * s = (float *) ((char *) dst + ir0*src0_nb1 + i3*src0_nb2 + src1_nb3); // {d_state, d_inner, n_s}
// use the output as the source for the next token-wise iterations
if (i2 > 0) { s0 = s; }
// d_inner
for (int i1 = 0; i1 < ir; ++i1) {
// ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
float x_dt = x[i1] * dt_soft_plus;
float sumf = 0.0f;
// d_state
#pragma unroll
for (int i0 = 0; i0 < nc; ++i0) {
int i = i0 + i1*nc;
// state = prev_state * dA + dB * x
float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
// y = rowwise_dotprod(state, C)
sumf += state * C[i0];
s[i] = state;
}
y[i1] = sumf;
}
}
}
static void ssm_scan_f32_cuda(
const float * src0, const float * src1, const float * src2, const float * src3,
const float * src4, const float * src5,
const int src0_nb1, const int src0_nb2,
const int src1_nb0, const int src1_nb1, const int src1_nb2, const int src1_nb3,
const int src2_nb0, const int src2_nb1, const int src2_nb2,
const int src3_nb1,
const int src4_nb1, const int src4_nb2,
const int src5_nb1, const int src5_nb2,
float * dst,
const int nc, const int nr, const int n_t, const int n_s,
cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, n_s, 1);
const int nblocks = 1; // TODO
ssm_scan_f32<WARP_SIZE><<<nblocks, block_dims, 0, stream>>>(
src0, src1, src2, src3,
src4, src5,
src0_nb1, src0_nb2,
src1_nb0, src1_nb1, src1_nb2, src1_nb3,
src2_nb0, src2_nb1, src2_nb2,
src3_nb1,
src4_nb1, src4_nb2,
src5_nb1, src5_nb2,
dst,
nc, nr, n_t, n_s);
}
void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0]; // s
const struct ggml_tensor * src1 = dst->src[1]; // x
const struct ggml_tensor * src2 = dst->src[2]; // dt
const struct ggml_tensor * src3 = dst->src[3]; // A
const struct ggml_tensor * src4 = dst->src[4]; // B
const struct ggml_tensor * src5 = dst->src[5]; // C
const int64_t nc = src0->ne[0]; // d_state
const int64_t nr = src0->ne[1]; // d_inner
const int64_t n_t = src1->ne[1]; // number of tokens per sequence
const int64_t n_s = src0->ne[2]; // number of sequences in the batch
GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT(src1->nb[0] == sizeof(float));
GGML_ASSERT(src2->nb[0] == sizeof(float));
GGML_ASSERT(src3->nb[0] == sizeof(float));
GGML_ASSERT(src4->nb[0] == sizeof(float));
GGML_ASSERT(src5->nb[0] == sizeof(float));
// required for the dot product between s and C
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
// required for per-sequence offsets for states
GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
// required to get correct offset for state destination (i.e. src1->nb[3])
GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float));
const float * src0_d = (const float *)src0->data;
const float * src1_d = (const float *)src1->data;
const float * src2_d = (const float *)src2->data;
const float * src3_d = (const float *)src3->data;
const float * src4_d = (const float *)src4->data;
const float * src5_d = (const float *)src5->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
ssm_scan_f32_cuda(
src0_d, src1_d, src2_d, src3_d,
src4_d, src5_d,
src0->nb[1], src0->nb[2],
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
src2->nb[0], src2->nb[1], src2->nb[2],
src3->nb[1],
src4->nb[1], src4->nb[2],
src5->nb[1], src5->nb[2],
dst_d,
nc, nr, n_t, n_s,
stream);
}
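The recurrence itself, reading off the kernel: dt goes through a softplus, the state decays by exp(dt*A) while accumulating B*x*dt, and the output token is the dot product of the updated state with C. Because each token depends on the previous state, the token loop stays sequential (rows are split across threadIdx.x, sequences across threadIdx.y). A single-row, contiguous-layout CPU sketch (illustration only):

#include <cmath>

// nc = d_state, n_t = tokens; state: [nc], carried across tokens;
// x, dt, y: [n_t]; A: [nc]; B, C: [nc * n_t] (one d_state vector per token)
static void ssm_scan_row_ref(float * state, const float * x, const float * dt,
                             const float * A, const float * B, const float * C,
                             float * y, int nc, int n_t) {
    for (int t = 0; t < n_t; ++t) {
        // softplus, with the same large-input shortcut as the kernel
        const float dt_sp = dt[t] <= 20.0f ? log1pf(expf(dt[t])) : dt[t];
        const float x_dt  = x[t] * dt_sp;
        float sumf = 0.0f;
        for (int i = 0; i < nc; ++i) {
            // state = prev_state * dA + dB * x
            state[i] = state[i] * expf(dt_sp * A[i]) + B[t*nc + i] * x_dt;
            sumf    += state[i] * C[t*nc + i]; // y = rowwise dot(state, C)
        }
        y[t] = sumf;
    }
}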

View File

@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@ -130,42 +130,3 @@
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture
// XXX: Clang builtins mapping
#define __vsub4 __vsub4_musa
#define __vcmpeq4 __vcmpeq4_musa
#define __vcmpne4 __vcmpne4_musa
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
return __vsubss4(a, b);
}
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
}
return c;
}
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
}
return c;
}

View File

@ -17,8 +17,8 @@
#define GGML_METAL_LOG_WARN(...)
#define GGML_METAL_LOG_ERROR(...)
#else
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#endif
@ -3039,8 +3039,7 @@ static enum ggml_status ggml_metal_graph_compute(
if (status != MTLCommandBufferStatusCompleted) {
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
if (status == MTLCommandBufferStatusError) {
NSString * error_code = [command_buffer error].localizedDescription;
GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
}
return GGML_STATUS_FAILED;

View File

@ -5137,13 +5137,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
return true;
case GGML_OP_CONT:
return op->src[0]->type != GGML_TYPE_BF16;
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_IM2COL:
// TODO: add support for the new F32 operations
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_2D:
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:

View File

@ -1127,6 +1127,10 @@ extern "C" {
int32_t n_logit_bias,
const llama_logit_bias * logit_bias);
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
/// @details Sample and accept a token from the idx-th output of the last evaluation
//
// Shorthand for:

lora-tests Submodule

@ -0,0 +1 @@
Subproject commit c26d5fb85b4070a9e9c4e65d132c783b98086890

View File

@ -8,6 +8,7 @@
#include <cstring>
#include <ctime>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <numeric>
#include <random>
@ -162,6 +163,19 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
cur_p->size = k;
}
static uint32_t get_rng_seed(uint32_t seed) {
if (seed == LLAMA_DEFAULT_SEED) {
// use system clock if std::random_device is not a true RNG
static bool is_rd_prng = std::random_device().entropy() == 0;
if (is_rd_prng) {
return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count();
}
std::random_device rd;
return rd();
}
return seed;
}
// llama_sampler API
const char * llama_sampler_name(const struct llama_sampler * smpl) {
@ -387,6 +401,7 @@ struct llama_sampler * llama_sampler_init_greedy() {
struct llama_sampler_dist {
const uint32_t seed;
uint32_t seed_cur;
std::mt19937 rng;
};
@ -416,7 +431,8 @@ static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sample
static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_dist *) smpl->ctx;
ctx->rng = std::mt19937(ctx->seed);
ctx->seed_cur = get_rng_seed(ctx->seed);
ctx->rng.seed(ctx->seed_cur);
}
static void llama_sampler_dist_free(struct llama_sampler * smpl) {
@ -433,11 +449,13 @@ static struct llama_sampler_i llama_sampler_dist_i = {
};
struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
/* .iface = */ &llama_sampler_dist_i,
/* .ctx = */ new llama_sampler_dist {
/* .seed = */ seed,
/* .rng = */ std::mt19937(seed),
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
@ -1032,6 +1050,7 @@ struct llama_sampler_mirostat {
const int32_t n_vocab;
const uint32_t seed;
uint32_t seed_cur;
const float tau;
const float eta;
@ -1100,7 +1119,8 @@ static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sa
static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
ctx->mu = 2.0f*ctx->tau;
ctx->rng = std::mt19937(ctx->seed);
ctx->seed_cur = get_rng_seed(ctx->seed);
ctx->rng.seed(ctx->seed_cur);
}
static void llama_sampler_mirostat_free(struct llama_sampler * smpl) {
@ -1117,16 +1137,18 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
};
struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
/* .iface = */ &llama_sampler_mirostat_i,
/* .ctx = */ new llama_sampler_mirostat {
/* .n_vocab = */ n_vocab,
/* .seed = */ seed,
/* .tau = */ tau,
/* .eta = */ eta,
/* .m = */ m,
/* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed),
/* .n_vocab = */ n_vocab,
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .tau = */ tau,
/* .eta = */ eta,
/* .m = */ m,
/* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
@ -1135,6 +1157,7 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
struct llama_sampler_mirostat_v2 {
const uint32_t seed;
uint32_t seed_cur;
const float tau;
const float eta;
@ -1179,7 +1202,8 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
ctx->mu = 2.0f*ctx->tau;
ctx->rng = std::mt19937(ctx->seed);
ctx->seed_cur = get_rng_seed(ctx->seed);
ctx->rng.seed(ctx->seed_cur);
}
static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) {
@ -1212,14 +1236,16 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
};
struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
/* .iface = */ &llama_sampler_mirostat_v2_i,
/* .ctx = */ new llama_sampler_mirostat_v2 {
/* .seed = */ seed,
/* .tau = */ tau,
/* .eta = */ eta,
/* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed),
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .tau = */ tau,
/* .eta = */ eta,
/* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
@ -1505,6 +1531,8 @@ struct llama_sampler * llama_sampler_init_penalties(
ignore_eos = false;
}
penalty_last_n = std::max(penalty_last_n, 0);
return new llama_sampler {
/* .iface = */ &llama_sampler_penalties_i,
/* .ctx = */ new llama_sampler_penalties {
@ -1568,6 +1596,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
}
}
}
static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx;
return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data());
@ -1599,3 +1628,31 @@ struct llama_sampler * llama_sampler_init_logit_bias(
},
};
}
// utils
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
if (smpl->iface == &llama_sampler_dist_i) {
return ((const llama_sampler_dist *) smpl->ctx)->seed_cur;
}
if (smpl->iface == &llama_sampler_mirostat_i) {
return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur;
}
if (smpl->iface == &llama_sampler_mirostat_v2_i) {
return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur;
}
if (smpl->iface == &llama_sampler_chain_i) {
const auto * ctx = (const llama_sampler_chain *) smpl->ctx;
for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) {
const uint32_t seed = llama_sampler_get_seed(*it);
if (seed != LLAMA_DEFAULT_SEED) {
return seed;
}
}
}
return LLAMA_DEFAULT_SEED;
}
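Together with the llama.h declaration earlier in this diff, this lets callers recover the seed that was actually used when LLAMA_DEFAULT_SEED was requested. A hypothetical usage sketch (not part of the patch; assumes the llama_sampler_free cleanup API from the same sampling refactor):

#include "llama.h"
#include <cstdio>

int main() {
    // LLAMA_DEFAULT_SEED now resolves to a random seed inside the sampler
    struct llama_sampler * smpl = llama_sampler_init_dist(LLAMA_DEFAULT_SEED);
    // log the resolved seed so a run can be reproduced with an explicit seed
    printf("sampling seed: %u\n", llama_sampler_get_seed(smpl));
    llama_sampler_free(smpl);
    return 0;
}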

View File

@ -9258,7 +9258,7 @@ static struct ggml_tensor * llm_build_copy_mask_state(
// FIXME: zero-out NANs?
states = ggml_mul(ctx, states, state_mask);
// copy states which won't be changed further (between n_seqs and n_rs)
// copy states which won't be changed further (between n_seqs and n_kv)
ggml_build_forward_expand(graph,
ggml_cpy(ctx,
ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)),
@ -9365,9 +9365,9 @@ static struct ggml_tensor * llm_build_mamba(
// Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
if (ssm_dt_b_c_rms) {
dt = ggml_rms_norm(ctx, dt, norm_rms_eps);
B = ggml_rms_norm(ctx, B, norm_rms_eps);
C = ggml_rms_norm(ctx, C, norm_rms_eps);
dt = ggml_rms_norm(ctx, ggml_cont(ctx, dt), norm_rms_eps);
B = ggml_rms_norm(ctx, ggml_cont(ctx, B), norm_rms_eps);
C = ggml_rms_norm(ctx, ggml_cont(ctx, C), norm_rms_eps);
}
// {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
@ -17530,6 +17530,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
quantize &= name.find("time_mix_first.weight") == std::string::npos;
quantize &= name.find("time_mix_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;