diff --git a/README.md b/README.md
index c945e125c..73041b1a2 100644
--- a/README.md
+++ b/README.md
@@ -89,6 +89,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
+- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
 
 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
diff --git a/common/arg.cpp b/common/arg.cpp
index 86f65f294..a1cd5830f 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -173,7 +173,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
     std::string arg;
     const std::string arg_prefix = "--";
     gpt_params & params = ctx_arg.params;
-    gpt_sampler_params & sparams = params.sparams;
 
     std::unordered_map<std::string, llama_arg *> arg_to_options;
     for (auto & opt : ctx_arg.options) {
@@ -283,10 +282,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
         params.kv_overrides.back().key[0] = 0;
     }
 
-    if (sparams.seed == LLAMA_DEFAULT_SEED) {
-        sparams.seed = time(NULL);
-    }
-
     return true;
 }
 
@@ -831,7 +826,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.special = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(llama_arg(
         {"-cnv", "--conversation"},
         format(
@@ -917,7 +912,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
     ).set_sparam());
     add_opt(llama_arg(
         {"-s", "--seed"}, "SEED",
-        format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed),
+        format("RNG seed (default: %u, use random seed for %u)", params.sparams.seed, LLAMA_DEFAULT_SEED),
         [](gpt_params & params, const std::string & value) {
             params.sparams.seed = std::stoul(value);
         }
@@ -1430,20 +1425,18 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 params.split_mode = LLAMA_SPLIT_MODE_NONE;
             } else if (arg_next == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
-            }
-            else if (arg_next == "row") {
+            } else if (arg_next == "row") {
 #ifdef GGML_USE_SYCL
                 fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
                 exit(1);
 #endif // GGML_USE_SYCL
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
-            }
-            else {
+            } else {
                 throw std::invalid_argument("invalid value");
             }
-#ifndef GGML_USE_CUDA_SYCL_VULKAN
-            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL_VULKAN
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
+            }
         }
     ));
     add_opt(llama_arg(
@@ -1463,14 +1456,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             }
             for (size_t i = 0; i < llama_max_devices(); ++i) {
                 if (i < split_arg.size()) {
-                    params.tensor_split[i] = std::stof(split_arg[i]);
+                    params.tensor_split[i]  = std::stof(split_arg[i]);
                 } else {
-                    params.tensor_split[i] = 0.0f;
+                    params.tensor_split[i]  = 0.0f;
                 }
             }
-#ifndef GGML_USE_CUDA_SYCL_VULKAN
-            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL_VULKAN
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
+            }
         }
     ));
     add_opt(llama_arg(
@@ -1478,9 +1471,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
         [](gpt_params & params, int value) {
             params.main_gpu = value;
-#ifndef GGML_USE_CUDA_SYCL_VULKAN
-            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL_VULKAN
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
+            }
         }
     ));
     add_opt(llama_arg(
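The arg.cpp hunks above replace the compile-time `#ifndef GGML_USE_CUDA_SYCL_VULKAN` guards with a runtime query, so the warnings stay correct regardless of which backends the binary was built with. A minimal sketch of the pattern, assuming only the public `llama_supports_gpu_offload()` API from llama.h:

```cpp
// Sketch (not part of the patch): gate GPU-related CLI options on the runtime
// capability check that this patch switches to, instead of a compile-time macro.
#include "llama.h"

#include <cstdio>

int main() {
    // llama_supports_gpu_offload() reflects what the linked ggml build can do,
    // so one code path produces correct warnings for every backend configuration.
    if (!llama_supports_gpu_offload()) {
        fprintf(stderr, "warning: no GPU offload support; --split-mode, --tensor-split and --main-gpu are no-ops\n");
    }
    return 0;
}
```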
diff --git a/common/common.cpp b/common/common.cpp
index 11fc1e223..f3304b357 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -56,14 +56,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
-#define GGML_USE_CUDA_SYCL
-#endif
-
-#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
-#define GGML_USE_CUDA_SYCL_VULKAN
-#endif
-
 #if defined(LLAMA_USE_CURL)
 #ifdef __linux__
 #include <linux/limits.h>
@@ -950,11 +942,37 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 
 #ifdef LLAMA_USE_CURL
 
+#define CURL_MAX_RETRY 3
+#define CURL_RETRY_DELAY_SECONDS 2
+
+
 static bool starts_with(const std::string & str, const std::string & prefix) {
     // While we wait for C++20's std::string::starts_with...
     return str.rfind(prefix, 0) == 0;
 }
 
+static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+    int remaining_attempts = max_attempts;
+
+    while (remaining_attempts > 0) {
+        fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+
+        CURLcode res = curl_easy_perform(curl);
+        if (res == CURLE_OK) {
+            return true;
+        }
+
+        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
+        fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
+
+        remaining_attempts--;
+        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+    }
+
+    fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+    return false;
+}
+
 static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
 
     // Initialize libcurl
@@ -1058,9 +1076,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
         curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
         curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
 
-        CURLcode res = curl_easy_perform(curl.get());
-        if (res != CURLE_OK) {
-            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+        if (!was_perform_successful) {
             return false;
         }
 
@@ -1135,11 +1152,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     };
 
     // start the download
-    fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-        llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-    auto res = curl_easy_perform(curl.get());
-    if (res != CURLE_OK) {
-        fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+    fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+        llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+    if (!was_perform_successful) {
        return false;
    }
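The retry helper above backs off exponentially: with the defaults `CURL_RETRY_DELAY_SECONDS = 2` and `CURL_MAX_RETRY = 3`, failed attempts wait 1000, 2000, then 4000 milliseconds, because the delay is `pow(retry_delay_seconds, attempts_so_far) * 1000`. A standalone sketch that reuses the same formula to print the schedule:

```cpp
// Prints the backoff schedule produced by the formula in curl_perform_with_retry,
// assuming the default constants from the patch above.
#include <cmath>
#include <cstdio>

int main() {
    const int max_attempts        = 3; // CURL_MAX_RETRY
    const int retry_delay_seconds = 2; // CURL_RETRY_DELAY_SECONDS

    for (int remaining = max_attempts; remaining > 0; --remaining) {
        // pow(2, 0) -> 1000 ms, pow(2, 1) -> 2000 ms, pow(2, 2) -> 4000 ms
        const int delay_ms = (int) std::pow(retry_delay_seconds, max_attempts - remaining) * 1000;
        printf("attempt %d fails -> wait %d ms before the next try\n",
               max_attempts - remaining + 1, delay_ms);
    }
    return 0;
}
```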
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 8e429cac5..c07b5e940 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -310,6 +310,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
     return cur_p.data[cur_p.selected].id;
 }
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+    return llama_sampler_get_seed(gsmpl->chain);
+}
+
 // helpers
 
 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
diff --git a/common/sampling.h b/common/sampling.h
index 0a4461fab..d0e1a9203 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -60,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+
 // helpers
 
 // access the internal list of current candidate tokens
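`gpt_sampler_get_seed()` is a thin wrapper that forwards to the new `llama_sampler_get_seed()` on the underlying sampler chain. A hedged usage sketch; the `gpt_sampler_*` functions are the common-library API touched by this patch, and `log_effective_seed` is a hypothetical helper:

```cpp
#include "sampling.h"

#include <cstdio>

// Log the seed that was actually used, once the sampler exists.
// sparams.seed may still be LLAMA_DEFAULT_SEED at this point.
static void log_effective_seed(const llama_model * model, const gpt_sampler_params & sparams) {
    gpt_sampler * smpl = gpt_sampler_init(model, sparams);

    // Reports the concrete seed the RNG was seeded with, not the requested one.
    fprintf(stderr, "sampling seed: %u\n", gpt_sampler_get_seed(smpl));

    gpt_sampler_free(smpl);
}
```

This is also why the infill example below moves `gpt_sampler_init()` ahead of the log statements: the effective seed only exists after the sampler has been constructed.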
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index ff4955f9c..59a0b81a1 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -31,6 +31,7 @@ import re
 import requests
 import sys
 import json
+import shutil
 
 from hashlib import sha256
 from enum import IntEnum, auto
@@ -125,12 +126,27 @@ def download_model(model):
     if tokt == TOKENIZER_TYPE.UGM:
         files.append("spiece.model")
 
-    for file in files:
-        save_path = f"models/tokenizers/{name}/{file}"
-        if os.path.isfile(save_path):
-            logger.info(f"{name}: File {save_path} already exists - skipping")
-            continue
-        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+    if os.path.isdir(repo):
+        # If repo is a path on the file system, copy the directory
+        for file in files:
+            src_path = os.path.join(repo, file)
+            dst_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(dst_path):
+                logger.info(f"{name}: File {dst_path} already exists - skipping")
+                continue
+            if os.path.isfile(src_path):
+                shutil.copy2(src_path, dst_path)
+                logger.info(f"{name}: Copied {src_path} to {dst_path}")
+            else:
+                logger.warning(f"{name}: Source file {src_path} does not exist")
+    else:
+        # If repo is a URL, download the files
+        for file in files:
+            save_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(save_path):
+                logger.info(f"{name}: File {save_path} already exists - skipping")
+                continue
+            download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
 
 
 for model in models:
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index c2e854ea4..ec00fcf78 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -3,32 +3,10 @@
 #include "llama.h"
 
 #include <algorithm>
-#include <cstdlib>
 #include <string>
 #include <vector>
 
-// mutates the input string
-static std::vector<int> parse_list(char * p) {
-    std::vector<int> ret;
-
-    char * q = p;
-
-    while (*p) {
-        if (*p == ',') {
-            *p = '\0';
-            ret.push_back(std::atoi(q));
-            q = p + 1;
-        }
-
-        ++p;
-    }
-
-    ret.push_back(std::atoi(q));
-
-    return ret;
-}
-
 static void print_usage(int, char ** argv) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 5661cf0b7..e94ae2955 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -90,8 +90,6 @@ int main(int argc, char ** argv) {
 
     print_build_info();
 
-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
     llama_backend_init();
     llama_numa_init(params.numa);
 
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 9a527e244..7e252ce09 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -159,8 +159,6 @@ int main(int argc, char ** argv) {
 
     print_build_info();
 
-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
     LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -301,6 +299,9 @@ int main(int argc, char ** argv) {
             LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
         }
     }
+    smpl = gpt_sampler_init(model, sparams);
+
+    LOG_TEE("sampling seed: %u\n",     gpt_sampler_get_seed(smpl));
     LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     LOG_TEE("\n\n");
@@ -340,8 +341,6 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd;
 
-    smpl = gpt_sampler_init(model, sparams);
-
     while (n_remain != 0 || params.interactive) {
         // predict
         if (!embd.empty()) {
LOG_TEE("\n\n"); @@ -340,8 +341,6 @@ int main(int argc, char ** argv) { std::vector embd; - smpl = gpt_sampler_init(model, sparams); - while (n_remain != 0 || params.interactive) { // predict if (!embd.empty()) { diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index f36caa42e..3ac455e69 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -18,8 +18,8 @@ struct llava_context { }; static void show_additional_info(int /*argc*/, char ** argv) { - LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); + LOG_TEE("\nexample usage:\n\n%s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); } static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { @@ -255,7 +255,7 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { return 1; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b986a865a..f41be5308 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -191,8 +191,6 @@ int main(int argc, char ** argv) { print_build_info(); - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - LOG("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); @@ -470,8 +468,10 @@ int main(int argc, char ** argv) { exit(1); } + LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl)); LOG_TEE("sampling params: \n%s\n", sparams.print().c_str()); - LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str()); + LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str()); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); // group-attention state diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 1bdb6521c..29ff86bbc 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2007,8 +2007,6 @@ int main(int argc, char ** argv) { print_build_info(); - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7495821f9..5b263f646 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1266,6 +1266,7 @@ struct server_context { {"n_predict", slot.n_predict}, // Server configured n_predict {"model", params.model_alias}, {"seed", slot.sparams.seed}, + {"seed_cur", slot.smpl ? 
diff --git a/flake.lock b/flake.lock
index 10e1f8a29..e9382ff3d 100644
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
         "nixpkgs-lib": "nixpkgs-lib"
       },
       "locked": {
-        "lastModified": 1725024810,
-        "narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
+        "lastModified": 1725234343,
+        "narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=",
         "owner": "hercules-ci",
         "repo": "flake-parts",
-        "rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
+        "rev": "567b938d64d4b4112ee253b9274472dc3a346eb6",
         "type": "github"
       },
       "original": {
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1724819573,
-        "narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
+        "lastModified": 1725634671,
+        "narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
+        "rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c",
         "type": "github"
       },
       "original": {
@@ -36,14 +36,14 @@
     },
     "nixpkgs-lib": {
       "locked": {
-        "lastModified": 1722555339,
-        "narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
+        "lastModified": 1725233747,
+        "narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
         "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
       },
       "original": {
         "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
       }
     },
     "root": {
diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
index 06930ba2e..24b8b752c 100644
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -1942,7 +1942,7 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
         GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return nullptr;
     }
-
+    ggml_cann_set_device(ctx->device);
     ggml_backend_t cann_backend =
         new ggml_backend{/* .guid      = */ ggml_backend_cann_guid(),
                          /* .interface = */ ggml_backend_cann_interface,
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 78d70cd7a..4935f8818 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q(
     // nrows_dst == nrows of the matrix that the kernel writes into
     const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
 
-    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
+    // The stream-k decomposition is only faster for recent NVIDIA GPUs.
+    // Also its fixup needs to allocate a temporary buffer in the memory pool.
+    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
+    const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
+    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
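The new `use_stream_k` flag is now computed once at the call site instead of inside the kernel launcher. An illustrative predicate restating the gating logic; this is not upstream code, and `cc_volta`/`cc_offset_amd` stand in for the ggml-internal `CC_VOLTA`/`CC_OFFSET_AMD` constants:

```cpp
#include <cstdint>

// Stream-k is enabled only on NVIDIA GPUs of Volta or newer (AMD devices are
// offset above cc_offset_amd), and only when the whole src1 matrix is handled
// in one pass, so the fixup buffer is never shared across CUDA streams.
static bool should_use_stream_k(int compute_capability, int cc_volta, int cc_offset_amd,
                                int64_t src1_ncols, int64_t ne11) {
    const bool recent_nvidia = compute_capability >= cc_volta && compute_capability < cc_offset_amd;
    // src1_ncols == ne11 means the batch is not split across parallel CUDA
    // streams, so the temporary fixup buffer cannot race.
    const bool single_pass = src1_ncols == ne11;
    return recent_nvidia && single_pass;
}
```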
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index e8a957447..021a25682 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -2742,6 +2742,7 @@ struct mmq_args {
     int64_t ne00; int64_t ne01; int64_t stride01;
     int64_t ne10; int64_t ne11; int64_t stride11;
     int64_t ne0;
+    bool use_stream_k;
 };
 
 template <ggml_type type, int mmq_x, int nwarps>
@@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
     const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
     const dim3 block_nums_xy_tiling(nty, ntx, 1);
 
-    const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
-    if (!use_stream_k) {
+    if (!args.use_stream_k) {
         if (args.ne01 % mmq_y == 0) {
             constexpr bool need_check = false;
             mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h
index e50a103ac..8df571149 100644
--- a/ggml/src/ggml-cuda/vendors/musa.h
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@@ -130,42 +130,3 @@
 #define cudaKernelNodeParams musaKernelNodeParams
 #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
 #define cudaStreamEndCapture musaStreamEndCapture
-
-// XXX: Clang builtins mapping
-#define __vsub4   __vsub4_musa
-#define __vcmpeq4 __vcmpeq4_musa
-#define __vcmpne4 __vcmpne4_musa
-
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
-
-static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
-    return __vsubss4(a, b);
-}
-
-static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
-    }
-    return c;
-}
-
-static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
-    }
-    return c;
-}
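The removed `__vcmpeq4_musa`/`__vcmpne4_musa` shims emulated CUDA's per-byte comparison builtins for the MUSA toolchain. For reference, a portable host-side sketch of the semantics (a restatement for clarity, not the removed device code):

```cpp
#include <cstdint>

// Per-byte comparison of two 32-bit words: each result byte is 0xff where the
// corresponding bytes of a and b are equal, and 0x00 where they differ.
uint32_t vcmpeq4(uint32_t a, uint32_t b) {
    uint32_t c = 0;
    for (int i = 0; i < 4; ++i) {
        const uint8_t ba = (a >> (8 * i)) & 0xff;
        const uint8_t bb = (b >> (8 * i)) & 0xff;
        c |= (ba == bb ? 0xffu : 0x00u) << (8 * i);
    }
    return c; // __vcmpne4 is the same with the mask inverted
}
```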
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 4f03b01e7..e60350399 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -5137,13 +5137,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_CLAMP:
+            return true;
         case GGML_OP_CONT:
+            return op->src[0]->type != GGML_TYPE_BF16;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
             return true;
         case GGML_OP_ROPE:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
+            // TODO: add support for the new F32 operations
+            return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
diff --git a/include/llama.h b/include/llama.h
index d0e0b3a61..c3408aaf1 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1127,6 +1127,10 @@ extern "C" {
                              int32_t   n_logit_bias,
                       const llama_logit_bias * logit_bias);
 
+
+    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
+    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
+
     /// @details Sample and accept a token from the idx-th output of the last evaluation
     //
     // Shorthand for:
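A hedged sketch of the new public API in use: build a sampler chain ending in a dist sampler seeded with `LLAMA_DEFAULT_SEED`, then read back the seed that was actually drawn. All calls are existing llama.h sampler functions:

```cpp
#include "llama.h"

#include <cstdio>

int main() {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    // LLAMA_DEFAULT_SEED requests a random seed; llama_sampler_get_seed()
    // walks the chain and returns the seed that was actually drawn.
    printf("effective seed: %u\n", llama_sampler_get_seed(chain));

    llama_sampler_free(chain);
    return 0;
}
```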
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index d17e4427e..87d975f60 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -8,6 +8,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cfloat>
+#include <chrono>
 #include <cmath>
 #include <cstring>
 #include <ctime>
@@ -162,6 +163,19 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     cur_p->size = k;
 }
 
+static uint32_t get_rng_seed(uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        // use system clock if std::random_device is not a true RNG
+        static bool is_rd_prng = std::random_device().entropy() == 0;
+        if (is_rd_prng) {
+            return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count();
+        }
+        std::random_device rd;
+        return rd();
+    }
+    return seed;
+}
+
 // llama_sampler API
 
 const char * llama_sampler_name(const struct llama_sampler * smpl) {
@@ -387,6 +401,7 @@ struct llama_sampler * llama_sampler_init_greedy() {
 
 struct llama_sampler_dist {
     const uint32_t seed;
+          uint32_t seed_cur;
 
     std::mt19937 rng;
 };
@@ -416,7 +431,8 @@ static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sample
 
 static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_dist *) smpl->ctx;
-    ctx->rng = std::mt19937(ctx->seed);
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->rng.seed(ctx->seed_cur);
 }
 
 static void llama_sampler_dist_free(struct llama_sampler * smpl) {
@@ -433,11 +449,13 @@ static struct llama_sampler_i llama_sampler_dist_i = {
 };
 
 struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
+    auto seed_cur = get_rng_seed(seed);
     return new llama_sampler {
         /* .iface = */ &llama_sampler_dist_i,
         /* .ctx   = */ new llama_sampler_dist {
-            /* .seed = */ seed,
-            /* .rng  = */ std::mt19937(seed),
+            /* .seed     = */ seed,
+            /* .seed_cur = */ seed_cur,
+            /* .rng      = */ std::mt19937(seed_cur),
         },
     };
 }
@@ -1032,6 +1050,7 @@ struct llama_sampler_mirostat {
     const int32_t n_vocab;
 
     const uint32_t seed;
+          uint32_t seed_cur;
 
     const float tau;
     const float eta;
@@ -1100,7 +1119,8 @@ static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sa
 static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
     ctx->mu = 2.0f*ctx->tau;
-    ctx->rng = std::mt19937(ctx->seed);
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->rng.seed(ctx->seed_cur);
 }
 
 static void llama_sampler_mirostat_free(struct llama_sampler * smpl) {
@@ -1117,16 +1137,18 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
 };
 
 struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
+    auto seed_cur = get_rng_seed(seed);
     return new llama_sampler {
         /* .iface = */ &llama_sampler_mirostat_i,
         /* .ctx   = */ new llama_sampler_mirostat {
-            /* .n_vocab = */ n_vocab,
-            /* .seed    = */ seed,
-            /* .tau     = */ tau,
-            /* .eta     = */ eta,
-            /* .m       = */ m,
-            /* .mu      = */ 2.0f*tau,
-            /* .rng     = */ std::mt19937(seed),
+            /* .n_vocab  = */ n_vocab,
+            /* .seed     = */ seed,
+            /* .seed_cur = */ seed_cur,
+            /* .tau      = */ tau,
+            /* .eta      = */ eta,
+            /* .m        = */ m,
+            /* .mu       = */ 2.0f*tau,
+            /* .rng      = */ std::mt19937(seed_cur),
         },
     };
 }
@@ -1135,6 +1157,7 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
 
 struct llama_sampler_mirostat_v2 {
     const uint32_t seed;
+          uint32_t seed_cur;
 
     const float tau;
     const float eta;
@@ -1179,7 +1202,8 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
 static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
     ctx->mu = 2.0f*ctx->tau;
-    ctx->rng = std::mt19937(ctx->seed);
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->rng.seed(ctx->seed_cur);
 }
 
 static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) {
@@ -1212,14 +1236,16 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
 };
 
 struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
+    auto seed_cur = get_rng_seed(seed);
     return new llama_sampler {
         /* .iface = */ &llama_sampler_mirostat_v2_i,
         /* .ctx   = */ new llama_sampler_mirostat_v2 {
-            /* .seed = */ seed,
-            /* .tau  = */ tau,
-            /* .eta  = */ eta,
-            /* .mu   = */ 2.0f*tau,
-            /* .rng  = */ std::mt19937(seed),
+            /* .seed     = */ seed,
+            /* .seed_cur = */ seed_cur,
+            /* .tau      = */ tau,
+            /* .eta      = */ eta,
+            /* .mu       = */ 2.0f*tau,
+            /* .rng      = */ std::mt19937(seed_cur),
         },
     };
 }
@@ -1505,6 +1531,8 @@ struct llama_sampler * llama_sampler_init_penalties(
         ignore_eos = false;
     }
 
+    penalty_last_n = std::max(penalty_last_n, 0);
+
     return new llama_sampler {
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx   = */ new llama_sampler_penalties {
@@ -1568,6 +1596,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
             }
         }
     }
+
 static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx;
     return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data());
@@ -1600,6 +1629,34 @@ struct llama_sampler * llama_sampler_init_logit_bias(
     };
 }
 
+// utils
+
+uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
+    if (smpl->iface == &llama_sampler_dist_i) {
+        return ((const llama_sampler_dist *) smpl->ctx)->seed_cur;
+    }
+
+    if (smpl->iface == &llama_sampler_mirostat_i) {
+        return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur;
+    }
+
+    if (smpl->iface == &llama_sampler_mirostat_v2_i) {
+        return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur;
+    }
+
+    if (smpl->iface == &llama_sampler_chain_i) {
+        const auto * ctx = (const llama_sampler_chain *) smpl->ctx;
+        for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) {
+            const uint32_t seed = llama_sampler_get_seed(*it);
+            if (seed != LLAMA_DEFAULT_SEED) {
+                return seed;
+            }
+        }
+    }
+
+    return LLAMA_DEFAULT_SEED;
+}
+
 // perf
 
 struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
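`get_rng_seed()` falls back to the system clock when `std::random_device` advertises zero entropy, which the standard allows when the device is a deterministic PRNG on that platform. The same policy as a standalone sketch:

```cpp
#include <chrono>
#include <cstdint>
#include <random>

// Draw a seed: prefer std::random_device, but use the system clock when the
// implementation reports entropy() == 0 (i.e. random_device is a PRNG).
uint32_t draw_seed() {
    static bool is_rd_prng = std::random_device().entropy() == 0;
    if (is_rd_prng) {
        return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count();
    }
    std::random_device rd;
    return rd();
}
```

Note also the chain case of `llama_sampler_get_seed()` above: it walks the chain from the back, so the seed reported is that of the last seeded sampler, which is the one that actually picks the token.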
diff --git a/src/llama.cpp b/src/llama.cpp
index d3ba80eba..4801e4f2c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16078,19 +16078,21 @@ static int llama_decode_internal(
         return -1;
     }
 
-    for (uint32_t i = 0; i < n_tokens_all; ++i) {
-        if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
-            return -1;
-        }
-    }
-
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
     GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
 
+    if (batch_all.token) {
+        for (uint32_t i = 0; i < n_tokens_all; ++i) {
+            if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+                return -1;
+            }
+        }
+    }
+
     GGML_ASSERT(n_tokens_all <= cparams.n_batch);
 
     GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@@ -16377,19 +16379,21 @@ static int llama_encode_internal(
         return -1;
     }
 
-    for (uint32_t i = 0; i < n_tokens; ++i) {
-        if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
-            return -1;
-        }
-    }
-
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
     GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
+    if (batch.token) {
+        for (uint32_t i = 0; i < n_tokens; ++i) {
+            if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+                return -1;
+            }
+        }
+    }
+
     // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
     GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
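Both hunks move the token-ID validation behind a `batch.token` null check, because embedding-only batches carry embeddings instead of token IDs (`batch.token == nullptr`), and the old unconditional loop would have dereferenced a null pointer. The guarded check as a self-contained sketch; `batch_tokens_valid` is a hypothetical helper, not part of the patch:

```cpp
#include <cstdint>

// Validate token IDs only when the batch actually carries tokens; an
// embeddings-only batch passes a null token pointer and skips the check.
static bool batch_tokens_valid(const int32_t * tokens, uint32_t n_tokens, uint32_t n_vocab) {
    if (tokens == nullptr) {
        return true; // embeddings-only batch: nothing to validate
    }
    for (uint32_t i = 0; i < n_tokens; ++i) {
        if (tokens[i] < 0 || (uint32_t) tokens[i] >= n_vocab) {
            return false; // out-of-range ID would index past the vocab tables
        }
    }
    return true;
}
```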