diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index eb69b82c4..56d22bc0c 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -129,6 +129,8 @@ jobs: - name: Server bench id: server_bench + env: + HEAD_REF: ${{ github.head_ref || github.ref_name }} run: | set -eux @@ -137,7 +139,7 @@ jobs: python bench.py \ --runner-label ${{ env.RUNNER_LABEL }} \ --name ${{ github.job }} \ - --branch ${{ github.head_ref || github.ref_name }} \ + --branch $HEAD_REF \ --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \ --scenario script.js \ --duration ${{ github.event.inputs.duration || env.DURATION }} \ diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b9246659a..74b5d4f69 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -47,7 +47,7 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF .. + cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -105,7 +105,7 @@ jobs: sysctl -a # Metal is disabled due to intermittent failures with Github runners not having a GPU: # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF + cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -222,7 +222,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF cmake --build . --config Release -j $(nproc) - name: Test @@ -696,22 +696,20 @@ jobs: strategy: matrix: include: - - build: 'rpc-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' - build: 'noavx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx2-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' - build: 'avx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx512-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' - build: 'openblas-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - build: 'kompute-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' - build: 'vulkan-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' - build: 'llvm-arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'msvc-arm64' diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index 4e0374fc6..46e80aecd 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -6,15 +6,13 @@ on: - '.github/workflows/python-check-requirements.yml' - 'scripts/check-requirements.sh' - 'convert*.py' - - 'requirements.txt' - - 'requirements/*.txt' + - '**/requirements*.txt' pull_request: paths: - '.github/workflows/python-check-requirements.yml' - 'scripts/check-requirements.sh' - 'convert*.py' - - 'requirements.txt' - - 'requirements/*.txt' + - '**/requirements*.txt' concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} diff --git a/README.md b/README.md index 1283f6805..7f48fde6e 100644 --- a/README.md +++ b/README.md @@ -186,10 +186,12 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption +- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage **Infrastructure:** - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp +- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs **Games:** - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index a518b766d..438452eab 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -369,6 +369,9 @@ namespace grammar_parser { } // Validate the state to ensure that all rules are defined for (const auto & rule : state.rules) { + if (rule.empty()) { + throw std::runtime_error("Undefined rule"); + } for (const auto & elem : rule) { if (elem.type == LLAMA_GRETYPE_RULE_REF) { // Ensure that the rule at that location exists diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md index 91c33c34a..7dce99c9a 100644 --- a/examples/export-lora/README.md +++ b/examples/export-lora/README.md @@ -17,9 +17,9 @@ For example: ```bash ./bin/llama-export-lora \ - -m open-llama-3b-v2-q8_0.gguf \ - -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ - --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf + -m open-llama-3b-v2.gguf \ + -o open-llama-3b-v2-english2tokipona-chat.gguf \ + --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf ``` Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters: diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 3176d6e26..c7e5ca788 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -10,6 +10,12 @@ static bool g_verbose = false; +struct tensor_transformation { + struct ggml_tensor * in; + struct ggml_tensor * out; + bool is_copy; +}; + static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){ int id = gguf_find_key(ctx_gguf, key.c_str()); return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); @@ -198,8 +204,7 @@ struct lora_merge_ctx { } // mapping base tensor to out tensor (same shape with base, but different type) - // if out_tensor == nullptr, we only copy it - std::vector> base_to_out_tensors; + std::vector trans; for (auto & it : base_model.tensors) { bool t_a = true; bool t_b = true; @@ -212,14 +217,22 @@ struct lora_merge_ctx { // only copy struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); ggml_set_name(cpy_tensor, base_tensor->name); - base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr)); + trans.push_back({ + cpy_tensor, + cpy_tensor, + true, + }); gguf_add_tensor(ctx_out, cpy_tensor); } else if (t_a && t_b) { // need merging struct ggml_tensor * out_tensor = ggml_new_tensor( ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne); ggml_set_name(out_tensor, base_tensor->name); - base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor)); + trans.push_back({ + base_tensor, + out_tensor, + false, + }); gguf_add_tensor(ctx_out, out_tensor); } else { throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b"); @@ -234,12 +247,12 @@ struct lora_merge_ctx { // process base model tensors size_t n_merged = 0; - for (auto & it : base_to_out_tensors) { - if (it.second != nullptr) { - merge_tensor(it.first, it.second); + for (auto & it : trans) { + if (!it.is_copy) { + merge_tensor(it.in, it.out); n_merged++; } else { - copy_tensor(it.first); + copy_tensor(it.in); } } @@ -252,7 +265,7 @@ struct lora_merge_ctx { } printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged); - printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size()); + printf("%s : wrote %ld tensors to output file\n", __func__, trans.size()); } void copy_tensor(struct ggml_tensor * base) { @@ -285,6 +298,10 @@ struct lora_merge_ctx { for (size_t i = 0; i < adapters.size(); ++i) { auto t_a = adapters[i]->get_tensor(name_lora_a); auto t_b = adapters[i]->get_tensor(name_lora_b); + // TODO: add support for quantized lora + if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) { + throw std::runtime_error("quantized LoRA adapters is not supported, please retry with f16 or f32"); + } inp_a[i] = ggml_dup_tensor(ctx, t_a); inp_b[i] = ggml_dup_tensor(ctx, t_b); } diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt index dfe5fbe62..cbcbf26c9 100644 --- a/examples/llava/requirements.txt +++ b/examples/llava/requirements.txt @@ -2,4 +2,4 @@ --extra-index-url https://download.pytorch.org/whl/cpu pillow~=10.2.0 torch~=2.2.1 -torchvision==0.17.1 +torchvision~=0.17.1 diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 360f571e4..1621c7c43 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -631,6 +631,7 @@ struct server_context { bool clean_kv_cache = true; bool add_bos_token = true; + bool has_eos_token = false; int32_t n_ctx; // total context for all clients / slots @@ -693,7 +694,7 @@ struct server_context { n_ctx = llama_n_ctx(ctx); add_bos_token = llama_should_add_bos_token(model); - GGML_ASSERT(llama_add_eos_token(model) != 1); + has_eos_token = llama_add_eos_token(model) != 1; return true; } @@ -1031,7 +1032,7 @@ struct server_context { { slot.sparams.logit_bias.clear(); - if (json_value(data, "ignore_eos", false)) { + if (json_value(data, "ignore_eos", false) && has_eos_token) { slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index c7aae0267..64927332e 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -244,6 +244,8 @@ #define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_ABORTED 1 +#define GGML_ROPE_TYPE_NEOX 2 + #define GGUF_MAGIC "GGUF" #define GGUF_VERSION 3 @@ -1455,8 +1457,8 @@ extern "C" { struct ggml_tensor * b); // rotary position embedding - // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED) - // if mode & 2 == 1, GPT-NeoX style + // if (mode & 1) - skip n_past elements (NOT SUPPORTED) + // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style // // b is an int32 vector with size a->ne[2], it contains the positions GGML_API struct ggml_tensor * ggml_rope( diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 8c4132f5b..a4ec8418e 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2881,7 +2881,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; // init cos/sin cache ggml_cann_pool_alloc sin_allocator( diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 99ec1dd98..88f586d68 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -226,7 +226,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; const int32_t * pos = (const int32_t *) src1_d; diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index aad189430..995f1934b 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -2313,7 +2313,7 @@ static enum ggml_status ggml_metal_graph_compute( memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; id pipeline = nil; diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index c7545bcc1..1f06f78fa 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ b/ggml/src/ggml-sycl/rope.cpp @@ -226,7 +226,7 @@ void ggml_sycl_op_rope( memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; const int32_t * pos = (const int32_t *) src1_dd; diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 867328372..c0504e434 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -4053,7 +4053,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const case GGML_OP_ROPE: { const int mode = ((const int32_t *) dst->op_params)[2]; - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; if (is_neox) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index a9033ed11..932812486 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -14131,7 +14131,7 @@ static void ggml_compute_forward_rope_f32( float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; const float * freq_factors = NULL; if (src2 != NULL) { @@ -14256,7 +14256,7 @@ static void ggml_compute_forward_rope_f16( float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - const bool is_neox = mode & 2; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; const float * freq_factors = NULL; if (src2 != NULL) { @@ -21168,7 +21168,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p (int64_t) info->ne[2] * (int64_t) info->ne[3]; - if (ne % ggml_blck_size(info->type) != 0) { + if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) { fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n", __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type)); fclose(file); diff --git a/ggml/src/kompute-shaders/op_rope_f16.comp b/ggml/src/kompute-shaders/op_rope_f16.comp index 1a4058b3f..0ecfb2eab 100644 --- a/ggml/src/kompute-shaders/op_rope_f16.comp +++ b/ggml/src/kompute-shaders/op_rope_f16.comp @@ -11,7 +11,7 @@ void main() { const uint i2 = gl_WorkGroupID.y; const uint i1 = gl_WorkGroupID.x; - const bool is_neox = (pcs.mode & 2) != 0; + const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0; float corr_dims[2]; rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); diff --git a/ggml/src/kompute-shaders/op_rope_f32.comp b/ggml/src/kompute-shaders/op_rope_f32.comp index 65e03827a..cec0fd9a5 100644 --- a/ggml/src/kompute-shaders/op_rope_f32.comp +++ b/ggml/src/kompute-shaders/op_rope_f32.comp @@ -11,7 +11,7 @@ void main() { const uint i2 = gl_WorkGroupID.y; const uint i1 = gl_WorkGroupID.x; - const bool is_neox = (pcs.mode & 2) != 0; + const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0; float corr_dims[2]; rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); diff --git a/ggml/src/kompute-shaders/rope_common.comp b/ggml/src/kompute-shaders/rope_common.comp index 7b9394cb2..df4702896 100644 --- a/ggml/src/kompute-shaders/rope_common.comp +++ b/ggml/src/kompute-shaders/rope_common.comp @@ -1,5 +1,7 @@ #include "common.comp" +#define GGML_ROPE_TYPE_NEOX 2 + // TODO: use a local size of 32 or more (Metal uses 1024) layout(local_size_x = 1) in; diff --git a/include/llama.h b/include/llama.h index 6d0bfa12f..e00581f64 100644 --- a/include/llama.h +++ b/include/llama.h @@ -95,13 +95,10 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22, }; - // note: these values should be synchronized with ggml_rope - // TODO: maybe move this enum to ggml.h (ggml_rope_type) enum llama_rope_type { LLAMA_ROPE_TYPE_NONE = -1, - LLAMA_ROPE_TYPE_NORM = 0, - LLAMA_ROPE_TYPE_NEOX = 2, - LLAMA_ROPE_TYPE_GLM = 4, + LLAMA_ROPE_TYPE_NORM = 0, + LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX, }; enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 8910f6d65..8f4841d9d 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -85,14 +85,14 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra constexpr float bucket_low = -10.0f; constexpr float bucket_high = 10.0f; constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low); - constexpr float bucker_inter = -bucket_low * bucket_scale; + constexpr float bucket_inter = -bucket_low * bucket_scale; std::vector bucket_idx(candidates->size); std::vector histo(nbuckets, 0); for (int i = 0; i < (int)candidates->size; ++i) { const float val = candidates->data[i].logit; - int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); + int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); ib = std::max(0, std::min(nbuckets-1, ib)); bucket_idx[i] = ib; ++histo[ib]; diff --git a/src/llama.cpp b/src/llama.cpp index 5e54c3af4..e1a060aed 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3575,13 +3575,8 @@ namespace GGUFMeta { using llama_buf_map = std::unordered_map; -// TODO: update when needed or think of some clever automatic way to do this -static size_t llama_model_max_nodes(const llama_model & /*model*/) { - //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B - // return 32768; - //} - - return 8192; +static size_t llama_model_max_nodes(const llama_model & model) { + return std::max(8192, model.tensors_by_name.size()*5); } struct llama_model_loader {