diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c36eaadfb..181ef37e2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -375,7 +375,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -401,7 +401,7 @@ jobs: continue-on-error: true steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: add oneAPI to apt shell: bash @@ -442,7 +442,7 @@ jobs: continue-on-error: true steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: add oneAPI to apt shell: bash @@ -546,7 +546,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v1 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -576,7 +576,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v1 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -610,7 +610,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v1 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -969,14 +969,14 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install id: depends run: | $ErrorActionPreference = "Stop" write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait write-host "Completed AMD HIP SDK installation" diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 99feb28f2..29f8fd444 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -173,6 +173,7 @@ jobs: if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | cd examples/server/tests + $env:PYTHONIOENCODING = ":replace" behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp - name: Slow tests diff --git a/CMakeLists.txt b/CMakeLists.txt index a31320635..244019313 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,10 +139,16 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o # determining _precisely_ which defines are necessary for the llama-config # package. 
# +set(GGML_TRANSIENT_DEFINES) get_target_property(GGML_DIRECTORY ggml SOURCE_DIR) get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS) +if (GGML_DIR_DEFINES) + list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES}) +endif() get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS) -set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES}) +if (GGML_TARGET_DEFINES) + list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES}) +endif() get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h) diff --git a/Makefile b/Makefile index 97ef37c0e..8d3fd3ee8 100644 --- a/Makefile +++ b/Makefile @@ -434,7 +434,7 @@ endif # TODO: probably these flags need to be tweaked on some architectures # feel free to update the Makefile for your architecture and send a pull request or issue -ifndef RISCV +ifndef RISCV_CROSS_COMPILE ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) # Use all CPU extensions that are available: @@ -514,7 +514,12 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),) MK_CXXFLAGS += -mlasx endif -else +ifneq ($(filter riscv64%,$(UNAME_M)),) + MK_CFLAGS += -march=rv64gcv -mabi=lp64d + MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d +endif + +else # RISC-V CROSS COMPILATION MK_CFLAGS += -march=rv64gcv -mabi=lp64d MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d endif @@ -1454,7 +1459,6 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - ./llama-gen-docs libllava.a: examples/llava/llava.cpp \ examples/llava/llava.h \ diff --git a/README.md b/README.md index c945e125c..73041b1a2 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,7 @@ Typically finetunes of the base models below are supported as well. 
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a) +- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) diff --git a/common/arg.cpp b/common/arg.cpp index c5134be51..ce6a27614 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -173,7 +173,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx std::string arg; const std::string arg_prefix = "--"; gpt_params & params = ctx_arg.params; - gpt_sampler_params & sparams = params.sparams; std::unordered_map arg_to_options; for (auto & opt : ctx_arg.options) { @@ -283,10 +282,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx params.kv_overrides.back().key[0] = 0; } - if (sparams.seed == LLAMA_DEFAULT_SEED) { - sparams.seed = time(NULL); - } - return true; } @@ -823,7 +818,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params) { params.special = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"-cnv", "--conversation"}, format( @@ -909,7 +904,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_sparam()); add_opt(llama_arg( {"-s", "--seed"}, "SEED", - format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed), + format("RNG seed (default: %u, use random seed for %u)", params.sparams.seed, LLAMA_DEFAULT_SEED), [](gpt_params & params, const std::string & value) { params.sparams.seed = std::stoul(value); } @@ -1422,20 +1417,18 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { params.split_mode = LLAMA_SPLIT_MODE_LAYER; - } - else if (arg_next == "row") { + } else if (arg_next == "row") { #ifdef GGML_USE_SYCL fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); exit(1); #endif // GGML_USE_SYCL params.split_mode = LLAMA_SPLIT_MODE_ROW; - } - else { + } else { throw std::invalid_argument("invalid value"); } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n"); + } } )); add_opt(llama_arg( @@ -1455,14 +1448,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } for (size_t i = 0; i < llama_max_devices(); ++i) { if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); + params.tensor_split[i] = std::stof(split_arg[i]); } else { - params.tensor_split[i] = 0.0f; + params.tensor_split[i] = 0.0f; } } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. 
Setting a tensor split has no effect.\n"); + } } )); add_opt(llama_arg( @@ -1470,9 +1463,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), [](gpt_params & params, int value) { params.main_gpu = value; -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); + } } )); add_opt(llama_arg( diff --git a/common/common.cpp b/common/common.cpp index 5395eaa0e..c492ae0cc 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -56,14 +56,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) -#define GGML_USE_CUDA_SYCL -#endif - -#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) -#define GGML_USE_CUDA_SYCL_VULKAN -#endif - #if defined(LLAMA_USE_CURL) #ifdef __linux__ #include @@ -949,11 +941,37 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p #ifdef LLAMA_USE_CURL +#define CURL_MAX_RETRY 3 +#define CURL_RETRY_DELAY_SECONDS 2 + + static bool starts_with(const std::string & str, const std::string & prefix) { // While we wait for C++20's std::string::starts_with... return str.rfind(prefix, 0) == 0; } +static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) { + int remaining_attempts = max_attempts; + + while (remaining_attempts > 0) { + fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); + + CURLcode res = curl_easy_perform(curl); + if (res == CURLE_OK) { + return true; + } + + int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000; + fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); + + remaining_attempts--; + std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); + } + + fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); + return false; +} + static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { // Initialize libcurl @@ -1057,9 +1075,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback)); curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); - CURLcode res = curl_easy_perform(curl.get()); - if (res != CURLE_OK) { - fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); + bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); + if (!was_perform_successful) { return false; } @@ -1134,11 +1151,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat }; // start the download - fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, - 
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); - auto res = curl_easy_perform(curl.get()); - if (res != CURLE_OK) { - fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); + fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); + bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); + if (!was_perform_successful) { return false; } @@ -1812,6 +1828,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false"); fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); + fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false"); fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); diff --git a/common/sampling.cpp b/common/sampling.cpp index 21b956462..4498feb11 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -310,6 +310,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context return cur_p.data[cur_p.selected].id; } +uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) { + return llama_sampler_get_seed(gsmpl->chain); +} + // helpers llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) { diff --git a/common/sampling.h b/common/sampling.h index 0a4461fab..d0e1a9203 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -60,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl); + // helpers // access the internal list of current candidate tokens diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0a9bbc829..01a8a50a2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -302,6 +302,8 @@ class Model: gguf.MODEL_TENSOR.TIME_MIX_FIRST, gguf.MODEL_TENSOR.TIME_MIX_W1, gguf.MODEL_TENSOR.TIME_MIX_W2, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, ) ) or not new_name.endswith(".weight") @@ -624,6 +626,9 @@ class Model: if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae": # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct res = "exaone" + if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": + # ref: https://huggingface.co/microsoft/phi-2 + res = "phi-2" if res is None: logger.warning("\n") @@ -2769,6 +2774,8 @@ class Rwkv6Model(Model): self.gguf_writer.add_tokenizer_model("rwkv") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] diff --git a/convert_hf_to_gguf_update.py 
b/convert_hf_to_gguf_update.py index ff4955f9c..021f65abd 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -31,6 +31,7 @@ import re import requests import sys import json +import shutil from hashlib import sha256 from enum import IntEnum, auto @@ -97,6 +98,7 @@ models = [ {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", }, {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", }, {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, + {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, ] @@ -125,12 +127,27 @@ def download_model(model): if tokt == TOKENIZER_TYPE.UGM: files.append("spiece.model") - for file in files: - save_path = f"models/tokenizers/{name}/{file}" - if os.path.isfile(save_path): - logger.info(f"{name}: File {save_path} already exists - skipping") - continue - download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) + if os.path.isdir(repo): + # If repo is a path on the file system, copy the directory + for file in files: + src_path = os.path.join(repo, file) + dst_path = f"models/tokenizers/{name}/{file}" + if os.path.isfile(dst_path): + logger.info(f"{name}: File {dst_path} already exists - skipping") + continue + if os.path.isfile(src_path): + shutil.copy2(src_path, dst_path) + logger.info(f"{name}: Copied {src_path} to {dst_path}") + else: + logger.warning(f"{name}: Source file {src_path} does not exist") + else: + # If repo is a URL, download the files + for file in files: + save_path = f"models/tokenizers/{name}/{file}" + if os.path.isfile(save_path): + logger.info(f"{name}: File {save_path} already exists - skipping") + continue + download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) for model in models: diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index ddd347a2a..d1c94e580 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -363,7 +363,13 @@ if __name__ == '__main__': yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B))) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - dest = super().modify_tensors(data_torch, name, bid) + dest = list(super().modify_tensors(data_torch, name, bid)) + # some archs may have the same tensor for lm_head and output (tie word embeddings) + # in this case, adapters targeting lm_head will fail when using llama-export-lora + # therefore, we ignore them for now + # see: https://github.com/ggerganov/llama.cpp/issues/9065 + if name == "lm_head.weight" and len(dest) == 0: + raise ValueError("lm_head is present in adapter, but is ignored in base model") for dest_name, dest_data in dest: assert isinstance(dest_data, LoraTorchTensor) lora_a, lora_b = dest_data.get_lora_A_B() diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index a91e7f4bd..89a4566c4 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -3,32 +3,10 @@ #include "llama.h" #include -#include #include #include #include -// mutates the input string -static std::vector parse_list(char * p) { - std::vector ret; - - char * q = p; - - while (*p) { - if (*p == ',') { - *p = '\0'; - ret.push_back(std::atoi(q)); - q = p + 1; - } - - ++p; - } - - ret.push_back(std::atoi(q)); - - return ret; -} - static void 
print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 97622f4f4..922daf528 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -183,7 +183,7 @@ int main(int argc, char ** argv) { ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); - TENSOR_DUMP(gf->nodes[0]); + TENSOR_DUMP(ggml_graph_node(gf, 0)); printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); @@ -224,7 +224,7 @@ int main(int argc, char ** argv) { // Let's use the F32 result from above as a reference for the quantized multiplication - float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]); + float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0)); printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); printf("=====================================================================================\n"); @@ -252,7 +252,7 @@ int main(int argc, char ** argv) { // Check that the matrix multiplication result is in the right ballpark // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different - float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]); + float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0)); float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference); float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6 diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index 05c66856c..a969c486d 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -226,8 +226,8 @@ static ggml_status compute_piter( result.eigenvectors.resize(params.n_batch); result.distances.resize(params.n_batch); // get output nodes - for (int i = 0; i < gf->n_nodes; ++i) { - auto node = gf->nodes[i]; + for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { + auto node = ggml_graph_node(gf, i); int iter = -1; // find b_tensor (without copying data from device) if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index da7c79253..db00c6363 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -90,8 +90,6 @@ int main(int argc, char ** argv) { print_build_info(); - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index ff324926a..90126ad1e 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -370,7 +370,7 @@ struct lora_merge_ctx { // write data to output file { - auto result = gf->nodes[gf->n_nodes - 1]; + auto * result = ggml_graph_node(gf, -1); size_t len = ggml_nbytes(result); if (read_buf.size() < len) { read_buf.resize(len); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 9a527e244..7e252ce09 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -159,8 +159,6 @@ int main(int argc, char ** argv) { print_build_info(); - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - LOG("%s: llama backend init\n", __func__); 
llama_backend_init(); llama_numa_init(params.numa); @@ -301,6 +299,9 @@ int main(int argc, char ** argv) { LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); } } + smpl = gpt_sampler_init(model, sparams); + + LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl)); LOG_TEE("sampling: \n%s\n", sparams.print().c_str()); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); @@ -340,8 +341,6 @@ int main(int argc, char ** argv) { std::vector embd; - smpl = gpt_sampler_init(model, sparams); - while (n_remain != 0 || params.interactive) { // predict if (!embd.empty()) { diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 06a65fba4..4f783f3ce 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: ```sh -python ./examples/llava/convert_image_encoder_to_gguf \ +python ./examples/llava/convert_image_encoder_to_gguf.py \ -m path/to/clip-vit-large-patch14-336 \ --llava-projector path/to/MobileVLM-1.7B/llava.projector \ --output-dir path/to/MobileVLM-1.7B \ @@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf \ ``` ```sh -python ./examples/llava/convert_image_encoder_to_gguf \ +python ./examples/llava/convert_image_encoder_to_gguf.py \ -m path/to/clip-vit-large-patch14-336 \ --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \ --output-dir path/to/MobileVLM-1.7B_V2 \ @@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf \ 4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: ```sh -python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B +python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown ``` -5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` +5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k` ```sh -./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s +./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s ``` Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory. 
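
For context on the seed-handling hunks above (common/arg.cpp no longer maps `LLAMA_DEFAULT_SEED` to `time(NULL)`, and the infill/main examples now log the seed only after constructing the sampler): the new `gpt_sampler_get_seed()` helper reads back the seed the sampler chain actually ended up using. Below is a minimal sketch of the resulting pattern, not part of the patch; it assumes the model is already loaded and that `gpt_sampler_free()` from the existing common API is available.

```cpp
// Sketch only (not part of this patch): the seed-reporting pattern that the
// infill/main/server hunks in this diff converge on.
// gpt_sampler_init() and gpt_sampler_get_seed() come from common/sampling.h as
// modified here; gpt_sampler_free() and the loaded llama_model are assumed
// from the existing common API.
#include <cstdio>

#include "common.h"
#include "sampling.h"

static void report_effective_seed(llama_model * model, gpt_params & params) {
    gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
    if (smpl == nullptr) {
        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
        return;
    }

    // the sampler chain resolves LLAMA_DEFAULT_SEED to a random seed internally,
    // so the value reported here can differ from params.sparams.seed
    fprintf(stderr, "sampling seed: %u\n", gpt_sampler_get_seed(smpl));

    gpt_sampler_free(smpl);
}
```

The same helper is what the server uses for its new per-slot `seed_cur` field further down in this diff.
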
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9b890571e..5dfb333d1 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -2449,7 +2449,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_graph_compute(ctx->backend, gf); // the last node is the embedding tensor - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 851af0f00..e162586ed 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -184,7 +184,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); ggml_build_forward_expand(gf, flatten); ggml_graph_compute_with_ctx(model.ctx, gf, 1); - struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor* result = ggml_graph_node(gf, -1); memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 3475bbce5..afc74d279 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -18,8 +18,8 @@ struct llava_context { }; static void show_additional_info(int /*argc*/, char ** argv) { - LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); + LOG_TEE("\nexample usage:\n\n%s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); } static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { @@ -255,7 +255,7 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { return 1; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b986a865a..f41be5308 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -191,8 +191,6 @@ int main(int argc, char ** argv) { print_build_info(); - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - LOG("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); @@ -470,8 +468,10 @@ int main(int argc, char ** argv) { exit(1); } + LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl)); LOG_TEE("sampling params: \n%s\n", sparams.print().c_str()); - LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str()); + LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str()); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); // group-attention state diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index c7d617988..04df65b0a 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2007,8 +2007,6 @@ int main(int argc, char ** argv) { 
print_build_info(); - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 3ee4eb971..62680cda4 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET llama-quantize) add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/server/README.md b/examples/server/README.md index 79196e9c1..44a73ca0a 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -407,9 +407,44 @@ Notice that each `probs` is an array of length `n_probs`. *Options:* - `content`: Set the text to tokenize. + `content`: (Required) The text to tokenize. - `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` + `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` + + `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false` + +**Response:** + +Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise. + + +If `with_pieces` is `false`: +```json +{ + "tokens": [123, 456, 789] +} +``` + +If `with_pieces` is `true`: +```json +{ + "tokens": [ + {"id": 123, "piece": "Hello"}, + {"id": 456, "piece": " world"}, + {"id": 789, "piece": "!"} + ] +} +``` + +With input 'รก' (utf8 hex: C3 A1) on tinyllama/stories260k +```json +{ + "tokens": [ + {"id": 198, "piece": [195]}, // hex C3 + {"id": 164, "piece": [161]} // hex A1 + ] +} +``` ### POST `/detokenize`: Convert tokens to text diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 22343cea5..a9964812d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1266,6 +1266,7 @@ struct server_context { {"n_predict", slot.n_predict}, // Server configured n_predict {"model", params.model_alias}, {"seed", slot.sparams.seed}, + {"seed_cur", slot.smpl ? 
gpt_sampler_get_seed(slot.smpl) : 0}, {"temperature", slot.sparams.temp}, {"dynatemp_range", slot.sparams.dynatemp_range}, {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, @@ -3017,12 +3018,39 @@ int main(int argc, char ** argv) { const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) { const json body = json::parse(req.body); - std::vector tokens; + json tokens_response = json::array(); if (body.count("content") != 0) { const bool add_special = json_value(body, "add_special", false); - tokens = ctx_server.tokenize(body.at("content"), add_special); + const bool with_pieces = json_value(body, "with_pieces", false); + std::vector tokens = ctx_server.tokenize(body.at("content"), add_special); + + if (with_pieces) { + for (const auto& token : tokens) { + std::string piece = llama_token_to_piece(ctx_server.ctx, token); + json piece_json; + + // Check if the piece is valid UTF-8 + if (is_valid_utf8(piece)) { + piece_json = piece; + } else { + // If not valid UTF-8, store as array of byte values + piece_json = json::array(); + for (unsigned char c : piece) { + piece_json.push_back(static_cast(c)); + } + } + + tokens_response.push_back({ + {"id", token}, + {"piece", piece_json} + }); + } + } else { + tokens_response = tokens; + } } - const json data = format_tokenizer_response(tokens); + + const json data = format_tokenizer_response(tokens_response); res_ok(res, data); }; diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 22b889e12..a66723eb1 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1,3 +1,6 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + import asyncio import json import os @@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context): context.tokenize_add_special = True +@step("tokenizing with pieces") +@async_run_until_complete +async def step_tokenize_with_pieces(context): + context.tokenized_text = context_text(context) + async with aiohttp.ClientSession() as session: + tokenize_args = {"content": context.tokenized_text, "with_pieces": True} + if getattr(context, "tokenize_add_special", None) is not None: + tokenize_args["add_special"] = context.tokenize_add_special + + async with session.post( + f"{context.base_url}/tokenize", json=tokenize_args + ) as response: + assert response.status == 200 + tokenize_json = await response.json() + context.tokens_with_pieces = tokenize_json["tokens"] + + +@step("tokens are given with pieces") +@async_run_until_complete +async def step_tokenize_with_pieces(context): + # Verify that the response contains both token IDs and pieces + assert all( + "id" in token and "piece" in token for token in context.tokens_with_pieces + ) + + @step('tokenizing') @async_run_until_complete async def step_tokenize(context): diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index edfce65b6..adb1a1cb9 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso return res; } -static json format_tokenizer_response(const std::vector & tokens) { +static bool is_valid_utf8(const std::string & str) { + const unsigned char* bytes = reinterpret_cast(str.data()); + const unsigned char* end = bytes + str.length(); + + while (bytes < end) { + if (*bytes <= 0x7F) { + // 1-byte sequence (0xxxxxxx) + bytes++; + } else if ((*bytes & 0xE0) == 0xC0) { + // 2-byte 
sequence (110xxxxx 10xxxxxx) + if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80) + return false; + bytes += 2; + } else if ((*bytes & 0xF0) == 0xE0) { + // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx) + if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80) + return false; + bytes += 3; + } else if ((*bytes & 0xF8) == 0xF0) { + // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) + if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 || + (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80) + return false; + bytes += 4; + } else { + // Invalid UTF-8 lead byte + return false; + } + } + + return true; +} + +static json format_tokenizer_response(const json & tokens) { return json { {"tokens", tokens} }; diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index 111366fb0..a8cf0aa64 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -4,33 +4,23 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: MIT -INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" source /opt/intel/oneapi/setvars.sh -if [ $# -gt 0 ]; then - GGML_SYCL_DEVICE=$1 - GGML_SYCL_SINGLE_GPU=1 -else - GGML_SYCL_DEVICE=0 - GGML_SYCL_SINGLE_GPU=0 -fi - #export GGML_SYCL_DEBUG=1 - #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer. -if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then +INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:" +MODEL_FILE=llama-2-7b.Q4_0.gguf +NGL=33 + +if [ $# -gt 0 ]; then + GGML_SYCL_DEVICE=$1 echo "use $GGML_SYCL_DEVICE as main GPU" #use signle GPU only - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none + else #use multiple GPUs with same max compute units - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 fi - -#use main GPU only -#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none - -#use multiple GPUs with same max compute units -#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 diff --git a/flake.lock b/flake.lock index 10e1f8a29..e9382ff3d 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1725024810, - "narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=", + "lastModified": 1725234343, + "narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "af510d4a62d071ea13925ce41c95e3dec816c01d", + "rev": "567b938d64d4b4112ee253b9274472dc3a346eb6", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1724819573, - "narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=", + "lastModified": 1725634671, + "narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "71e91c409d1e654808b2621f28a327acfdad8dc2", + "rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c", "type": "github" }, "original": { @@ 
-36,14 +36,14 @@ }, "nixpkgs-lib": { "locked": { - "lastModified": 1722555339, - "narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=", + "lastModified": 1725233747, + "narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=", "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz" }, "original": { "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz" } }, "root": { diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h index ca73211fe..031ad1ce2 100644 --- a/ggml/include/ggml-cann.h +++ b/ggml/include/ggml-cann.h @@ -80,6 +80,13 @@ ggml_backend_cann_buffer_type(int32_t device); */ GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void); +/** + * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU. + * + * @return A pointer to the host buffer type interface. + */ +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); + /** * @brief Retrieves the description of a specific CANN device. * diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 536018b66..13026ab32 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -358,6 +358,7 @@ extern "C" { struct ggml_object; struct ggml_context; + struct ggml_cgraph; // NOTE: always add types at the end of the enum to keep backward compatibility enum ggml_type { @@ -575,23 +576,9 @@ extern "C" { GGML_TENSOR_FLAG_PARAM = 4, }; - // ggml object - struct ggml_object { - size_t offs; - size_t size; - - struct ggml_object * next; - - enum ggml_object_type type; - - char padding[4]; - }; - - static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - // n-dimensional tensor struct ggml_tensor { - enum ggml_type type; + enum ggml_type type; GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor"); @@ -655,7 +642,7 @@ extern "C" { struct ggml_threadpool; // forward declaration, see ggml.c - typedef struct ggml_threadpool * ggml_threadpool_t; + typedef struct ggml_threadpool * ggml_threadpool_t; // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 @@ -671,35 +658,6 @@ extern "C" { void * abort_callback_data; }; - enum ggml_cgraph_eval_order { - GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, - GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, - GGML_CGRAPH_EVAL_ORDER_COUNT - }; - - typedef uint32_t ggml_bitset_t; - - struct ggml_hash_set { - size_t size; - ggml_bitset_t * used; // whether or not the keys are in use i.e. 
set - struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) - }; - - // computation graph - struct ggml_cgraph { - int size; - int n_nodes; - int n_leafs; - - struct ggml_tensor ** nodes; - struct ggml_tensor ** grads; - struct ggml_tensor ** leafs; - - struct ggml_hash_set visited_hash_set; - - enum ggml_cgraph_eval_order order; - }; - // scratch buffer struct ggml_scratch { size_t offs; @@ -2017,8 +1975,6 @@ extern "C" { typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); - #define GGML_N_TASKS_MAX -1 - GGML_API struct ggml_tensor * ggml_map_custom1( struct ggml_context * ctx, struct ggml_tensor * a, @@ -2088,30 +2044,35 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * tensor); - GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); // graph allocation in a context - GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false - GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads); - GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); - GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1); - GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads - GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); + GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false + GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); + GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); + GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads + GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); + + GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph); + GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i] + GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph); + GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph); + + GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); - GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); - GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads); - GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1); - GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params); - GGML_API void ggml_threadpool_free (struct ggml_threadpool * 
threadpool); - GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); - GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); - GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); + GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); + GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); + GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); + GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); + GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); + GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); + GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); + GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data @@ -2509,6 +2470,7 @@ extern "C" { GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_ssse3 (void); + GGML_API int ggml_cpu_has_riscv_v (void); GGML_API int ggml_cpu_has_sycl (void); GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_vsx (void); diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index 713731735..6d99c6bea 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -1,3 +1,4 @@ +#include "ggml-impl.h" #include "ggml-blas.h" #include "ggml-backend-impl.h" diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index 06930ba2e..aa315b83f 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -30,6 +30,7 @@ #include #include +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-cann/aclnn_ops.h" #include "ggml-cann/common.h" @@ -1220,6 +1221,116 @@ ggml_backend_cann_buffer_type(int32_t device) { return &ggml_backend_cann_buffer_types[device]; } +/** + * @brief Retrieves the name associated with a CANN host buffer type. + * + * This function returns the descriptive name associated with the specified + * CANN host buffer type context. + * + * @param buft Pointer to the host buffer type context. + * @return Const pointer to the C-style string containing the name. + */ +GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "CANN_Host"; + + GGML_UNUSED(buft); +} + +/** + * @brief Retrieves the name associated with a CANN host buffer. + * + * This function returns the descriptive name associated with the specified + * CANN host buffer context. + * + * @param buft Pointer to the host buffer context. + * @return Const pointer to the C-style string containing the name. + */ +GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) { + return "CANN_Host"; + + GGML_UNUSED(buffer); +} + +/** + * @brief Free resources associated with a CANN host buffer. + * + * This function frees the resources associated with a CANN host buffer, including + * its context. + * + * @param buffer The CANN host buffer to free. + */ +GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { + ACL_CHECK(aclrtFreeHost(buffer->context)); +} + +/** + * @brief Allocates a new CANN host buffer of the specified size. 
+ * + * This function allocates a new CANN host buffer with the given size. + * @param size Size in bytes of the host buffer to allocate. + * @return Pointer to the allocated host buffer, or nullptr if allocation fails. + */ +static void * ggml_cann_host_malloc(size_t size) { + if (getenv("GGML_CANN_NO_PINNED") != nullptr) { + return nullptr; + } + + void * hostPtr = nullptr; + aclError err = aclrtMallocHost((void **) &hostPtr, size); + if (err != ACL_SUCCESS) { + + GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, + size / 1024.0 / 1024.0, aclGetRecentErrMsg()); + return nullptr; + } + return hostPtr; +} + +/** + * @brief Allocates a new CANN host buffer of the specified type and size. + * + * @param buft Pointer to the host buffer type context. + * @param size Size in bytes of the host buffer to allocate. + * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails. + */ +GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * hostPtr = ggml_cann_host_malloc(size); + + if (hostPtr == nullptr) { + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size); + buffer->buft = buft; + buffer->iface.get_name = ggml_backend_cann_host_buffer_name; + buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free; + + return buffer; +} + +/** + * @brief Interface for managing CANN host buffer types in the GGML backend. + * + * Provides function pointers for allocating, querying properties, and managing + * memory for CANN buffer types in the GGML backend. + */ +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cann_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_cann_buffer_type_host; +} + /** * @brief Computes the forward operation for a given tensor using CANN * operations. @@ -1942,7 +2053,7 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) { GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return nullptr; } - + ggml_cann_set_device(ctx->device); ggml_backend_t cann_backend = new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), /* .interface = */ ggml_backend_cann_interface, diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index d53de4edd..54f1a7c2d 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1,5 +1,5 @@ #include "ggml-cuda.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-cuda/common.cuh" diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 78d70cd7a..4935f8818 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q( // nrows_dst == nrows of the matrix that the kernel writes into const int64_t nrows_dst = id == ctx.device ? 
ne0 : row_diff; - const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst}; + // The stream-k decomposition is only faster for recent NVIDIA GPUs. + // Also its fixup needs to allocate a temporary buffer in the memory pool. + // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. + const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11; + const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; switch (src0->type) { case GGML_TYPE_Q4_0: diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index e8a957447..021a25682 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -2742,6 +2742,7 @@ struct mmq_args { int64_t ne00; int64_t ne01; int64_t stride01; int64_t ne10; int64_t ne11; int64_t stride11; int64_t ne0; + bool use_stream_k; }; template @@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const int ntx = (args.ne11 + mmq_x - 1) / mmq_x; const dim3 block_nums_xy_tiling(nty, ntx, 1); - const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD; - if (!use_stream_k) { + if (!args.use_stream_k) { if (args.ne01 % mmq_y == 0) { constexpr bool need_check = false; mul_mat_q<<>> diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index e50a103ac..8df571149 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -130,42 +130,3 @@ #define cudaKernelNodeParams musaKernelNodeParams #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed #define cudaStreamEndCapture musaStreamEndCapture - -// XXX: Clang builtins mapping -#define __vsub4 __vsub4_musa -#define __vcmpeq4 __vcmpeq4_musa -#define __vcmpne4 __vcmpne4_musa - -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); - -static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) { - return __vsubss4(a, b); -} - -static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0xff : 0x00; - } - return c; -} - -static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 
0x00 : 0xff; - } - return c; -} diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 961f3c67b..cb7f7728b 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -629,8 +629,16 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #endif +enum ggml_cgraph_eval_order { + GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, + GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, + GGML_CGRAPH_EVAL_ORDER_COUNT +}; + // bitset +typedef uint32_t ggml_bitset_t; + static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated"); #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8) #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1) @@ -656,6 +664,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) { #define GGML_HASHSET_FULL ((size_t)-1) #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2) +struct ggml_hash_set { + size_t size; + ggml_bitset_t * used; // whether or not the keys are in use i.e. set + struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) +}; + struct ggml_hash_set ggml_hash_set_new(size_t size); void ggml_hash_set_free(struct ggml_hash_set * hash_set); @@ -745,6 +759,24 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g GGML_ABORT("fatal error"); } +// computation graph + +struct ggml_cgraph { + int size; + int n_nodes; + int n_leafs; + + struct ggml_tensor ** nodes; + struct ggml_tensor ** grads; + struct ggml_tensor ** leafs; + + struct ggml_hash_set visited_hash_set; + + enum ggml_cgraph_eval_order order; +}; + +struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index 41ac63fa4..7f0bd82d5 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend.h" #include "ggml-backend-impl.h" #include "ggml-kompute.h" diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index f04e5af71..6c85acfec 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -1,7 +1,7 @@ #import "ggml-metal.h" +#import "ggml-impl.h" #import "ggml-backend-impl.h" -#import "ggml.h" #import @@ -17,8 +17,8 @@ #define GGML_METAL_LOG_WARN(...) #define GGML_METAL_LOG_ERROR(...) #else -#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) -#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) #define GGML_METAL_LOG_ERROR(...) 
ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #endif @@ -882,7 +882,7 @@ static enum ggml_status ggml_metal_graph_compute( // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel - const int n_nodes = gf->n_nodes; + const int n_nodes = gf->n_nodes; const int n_cb = ctx->n_cb; const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; @@ -3039,8 +3039,7 @@ static enum ggml_status ggml_metal_graph_compute( if (status != MTLCommandBufferStatusCompleted) { GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == MTLCommandBufferStatusError) { - NSString * error_code = [command_buffer error].localizedDescription; - GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]); + GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index 9c600c7ca..a8a2eb85a 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -1,5 +1,5 @@ #include "ggml-rpc.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 4f03b01e7..acef7c6d4 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -33,7 +33,7 @@ #include #include "ggml-sycl.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-sycl/backend.hpp" @@ -5137,13 +5137,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: + return true; case GGML_OP_CONT: + return op->src[0]->type != GGML_TYPE_BF16; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: return true; case GGML_OP_ROPE: return ggml_is_contiguous(op->src[0]); case GGML_OP_IM2COL: + // TODO: add support for the new F32 operations + return op->src[0]->type == GGML_TYPE_F16; case GGML_OP_POOL_2D: case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 83737c1d9..bad960510 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -21,7 +21,7 @@ #include #include -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-vulkan-shaders.hpp" diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d7157ca6d..493ff7fc0 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -287,6 +287,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) 
{ #define GGML_DEBUG 0 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 +#define GGML_N_TASKS_MAX (-1) #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 @@ -1120,21 +1121,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F32x4_ADD vaddq_f32 #define GGML_F32x4_MUL vmulq_f32 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - res = GGML_F32x4_REDUCE_ONE(x[0]); \ +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + (res) = GGML_F32x4_REDUCE_ONE((x)[0]); \ } #define GGML_F32_VEC GGML_F32x4 @@ -1161,30 +1162,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) #define GGML_F16x8_ADD vaddq_f16 #define GGML_F16x8_MUL vmulq_f16 - #define GGML_F16x8_REDUCE(res, x) \ - do { \ - int offset = GGML_F16_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ - const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ - res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ + #define GGML_F16x8_REDUCE(res, x) \ + do { \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ + const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ + (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ } while (0) #define GGML_F16_VEC GGML_F16x8 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i]) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i]) #define GGML_F16_VEC_FMA GGML_F16x8_FMA #define GGML_F16_VEC_ADD GGML_F16x8_ADD #define GGML_F16_VEC_MUL GGML_F16x8_MUL @@ -1893,6 +1894,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) #endif +// +// ggml object +// + +struct ggml_object { + size_t offs; + size_t size; + + struct ggml_object * next; + + enum ggml_object_type type; + + char padding[4]; +}; + +static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + // // ggml context // @@ -19161,6 +19179,34 @@ void ggml_graph_clear(struct ggml_cgraph 
* cgraph) { ggml_hash_set_reset(&cgraph->visited_hash_set); } +int ggml_graph_size(struct ggml_cgraph * cgraph) { + return cgraph->size; +} + +struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) { + if (i < 0) { + GGML_ASSERT(cgraph->n_nodes + i >= 0); + return cgraph->nodes[cgraph->n_nodes + i]; + } + + GGML_ASSERT(i < cgraph->n_nodes); + return cgraph->nodes[i]; +} + +struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) { + return cgraph->nodes; +} + +int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) { + return cgraph->n_nodes; +} + +void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { + GGML_ASSERT(cgraph->size > cgraph->n_nodes); + cgraph->nodes[cgraph->n_nodes] = tensor; + cgraph->n_nodes++; +} + // Android's libc implementation "bionic" does not support setting affinity #if defined(__gnu_linux__) static void set_numa_thread_affinity(int thread_n) { @@ -23242,6 +23288,14 @@ int ggml_cpu_has_arm_fma(void) { #endif } +int ggml_cpu_has_riscv_v(void) { +#if defined(__riscv_v_intrinsic) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_metal(void) { #if defined(GGML_USE_METAL) return 1; diff --git a/include/llama.h b/include/llama.h index 93b3e6e85..744ef9d90 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1056,6 +1056,9 @@ extern "C" { LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i); LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain); + // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed + LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i); + // available samplers: LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void); @@ -1127,6 +1130,10 @@ extern "C" { int32_t n_logit_bias, const llama_logit_bias * logit_bias); + + // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise + LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl); + /// @details Sample and accept a token from the idx-th output of the last evaluation // // Shorthand for: diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 6f448b80c..c828dc359 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -162,6 +163,19 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) cur_p->size = k; } +static uint32_t get_rng_seed(uint32_t seed) { + if (seed == LLAMA_DEFAULT_SEED) { + // use system clock if std::random_device is not a true RNG + static bool is_rd_prng = std::random_device().entropy() == 0; + if (is_rd_prng) { + return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count(); + } + std::random_device rd; + return rd(); + } + return seed; +} + // llama_sampler API const char * llama_sampler_name(const struct llama_sampler * smpl) { @@ -335,13 +349,26 @@ void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) { const auto * p = (const llama_sampler_chain *) chain->ctx; - if (i < 0 || i >= (int32_t) p->samplers.size()) { + if (i < 0 || (size_t) i >= p->samplers.size()) { return nullptr; } return p->samplers[i]; } +struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) { + auto * p = 
(llama_sampler_chain *) chain->ctx; + + if (i < 0 || (size_t) i >= p->samplers.size()) { + return nullptr; + } + + auto * result = p->samplers[i]; + p->samplers.erase(p->samplers.begin() + i); + + return result; +} + int llama_sampler_chain_n(const struct llama_sampler * chain) { const auto * p = (const llama_sampler_chain *) chain->ctx; @@ -387,6 +414,7 @@ struct llama_sampler * llama_sampler_init_greedy() { struct llama_sampler_dist { const uint32_t seed; + uint32_t seed_cur; std::mt19937 rng; }; @@ -416,7 +444,8 @@ static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sample static void llama_sampler_dist_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_dist *) smpl->ctx; - ctx->rng = std::mt19937(ctx->seed); + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); } static void llama_sampler_dist_free(struct llama_sampler * smpl) { @@ -433,11 +462,13 @@ static struct llama_sampler_i llama_sampler_dist_i = { }; struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { + auto seed_cur = get_rng_seed(seed); return new llama_sampler { /* .iface = */ &llama_sampler_dist_i, /* .ctx = */ new llama_sampler_dist { - /* .seed = */ seed, - /* .rng = */ std::mt19937(seed), + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .rng = */ std::mt19937(seed_cur), }, }; } @@ -1032,6 +1063,7 @@ struct llama_sampler_mirostat { const int32_t n_vocab; const uint32_t seed; + uint32_t seed_cur; const float tau; const float eta; @@ -1100,7 +1132,8 @@ static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sa static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_mirostat *) smpl->ctx; ctx->mu = 2.0f*ctx->tau; - ctx->rng = std::mt19937(ctx->seed); + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); } static void llama_sampler_mirostat_free(struct llama_sampler * smpl) { @@ -1117,16 +1150,18 @@ static struct llama_sampler_i llama_sampler_mirostat_i = { }; struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) { + auto seed_cur = get_rng_seed(seed); return new llama_sampler { /* .iface = */ &llama_sampler_mirostat_i, /* .ctx = */ new llama_sampler_mirostat { - /* .n_vocab = */ n_vocab, - /* .seed = */ seed, - /* .tau = */ tau, - /* .eta = */ eta, - /* .m = */ m, - /* .mu = */ 2.0f*tau, - /* .rng = */ std::mt19937(seed), + /* .n_vocab = */ n_vocab, + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .tau = */ tau, + /* .eta = */ eta, + /* .m = */ m, + /* .mu = */ 2.0f*tau, + /* .rng = */ std::mt19937(seed_cur), }, }; } @@ -1135,6 +1170,7 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see struct llama_sampler_mirostat_v2 { const uint32_t seed; + uint32_t seed_cur; const float tau; const float eta; @@ -1179,7 +1215,8 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; ctx->mu = 2.0f*ctx->tau; - ctx->rng = std::mt19937(ctx->seed); + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); } static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) { @@ -1212,14 +1249,16 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = { }; struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) { + auto seed_cur = 
get_rng_seed(seed); return new llama_sampler { /* .iface = */ &llama_sampler_mirostat_v2_i, /* .ctx = */ new llama_sampler_mirostat_v2 { - /* .seed = */ seed, - /* .tau = */ tau, - /* .eta = */ eta, - /* .mu = */ 2.0f*tau, - /* .rng = */ std::mt19937(seed), + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .tau = */ tau, + /* .eta = */ eta, + /* .mu = */ 2.0f*tau, + /* .rng = */ std::mt19937(seed_cur), }, }; } @@ -1505,6 +1544,8 @@ struct llama_sampler * llama_sampler_init_penalties( ignore_eos = false; } + penalty_last_n = std::max(penalty_last_n, 0); + return new llama_sampler { /* .iface = */ &llama_sampler_penalties_i, /* .ctx = */ new llama_sampler_penalties { @@ -1568,6 +1609,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to } } } + static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx; return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data()); @@ -1599,3 +1641,31 @@ struct llama_sampler * llama_sampler_init_logit_bias( }, }; } + +// utils + +uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { + if (smpl->iface == &llama_sampler_dist_i) { + return ((const llama_sampler_dist *) smpl->ctx)->seed_cur; + } + + if (smpl->iface == &llama_sampler_mirostat_i) { + return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur; + } + + if (smpl->iface == &llama_sampler_mirostat_v2_i) { + return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur; + } + + if (smpl->iface == &llama_sampler_chain_i) { + const auto * ctx = (const llama_sampler_chain *) smpl->ctx; + for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) { + const uint32_t seed = llama_sampler_get_seed(*it); + if (seed != LLAMA_DEFAULT_SEED) { + return seed; + } + } + } + + return LLAMA_DEFAULT_SEED; +} diff --git a/src/llama.cpp b/src/llama.cpp index 39e20440e..cdc3f1856 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2156,6 +2156,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer if (host_buffer) { buft = ggml_backend_sycl_host_buffer_type(); } +#elif defined(GGML_USE_CANN) + if (host_buffer) { + buft = ggml_backend_cann_host_buffer_type(); + } #elif defined(GGML_USE_CPU_HBM) buft = ggml_backend_cpu_hbm_buffer_type(); #elif defined(GGML_USE_VULKAN) @@ -9258,7 +9262,7 @@ static struct ggml_tensor * llm_build_copy_mask_state( // FIXME: zero-out NANs? 
states = ggml_mul(ctx, states, state_mask); - // copy states which won't be changed further (between n_seqs and n_rs) + // copy states which won't be changed further (between n_seqs and n_kv) ggml_build_forward_expand(graph, ggml_cpy(ctx, ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)), @@ -9877,8 +9881,8 @@ struct llm_build_context { struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { // find result_norm tensor for input struct ggml_tensor * inp = nullptr; - for (int i = gf->n_nodes - 1; i >= 0; --i) { - inp = gf->nodes[i]; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + inp = ggml_graph_node(gf, i); if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { break; } else { @@ -16076,19 +16080,21 @@ static int llama_decode_internal( return -1; } - for (uint32_t i = 0; i < n_tokens_all; ++i) { - if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]); - return -1; - } - } - const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT + if (batch_all.token) { + for (uint32_t i = 0; i < n_tokens_all; ++i) { + if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]); + return -1; + } + } + } + GGML_ASSERT(n_tokens_all <= cparams.n_batch); GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); @@ -16205,8 +16211,8 @@ static int llama_decode_internal( ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); // the output is always the last tensor in the graph - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2]; + struct ggml_tensor * res = ggml_graph_node(gf, -1); + struct ggml_tensor * embd = ggml_graph_node(gf, -2); if (lctx.n_outputs == 0) { // no output @@ -16215,9 +16221,9 @@ static int llama_decode_internal( } else if (cparams.embeddings) { res = nullptr; // do not extract logits for embedding case embd = nullptr; - for (int i = gf->n_nodes - 1; i >= 0; --i) { - if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) { - embd = gf->nodes[i]; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { + embd = ggml_graph_node(gf, i); break; } } @@ -16375,19 +16381,21 @@ static int llama_encode_internal( return -1; } - for (uint32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]); - return -1; - } - } - const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + if (batch.token) { + for (uint32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]); + return -1; + } + } + } + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder 
requires n_ubatch >= n_tokens"); @@ -16432,15 +16440,15 @@ static int llama_encode_internal( // there are two cases here if (llama_model_has_decoder(&lctx.model)) { // first case is an encoder-decoder T5 model where embeddings are passed to decoder - embd = gf->nodes[gf->n_nodes - 1]; + embd = ggml_graph_node(gf, -1); GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); } else { // second case is an encoder-only T5 model if (cparams.embeddings) { // only output embeddings if required - embd = gf->nodes[gf->n_nodes - 1]; + embd = ggml_graph_node(gf, -1); if (strcmp(embd->name, "result_embd_pooled") != 0) { - embd = gf->nodes[gf->n_nodes - 2]; + embd = ggml_graph_node(gf, -2); } GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); } @@ -17530,6 +17538,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= name.find("time_mix_first.weight") == std::string::npos; quantize &= name.find("time_mix_w1.weight") == std::string::npos; quantize &= name.find("time_mix_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; // do not quantize relative position bias (T5) quantize &= name.find("attn_rel_b.weight") == std::string::npos; @@ -18486,7 +18496,7 @@ struct llama_context * llama_new_context_with_model( // note: the number of splits during measure is higher than during inference due to the kv shift int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes); + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, ggml_graph_n_nodes(gf)); LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits); } } @@ -20666,6 +20676,7 @@ const char * llama_print_system_info(void) { s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | "; s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 635de01d7..aa7896def 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -519,7 +519,7 @@ struct test_case { // add sentinels as graph nodes so that they are checked in the callback for (ggml_tensor * sentinel : sentinels) { - gf->nodes[gf->n_nodes++] = sentinel; + ggml_graph_add_node(gf, sentinel); } // randomize tensors @@ -679,9 +679,9 @@ struct test_case { // duplicate the op size_t target_size = ggml_backend_is_cpu(backend) ? 
1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU - int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1; + int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1; for (int i = 1; i < n_runs; i++) { - gf->nodes[gf->n_nodes++] = out; + ggml_graph_add_node(gf, out); } // calculate memory @@ -696,11 +696,11 @@ struct test_case { } return size; }; - for (int i = 0; i < gf->n_nodes; i++) { - if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) { + for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { + if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) { continue; } - mem += tensor_op_size(gf->nodes[i]); + mem += tensor_op_size(ggml_graph_node(gf, i)); } // run @@ -804,7 +804,7 @@ struct test_case { ggml_graph_cpy(gf, gb); ggml_build_backward_expand(ctx, gf, gb, false); if (expect.size() != 1 || expect[0] != 0.0f) { - GGML_ASSERT(gb->n_nodes > gf->n_nodes); + GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf)); for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE); }
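
Usage sketch for the new graph accessors introduced above (ggml_graph_n_nodes, ggml_graph_node, ggml_graph_add_node). This is a minimal illustration, not part of the patch; it assumes the accessors are exported through the public ggml.h header (their declarations are not shown in this portion of the diff) and mirrors how append_pooling and llama_decode_internal now locate named output tensors without touching ggml_cgraph internals:

    #include <string.h>
    #include "ggml.h"

    // walk the graph back to front and return the first tensor with the given name,
    // using only the new accessors instead of gf->nodes / gf->n_nodes
    static struct ggml_tensor * find_output(struct ggml_cgraph * gf, const char * name) {
        for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
            struct ggml_tensor * t = ggml_graph_node(gf, i);
            if (strcmp(t->name, name) == 0) {
                return t;
            }
        }
        return NULL; // not found
    }

    // ggml_graph_node() also accepts negative indices, counted from the end:
    //   struct ggml_tensor * res  = ggml_graph_node(gf, -1); // last node
    //   struct ggml_tensor * embd = ggml_graph_node(gf, -2); // second to last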
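
Similarly, a hedged sketch of the new sampler entry points added in include/llama.h and src/llama-sampling.cpp (llama_sampler_chain_remove and llama_sampler_get_seed); the surrounding chain helpers (llama_sampler_chain_init, llama_sampler_chain_add, llama_sampler_init_top_k, llama_sampler_init_dist, llama_sampler_free) are the pre-existing API:

    #include "llama.h"

    static void sampler_chain_demo(void) {
        // build a small chain: top-k filtering followed by the final distribution sampler
        struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

        // the chain reports the seed of its last seeded sampler; with LLAMA_DEFAULT_SEED
        // the dist sampler now derives an actual seed via get_rng_seed(), so this is
        // the value to log for reproducibility
        const uint32_t seed = llama_sampler_get_seed(chain);
        (void) seed;

        // removing a sampler transfers ownership back to the caller,
        // so the removed sampler must be freed separately from the chain
        struct llama_sampler * dist = llama_sampler_chain_remove(chain, 1);
        llama_sampler_free(dist);
        llama_sampler_free(chain); // frees only the samplers the chain still owns
    }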