Merge branch 'ggerganov:master' into master

commit 69c97bbead
Vinesh Janarthanan, 2024-09-12 22:14:53 -05:00 (committed by GitHub)
50 changed files with 712 additions and 343 deletions


@@ -375,7 +375,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Dependencies
 id: depends
@@ -401,7 +401,7 @@ jobs:
 continue-on-error: true
 steps:
-- uses: actions/checkout@v2
+- uses: actions/checkout@v4
 - name: add oneAPI to apt
 shell: bash
@@ -442,7 +442,7 @@ jobs:
 continue-on-error: true
 steps:
-- uses: actions/checkout@v2
+- uses: actions/checkout@v4
 - name: add oneAPI to apt
 shell: bash
@@ -546,7 +546,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v1
+uses: actions/checkout@v4
 - name: Dependencies
 id: depends
@@ -576,7 +576,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v1
+uses: actions/checkout@v4
 - name: Dependencies
 id: depends
@@ -610,7 +610,7 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v1
+uses: actions/checkout@v4
 - name: Dependencies
 id: depends
@@ -969,14 +969,14 @@ jobs:
 steps:
 - name: Clone
 id: checkout
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Install
 id: depends
 run: |
 $ErrorActionPreference = "Stop"
 write-host "Downloading AMD HIP SDK Installer"
-Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
 write-host "Installing AMD HIP SDK"
 Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
 write-host "Completed AMD HIP SDK installation"


@@ -173,6 +173,7 @@ jobs:
 if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
 run: |
 cd examples/server/tests
+$env:PYTHONIOENCODING = ":replace"
 behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 - name: Slow tests


@@ -139,10 +139,16 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
 # determining _precisely_ which defines are necessary for the llama-config
 # package.
 #
+set(GGML_TRANSIENT_DEFINES)
 get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
 get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
+if (GGML_DIR_DEFINES)
+list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
+endif()
 get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
-set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
+if (GGML_TARGET_DEFINES)
+list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
+endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
 set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)


@@ -434,7 +434,7 @@ endif
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue
-ifndef RISCV
+ifndef RISCV_CROSS_COMPILE
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 # Use all CPU extensions that are available:
@@ -514,7 +514,12 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),)
 MK_CXXFLAGS += -mlasx
 endif
+else ifneq ($(filter riscv64%,$(UNAME_M)),)
+MK_CFLAGS += -march=rv64gcv -mabi=lp64d
+MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
+endif
+else # RISC-V CROSS COMPILATION
 MK_CFLAGS += -march=rv64gcv -mabi=lp64d
 MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
@@ -1454,7 +1459,6 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-./llama-gen-docs
 libllava.a: examples/llava/llava.cpp \
 examples/llava/llava.h \


@@ -89,6 +89,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
+- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))


@@ -173,7 +173,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
 std::string arg;
 const std::string arg_prefix = "--";
 gpt_params & params = ctx_arg.params;
-gpt_sampler_params & sparams = params.sparams;
 std::unordered_map<std::string, llama_arg *> arg_to_options;
 for (auto & opt : ctx_arg.options) {
@@ -283,10 +282,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
 params.kv_overrides.back().key[0] = 0;
 }
-if (sparams.seed == LLAMA_DEFAULT_SEED) {
-sparams.seed = time(NULL);
-}
 return true;
 }
@@ -823,7 +818,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
 [](gpt_params & params) {
 params.special = true;
 }
-).set_examples({LLAMA_EXAMPLE_MAIN}));
+).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
 add_opt(llama_arg(
 {"-cnv", "--conversation"},
 format(
@@ -909,7 +904,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
 ).set_sparam());
 add_opt(llama_arg(
 {"-s", "--seed"}, "SEED",
-format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed),
+format("RNG seed (default: %u, use random seed for %u)", params.sparams.seed, LLAMA_DEFAULT_SEED),
 [](gpt_params & params, const std::string & value) {
 params.sparams.seed = std::stoul(value);
 }
@@ -1422,20 +1417,18 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
 params.split_mode = LLAMA_SPLIT_MODE_NONE;
 } else if (arg_next == "layer") {
 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
-}
-else if (arg_next == "row") {
+} else if (arg_next == "row") {
 #ifdef GGML_USE_SYCL
 fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
 exit(1);
 #endif // GGML_USE_SYCL
 params.split_mode = LLAMA_SPLIT_MODE_ROW;
-}
-else {
+} else {
 throw std::invalid_argument("invalid value");
 }
-#ifndef GGML_USE_CUDA_SYCL_VULKAN
-fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL_VULKAN
+if (!llama_supports_gpu_offload()) {
+fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
+}
 }
 ));
 add_opt(llama_arg(
@@ -1455,14 +1448,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
 }
 for (size_t i = 0; i < llama_max_devices(); ++i) {
 if (i < split_arg.size()) {
 params.tensor_split[i] = std::stof(split_arg[i]);
 } else {
 params.tensor_split[i] = 0.0f;
 }
 }
-#ifndef GGML_USE_CUDA_SYCL_VULKAN
-fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL_VULKAN
+if (!llama_supports_gpu_offload()) {
+fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
+}
 }
 ));
 add_opt(llama_arg(
@@ -1470,9 +1463,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
 format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
 [](gpt_params & params, int value) {
 params.main_gpu = value;
-#ifndef GGML_USE_CUDA_SYCL_VULKAN
-fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL_VULKAN
+if (!llama_supports_gpu_offload()) {
+fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
+}
 }
 ));
 add_opt(llama_arg(


@@ -56,14 +56,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
-#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
-#define GGML_USE_CUDA_SYCL
-#endif
-#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
-#define GGML_USE_CUDA_SYCL_VULKAN
-#endif
 #if defined(LLAMA_USE_CURL)
 #ifdef __linux__
 #include <linux/limits.h>
@@ -949,11 +941,37 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #ifdef LLAMA_USE_CURL
+#define CURL_MAX_RETRY 3
+#define CURL_RETRY_DELAY_SECONDS 2
 static bool starts_with(const std::string & str, const std::string & prefix) {
 // While we wait for C++20's std::string::starts_with...
 return str.rfind(prefix, 0) == 0;
 }
+static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+int remaining_attempts = max_attempts;
+while (remaining_attempts > 0) {
+fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+CURLcode res = curl_easy_perform(curl);
+if (res == CURLE_OK) {
+return true;
+}
+int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
+fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
+remaining_attempts--;
+std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+}
+fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+return false;
+}
 static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
 // Initialize libcurl
@@ -1057,9 +1075,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
 curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-CURLcode res = curl_easy_perform(curl.get());
-if (res != CURLE_OK) {
-fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+if (!was_perform_successful) {
 return false;
 }
@@ -1134,11 +1151,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 };
 // start the download
-fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
 llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-auto res = curl_easy_perform(curl.get());
-if (res != CURLE_OK) {
-fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+if (!was_perform_successful) {
 return false;
 }
@@ -1812,6 +1828,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
 fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
 fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
 fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
+fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
 fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
 fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
 fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
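For reference, the backoff schedule produced by the new `curl_perform_with_retry()` helper is geometric in the retry delay. The sketch below is illustrative only (not part of the commit); it simply evaluates the same formula with the defaults added above (`CURL_MAX_RETRY = 3`, `CURL_RETRY_DELAY_SECONDS = 2`):

```cpp
// Illustrative only: reproduce the delay formula used in curl_perform_with_retry().
// After the k-th failed attempt (k starting at 0) the code sleeps
// pow(CURL_RETRY_DELAY_SECONDS, k) * 1000 milliseconds.
#include <cmath>
#include <cstdio>

int main() {
    const int max_attempts        = 3; // CURL_MAX_RETRY
    const int retry_delay_seconds = 2; // CURL_RETRY_DELAY_SECONDS
    for (int k = 0; k < max_attempts; ++k) {
        int delay_ms = (int) (std::pow(retry_delay_seconds, k) * 1000);
        std::printf("after failed attempt %d: wait %d ms\n", k + 1, delay_ms);
    }
    return 0; // prints 1000, 2000, 4000
}
```

Note that, as written, the helper also sleeps after the final failed attempt before returning false.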


@@ -310,6 +310,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
 return cur_p.data[cur_p.selected].id;
 }
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+return llama_sampler_get_seed(gsmpl->chain);
+}
 // helpers
 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {


@@ -60,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
 // helpers
 // access the internal list of current candidate tokens


@@ -302,6 +302,8 @@ class Model:
 gguf.MODEL_TENSOR.TIME_MIX_FIRST,
 gguf.MODEL_TENSOR.TIME_MIX_W1,
 gguf.MODEL_TENSOR.TIME_MIX_W2,
+gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
 )
 )
 or not new_name.endswith(".weight")
@@ -624,6 +626,9 @@ class Model:
 if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
 # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
 res = "exaone"
+if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+# ref: https://huggingface.co/microsoft/phi-2
+res = "phi-2"
 if res is None:
 logger.warning("\n")
@@ -2769,6 +2774,8 @@ class Rwkv6Model(Model):
 self.gguf_writer.add_tokenizer_model("rwkv")
 self.gguf_writer.add_token_list(tokens)
 self.gguf_writer.add_token_types(toktypes)
+special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+special_vocab.add_to_gguf(self.gguf_writer)
 def set_gguf_parameters(self):
 block_count = self.hparams["num_hidden_layers"]


@@ -31,6 +31,7 @@ import re
 import requests
 import sys
 import json
+import shutil
 from hashlib import sha256
 from enum import IntEnum, auto
@@ -97,6 +98,7 @@ models = [
 {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
 {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
 {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
+{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
 ]
@@ -125,12 +127,27 @@ def download_model(model):
 if tokt == TOKENIZER_TYPE.UGM:
 files.append("spiece.model")
-for file in files:
-save_path = f"models/tokenizers/{name}/{file}"
-if os.path.isfile(save_path):
-logger.info(f"{name}: File {save_path} already exists - skipping")
-continue
-download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+if os.path.isdir(repo):
+# If repo is a path on the file system, copy the directory
+for file in files:
+src_path = os.path.join(repo, file)
+dst_path = f"models/tokenizers/{name}/{file}"
+if os.path.isfile(dst_path):
+logger.info(f"{name}: File {dst_path} already exists - skipping")
+continue
+if os.path.isfile(src_path):
+shutil.copy2(src_path, dst_path)
+logger.info(f"{name}: Copied {src_path} to {dst_path}")
+else:
+logger.warning(f"{name}: Source file {src_path} does not exist")
+else:
+# If repo is a URL, download the files
+for file in files:
+save_path = f"models/tokenizers/{name}/{file}"
+if os.path.isfile(save_path):
+logger.info(f"{name}: File {save_path} already exists - skipping")
+continue
+download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
 for model in models:


@@ -363,7 +363,13 @@ if __name__ == '__main__':
 yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-dest = super().modify_tensors(data_torch, name, bid)
+dest = list(super().modify_tensors(data_torch, name, bid))
+# some archs may have the same tensor for lm_head and output (tie word embeddings)
+# in this case, adapters targeting lm_head will fail when using llama-export-lora
+# therefore, we ignore them for now
+# see: https://github.com/ggerganov/llama.cpp/issues/9065
+if name == "lm_head.weight" and len(dest) == 0:
+raise ValueError("lm_head is present in adapter, but is ignored in base model")
 for dest_name, dest_data in dest:
 assert isinstance(dest_data, LoraTorchTensor)
 lora_a, lora_b = dest_data.get_lora_A_B()


@@ -3,32 +3,10 @@
 #include "llama.h"
 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
-// mutates the input string
-static std::vector<int> parse_list(char * p) {
-std::vector<int> ret;
-char * q = p;
-while (*p) {
-if (*p == ',') {
-*p = '\0';
-ret.push_back(std::atoi(q));
-q = p + 1;
-}
-++p;
-}
-ret.push_back(std::atoi(q));
-return ret;
-}
 static void print_usage(int, char ** argv) {
 LOG_TEE("\nexample usage:\n");
 LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);


@@ -183,7 +183,7 @@ int main(int argc, char ** argv) {
 ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
-TENSOR_DUMP(gf->nodes[0]);
+TENSOR_DUMP(ggml_graph_node(gf, 0));
 printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
@@ -224,7 +224,7 @@ int main(int argc, char ** argv) {
 // Let's use the F32 result from above as a reference for the quantized multiplication
-float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
+float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));
 printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
 printf("=====================================================================================\n");
@@ -252,7 +252,7 @@ int main(int argc, char ** argv) {
 // Check that the matrix multiplication result is in the right ballpark
 // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
-float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
+float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
 float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
 float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6


@@ -226,8 +226,8 @@ static ggml_status compute_piter(
 result.eigenvectors.resize(params.n_batch);
 result.distances.resize(params.n_batch);
 // get output nodes
-for (int i = 0; i < gf->n_nodes; ++i) {
-auto node = gf->nodes[i];
+for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+auto node = ggml_graph_node(gf, i);
 int iter = -1;
 // find b_tensor (without copying data from device)
 if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {


@@ -90,8 +90,6 @@ int main(int argc, char ** argv) {
 print_build_info();
-LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
 llama_backend_init();
 llama_numa_init(params.numa);


@@ -370,7 +370,7 @@ struct lora_merge_ctx {
 // write data to output file
 {
-auto result = gf->nodes[gf->n_nodes - 1];
+auto * result = ggml_graph_node(gf, -1);
 size_t len = ggml_nbytes(result);
 if (read_buf.size() < len) {
 read_buf.resize(len);


@@ -159,8 +159,6 @@ int main(int argc, char ** argv) {
 print_build_info();
-LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
 LOG("%s: llama backend init\n", __func__);
 llama_backend_init();
 llama_numa_init(params.numa);
@@ -301,6 +299,9 @@ int main(int argc, char ** argv) {
 LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
 }
 }
+smpl = gpt_sampler_init(model, sparams);
+LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
 LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
 LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 LOG_TEE("\n\n");
@@ -340,8 +341,6 @@ int main(int argc, char ** argv) {
 std::vector<llama_token> embd;
-smpl = gpt_sampler_init(model, sparams);
 while (n_remain != 0 || params.interactive) {
 // predict
 if (!embd.empty()) {


@@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf \
+python ./examples/llava/convert_image_encoder_to_gguf.py \
 -m path/to/clip-vit-large-patch14-336 \
 --llava-projector path/to/MobileVLM-1.7B/llava.projector \
 --output-dir path/to/MobileVLM-1.7B \
@@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf \
 ```
 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf \
+python ./examples/llava/convert_image_encoder_to_gguf.py \
 -m path/to/clip-vit-large-patch14-336 \
 --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
 --output-dir path/to/MobileVLM-1.7B_V2 \
@@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf \
 4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
 ```sh
-python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
+python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
 ```
-5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
+5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
 ```sh
-./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
 ```
 Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.


@@ -2449,7 +2449,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 ggml_backend_graph_compute(ctx->backend, gf);
 // the last node is the embedding tensor
-struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
+struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
 // copy the embeddings to the location passed by the user
 ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));


@@ -184,7 +184,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
 ggml_build_forward_expand(gf, flatten);
 ggml_graph_compute_with_ctx(model.ctx, gf, 1);
-struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
+struct ggml_tensor* result = ggml_graph_node(gf, -1);
 memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
 // append without newline tokens (default behavior in llava_arch when not using unpad ):


@@ -18,8 +18,8 @@ struct llava_context {
 };
 static void show_additional_info(int /*argc*/, char ** argv) {
-LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
+LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
@@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
 gpt_params params;
-if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
 return 1;
 }


@@ -191,8 +191,6 @@ int main(int argc, char ** argv) {
 print_build_info();
-LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
 LOG("%s: llama backend init\n", __func__);
 llama_backend_init();
 llama_numa_init(params.numa);
@@ -470,8 +468,10 @@ int main(int argc, char ** argv) {
 exit(1);
 }
+LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
 LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
-LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
+LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
 LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 // group-attention state


@@ -2007,8 +2007,6 @@ int main(int argc, char ** argv) {
 print_build_info();
-LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
 llama_backend_init();
 llama_numa_init(params.numa);


@@ -1,6 +1,6 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)


@@ -407,9 +407,44 @@ Notice that each `probs` is an array of length `n_probs`.
 *Options:*
-`content`: Set the text to tokenize.
+`content`: (Required) The text to tokenize.
-`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
+`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
+`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
+**Response:**
+Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise.
+If `with_pieces` is `false`:
+```json
+{
+"tokens": [123, 456, 789]
+}
+```
+If `with_pieces` is `true`:
+```json
+{
+"tokens": [
+{"id": 123, "piece": "Hello"},
+{"id": 456, "piece": " world"},
+{"id": 789, "piece": "!"}
+]
+}
+```
+With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
+```json
+{
+"tokens": [
+{"id": 198, "piece": [195]}, // hex C3
+{"id": 164, "piece": [161]} // hex A1
+]
+}
+```
 ### POST `/detokenize`: Convert tokens to text


@@ -1266,6 +1266,7 @@ struct server_context {
 {"n_predict", slot.n_predict}, // Server configured n_predict
 {"model", params.model_alias},
 {"seed", slot.sparams.seed},
+{"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0},
 {"temperature", slot.sparams.temp},
 {"dynatemp_range", slot.sparams.dynatemp_range},
 {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
@@ -3017,12 +3018,39 @@ int main(int argc, char ** argv) {
 const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
 const json body = json::parse(req.body);
-std::vector<llama_token> tokens;
+json tokens_response = json::array();
 if (body.count("content") != 0) {
 const bool add_special = json_value(body, "add_special", false);
-tokens = ctx_server.tokenize(body.at("content"), add_special);
+const bool with_pieces = json_value(body, "with_pieces", false);
+std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
+if (with_pieces) {
+for (const auto& token : tokens) {
+std::string piece = llama_token_to_piece(ctx_server.ctx, token);
+json piece_json;
+// Check if the piece is valid UTF-8
+if (is_valid_utf8(piece)) {
+piece_json = piece;
+} else {
+// If not valid UTF-8, store as array of byte values
+piece_json = json::array();
+for (unsigned char c : piece) {
+piece_json.push_back(static_cast<int>(c));
+}
+}
+tokens_response.push_back({
+{"id", token},
+{"piece", piece_json}
+});
+}
+} else {
+tokens_response = tokens;
+}
 }
-const json data = format_tokenizer_response(tokens);
+const json data = format_tokenizer_response(tokens_response);
 res_ok(res, data);
 };


@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 import asyncio
 import json
 import os
@@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context):
 context.tokenize_add_special = True
+@step("tokenizing with pieces")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+context.tokenized_text = context_text(context)
+async with aiohttp.ClientSession() as session:
+tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
+if getattr(context, "tokenize_add_special", None) is not None:
+tokenize_args["add_special"] = context.tokenize_add_special
+async with session.post(
+f"{context.base_url}/tokenize", json=tokenize_args
+) as response:
+assert response.status == 200
+tokenize_json = await response.json()
+context.tokens_with_pieces = tokenize_json["tokens"]
+@step("tokens are given with pieces")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+# Verify that the response contains both token IDs and pieces
+assert all(
+"id" in token and "piece" in token for token in context.tokens_with_pieces
+)
 @step('tokenizing')
 @async_run_until_complete
 async def step_tokenize(context):


@@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
 return res;
 }
-static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
+static bool is_valid_utf8(const std::string & str) {
+const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
+const unsigned char* end = bytes + str.length();
+while (bytes < end) {
+if (*bytes <= 0x7F) {
+// 1-byte sequence (0xxxxxxx)
+bytes++;
+} else if ((*bytes & 0xE0) == 0xC0) {
+// 2-byte sequence (110xxxxx 10xxxxxx)
+if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
+return false;
+bytes += 2;
+} else if ((*bytes & 0xF0) == 0xE0) {
+// 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
+return false;
+bytes += 3;
+} else if ((*bytes & 0xF8) == 0xF0) {
+// 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
+(bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
+return false;
+bytes += 4;
+} else {
+// Invalid UTF-8 lead byte
+return false;
+}
+}
+return true;
+}
+static json format_tokenizer_response(const json & tokens) {
 return json {
 {"tokens", tokens}
 };
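A brief, hypothetical illustration of why the tokenizer endpoint needs this check: a token boundary can split a multi-byte UTF-8 character, and the resulting piece then fails validation and is returned as a byte array instead of a string. This sketch assumes `is_valid_utf8()` from above is in scope (e.g. the same translation unit); it is not part of the commit:

```cpp
// Illustrative only: a complete 2-byte sequence passes, a truncated one does not.
#include <cassert>
#include <string>

int main() {
    const std::string full  = "\xC3\xA1"; // "á" as a complete UTF-8 sequence
    const std::string split = "\xC3";     // only the lead byte of "á"
    assert(is_valid_utf8(full));          // valid   -> piece is sent as a JSON string
    assert(!is_valid_utf8(split));        // invalid -> piece is sent as [195]
    return 0;
}
```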


@@ -4,33 +4,23 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: MIT
-INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh
-if [ $# -gt 0 ]; then
-GGML_SYCL_DEVICE=$1
-GGML_SYCL_SINGLE_GPU=1
-else
-GGML_SYCL_DEVICE=0
-GGML_SYCL_SINGLE_GPU=0
-fi
 #export GGML_SYCL_DEBUG=1
 #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
-if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=llama-2-7b.Q4_0.gguf
+NGL=33
+if [ $# -gt 0 ]; then
+GGML_SYCL_DEVICE=$1
 echo "use $GGML_SYCL_DEVICE as main GPU"
 #use signle GPU only
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none
 else
 #use multiple GPUs with same max compute units
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0
 fi
-#use main GPU only
-#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
-#use multiple GPUs with same max compute units
-#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0


@@ -5,11 +5,11 @@
 "nixpkgs-lib": "nixpkgs-lib"
 },
 "locked": {
-"lastModified": 1725024810,
-"narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
+"lastModified": 1725234343,
+"narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=",
 "owner": "hercules-ci",
 "repo": "flake-parts",
-"rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
+"rev": "567b938d64d4b4112ee253b9274472dc3a346eb6",
 "type": "github"
 },
 "original": {
@@ -20,11 +20,11 @@
 },
 "nixpkgs": {
 "locked": {
-"lastModified": 1724819573,
-"narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
+"lastModified": 1725634671,
+"narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=",
 "owner": "NixOS",
 "repo": "nixpkgs",
-"rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
+"rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c",
 "type": "github"
 },
 "original": {
@@ -36,14 +36,14 @@
 },
 "nixpkgs-lib": {
 "locked": {
-"lastModified": 1722555339,
-"narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
+"lastModified": 1725233747,
+"narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
 "type": "tarball",
-"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
+"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
 },
 "original": {
 "type": "tarball",
-"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
+"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
 }
 },
 "root": {


@@ -80,6 +80,13 @@ ggml_backend_cann_buffer_type(int32_t device);
 */
 GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
+/**
+ * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
+ *
+ * @return A pointer to the host buffer type interface.
+ */
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
 /**
 * @brief Retrieves the description of a specific CANN device.
 *

View File

@@ -358,6 +358,7 @@ extern "C" {
 struct ggml_object;
 struct ggml_context;
+struct ggml_cgraph;
 // NOTE: always add types at the end of the enum to keep backward compatibility
 enum ggml_type {
@@ -575,23 +576,9 @@ extern "C" {
 GGML_TENSOR_FLAG_PARAM = 4,
 };
-// ggml object
-struct ggml_object {
-size_t offs;
-size_t size;
-struct ggml_object * next;
-enum ggml_object_type type;
-char padding[4];
-};
-static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
 // n-dimensional tensor
 struct ggml_tensor {
 enum ggml_type type;
 GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
@@ -655,7 +642,7 @@ extern "C" {
 struct ggml_threadpool; // forward declaration, see ggml.c
 typedef struct ggml_threadpool * ggml_threadpool_t;
 // the compute plan that needs to be prepared for ggml_graph_compute()
 // since https://github.com/ggerganov/ggml/issues/287
@@ -671,35 +658,6 @@ extern "C" {
 void * abort_callback_data;
 };
-enum ggml_cgraph_eval_order {
-GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
-GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
-GGML_CGRAPH_EVAL_ORDER_COUNT
-};
-typedef uint32_t ggml_bitset_t;
-struct ggml_hash_set {
-size_t size;
-ggml_bitset_t * used; // whether or not the keys are in use i.e. set
-struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
-};
-// computation graph
-struct ggml_cgraph {
-int size;
-int n_nodes;
-int n_leafs;
-struct ggml_tensor ** nodes;
-struct ggml_tensor ** grads;
-struct ggml_tensor ** leafs;
-struct ggml_hash_set visited_hash_set;
-enum ggml_cgraph_eval_order order;
-};
 // scratch buffer
 struct ggml_scratch {
 size_t offs;
@@ -2017,8 +1975,6 @@ extern "C" {
 typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
 typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
-#define GGML_N_TASKS_MAX -1
 GGML_API struct ggml_tensor * ggml_map_custom1(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
@@ -2088,30 +2044,35 @@ extern "C" {
 struct ggml_context * ctx,
 struct ggml_tensor * tensor);
 GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
 // graph allocation in a context
 GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
-GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
+GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
 GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
-GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
-GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
-GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
+GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
+GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
+GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
+GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
+GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
+GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 GGML_API size_t ggml_graph_overhead(void);
 GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
 GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
-GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
-GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
+GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
+GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
 GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
 GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
 GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
 GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
 // ggml_graph_plan() has to be called before ggml_graph_compute()
 // when plan.work_size > 0, caller must allocate memory for plan.work_data
@@ -2509,6 +2470,7 @@ extern "C" {
 GGML_API int ggml_cpu_has_gpublas (void);
 GGML_API int ggml_cpu_has_sse3 (void);
 GGML_API int ggml_cpu_has_ssse3 (void);
+GGML_API int ggml_cpu_has_riscv_v (void);
 GGML_API int ggml_cpu_has_sycl (void);
 GGML_API int ggml_cpu_has_rpc (void);
 GGML_API int ggml_cpu_has_vsx (void);
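Since `struct ggml_cgraph` is no longer defined in the public header, call sites that used to reach into `gf->nodes` / `gf->n_nodes` go through the new accessors instead, as several hunks above show. A minimal sketch of the migration pattern (illustrative, not taken from the commit):

```cpp
// Illustrative only: iterate a graph and grab its output tensor via the new API.
// `gf` is assumed to be a graph that was built elsewhere (e.g. with ggml_new_graph()).
#include <stdio.h>
#include "ggml.h"

static void dump_graph_nodes(struct ggml_cgraph * gf) {
    // old: for (int i = 0; i < gf->n_nodes; ++i) { ... gf->nodes[i] ... }
    for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
        struct ggml_tensor * node = ggml_graph_node(gf, i);
        printf("node %d: %s\n", i, node->name);
    }
    // old: gf->nodes[gf->n_nodes - 1]; negative indices count from the end
    struct ggml_tensor * last = ggml_graph_node(gf, -1);
    printf("output tensor: %s\n", last->name);
}
```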


@@ -1,3 +1,4 @@
+#include "ggml-impl.h"
 #include "ggml-blas.h"
 #include "ggml-backend-impl.h"


@ -30,6 +30,7 @@
#include <cstring> #include <cstring>
#include <mutex> #include <mutex>
#include "ggml-impl.h"
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
#include "ggml-cann/aclnn_ops.h" #include "ggml-cann/aclnn_ops.h"
#include "ggml-cann/common.h" #include "ggml-cann/common.h"
@ -1220,6 +1221,116 @@ ggml_backend_cann_buffer_type(int32_t device) {
return &ggml_backend_cann_buffer_types[device]; return &ggml_backend_cann_buffer_types[device];
} }
/**
* @brief Retrieves the name associated with a CANN host buffer type.
*
* This function returns the descriptive name associated with the specified
* CANN host buffer type context.
*
* @param buft Pointer to the host buffer type context.
* @return Const pointer to the C-style string containing the name.
*/
GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
return "CANN_Host";
GGML_UNUSED(buft);
}
/**
* @brief Retrieves the name associated with a CANN host buffer.
*
* This function returns the descriptive name associated with the specified
* CANN host buffer context.
*
* @param buft Pointer to the host buffer context.
* @return Const pointer to the C-style string containing the name.
*/
GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
return "CANN_Host";
GGML_UNUSED(buffer);
}
/**
* @brief Free resources associated with a CANN host buffer.
*
* This function frees the resources associated with a CANN host buffer, including
* its context.
*
* @param buffer The CANN host buffer to free.
*/
GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context));
}
/**
* @brief Allocates a new CANN host buffer of the specified size.
*
* This function allocates a new CANN host buffer with the given size.
* @param size Size in bytes of the host buffer to allocate.
* @return Pointer to the allocated host buffer, or nullptr if allocation fails.
*/
static void * ggml_cann_host_malloc(size_t size) {
if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
return nullptr;
}
void * hostPtr = nullptr;
aclError err = aclrtMallocHost((void **) &hostPtr, size);
if (err != ACL_SUCCESS) {
GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
return nullptr;
}
return hostPtr;
}
/**
* @brief Allocates a new CANN host buffer of the specified type and size.
*
* @param buft Pointer to the host buffer type context.
* @param size Size in bytes of the host buffer to allocate.
* @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
*/
GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * hostPtr = ggml_cann_host_malloc(size);
if (hostPtr == nullptr) {
// fallback to cpu buffer
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
}
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
buffer->buft = buft;
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
return buffer;
}
/**
 * @brief Returns the buffer type for CANN host (pinned) memory.
 *
 * The returned buffer type provides the function pointers used for allocating,
 * querying properties of, and managing pinned host memory in the GGML backend.
*/
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cann_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
},
/* .context = */ nullptr,
};
return &ggml_backend_cann_buffer_type_host;
}
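For context, a minimal usage sketch of the new host buffer type, assuming the generic ggml-backend allocation API (ggml_backend_buft_alloc_buffer, ggml_backend_buffer_get_base, ggml_backend_buffer_free); the 16 MiB size is illustrative only. Setting the GGML_CANN_NO_PINNED environment variable makes ggml_cann_host_malloc return nullptr, so allocation quietly falls back to a plain CPU buffer.

    // sketch: stage data in pinned host memory before copying it to the NPU
    ggml_backend_buffer_type_t buft = ggml_backend_cann_host_buffer_type();
    ggml_backend_buffer_t      buf  = ggml_backend_buft_alloc_buffer(buft, 16u * 1024u * 1024u);
    void * host_ptr = ggml_backend_buffer_get_base(buf);   // pinned pointer (or CPU fallback)
    // ... fill host_ptr, then hand it to the usual tensor upload path ...
    ggml_backend_buffer_free(buf);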
/** /**
* @brief Computes the forward operation for a given tensor using CANN * @brief Computes the forward operation for a given tensor using CANN
* operations. * operations.
@ -1942,7 +2053,7 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__); GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
return nullptr; return nullptr;
} }
ggml_cann_set_device(ctx->device);
ggml_backend_t cann_backend = ggml_backend_t cann_backend =
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
/* .interface = */ ggml_backend_cann_interface, /* .interface = */ ggml_backend_cann_interface,

View File

@ -1,5 +1,5 @@
#include "ggml-cuda.h" #include "ggml-cuda.h"
#include "ggml.h" #include "ggml-impl.h"
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
#include "ggml-cuda/common.cuh" #include "ggml-cuda/common.cuh"

View File

@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q(
// nrows_dst == nrows of the matrix that the kernel writes into // nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst}; // The stream-k decomposition is only faster for recent NVIDIA GPUs.
// Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:

View File

@ -2742,6 +2742,7 @@ struct mmq_args {
int64_t ne00; int64_t ne01; int64_t stride01; int64_t ne00; int64_t ne01; int64_t stride01;
int64_t ne10; int64_t ne11; int64_t stride11; int64_t ne10; int64_t ne11; int64_t stride11;
int64_t ne0; int64_t ne0;
bool use_stream_k;
}; };
template<ggml_type type> template<ggml_type type>
@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x; const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
const dim3 block_nums_xy_tiling(nty, ntx, 1); const dim3 block_nums_xy_tiling(nty, ntx, 1);
const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD; if (!args.use_stream_k) {
if (!use_stream_k) {
if (args.ne01 % mmq_y == 0) { if (args.ne01 % mmq_y == 0) {
constexpr bool need_check = false; constexpr bool need_check = false;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>> mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>

View File

@ -130,42 +130,3 @@
#define cudaKernelNodeParams musaKernelNodeParams #define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture #define cudaStreamEndCapture musaStreamEndCapture
// XXX: Clang builtins mapping
#define __vsub4 __vsub4_musa
#define __vcmpeq4 __vcmpeq4_musa
#define __vcmpne4 __vcmpne4_musa
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
return __vsubss4(a, b);
}
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
}
return c;
}
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
}
return c;
}

View File

@ -629,8 +629,16 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif #endif
enum ggml_cgraph_eval_order {
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
GGML_CGRAPH_EVAL_ORDER_COUNT
};
// bitset // bitset
typedef uint32_t ggml_bitset_t;
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated"); static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8) #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1) #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
@ -656,6 +664,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
#define GGML_HASHSET_FULL ((size_t)-1) #define GGML_HASHSET_FULL ((size_t)-1)
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2) #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
struct ggml_hash_set {
size_t size;
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
};
struct ggml_hash_set ggml_hash_set_new(size_t size); struct ggml_hash_set ggml_hash_set_new(size_t size);
void ggml_hash_set_free(struct ggml_hash_set * hash_set); void ggml_hash_set_free(struct ggml_hash_set * hash_set);
@ -745,6 +759,24 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
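A rough sketch of how this internal set is used; ggml_hash_find_or_insert and ggml_bitset_get are internal helpers from ggml-impl.h, so their exact signatures are an assumption here, and t stands for some existing ggml_tensor pointer.

    struct ggml_hash_set set = ggml_hash_set_new(/*min size*/ 64);   // size is rounded up internally
    size_t idx = ggml_hash_find_or_insert(&set, t);                  // slot of t in set.keys
    if (ggml_bitset_get(set.used, idx)) {
        // t is now a member: set.keys[idx] == t
    }
    ggml_hash_set_free(&set);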
// computation graph
struct ggml_cgraph {
int size;
int n_nodes;
int n_leafs;
struct ggml_tensor ** nodes;
struct ggml_tensor ** grads;
struct ggml_tensor ** leafs;
struct ggml_hash_set visited_hash_set;
enum ggml_cgraph_eval_order order;
};
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -1,4 +1,4 @@
#include "ggml.h" #include "ggml-impl.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
#include "ggml-kompute.h" #include "ggml-kompute.h"

View File

@ -1,7 +1,7 @@
#import "ggml-metal.h" #import "ggml-metal.h"
#import "ggml-impl.h"
#import "ggml-backend-impl.h" #import "ggml-backend-impl.h"
#import "ggml.h"
#import <Foundation/Foundation.h> #import <Foundation/Foundation.h>
@ -17,8 +17,8 @@
#define GGML_METAL_LOG_WARN(...) #define GGML_METAL_LOG_WARN(...)
#define GGML_METAL_LOG_ERROR(...) #define GGML_METAL_LOG_ERROR(...)
#else #else
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) #define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) #define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#endif #endif
@ -882,7 +882,7 @@ static enum ggml_status ggml_metal_graph_compute(
// create multiple command buffers and enqueue them // create multiple command buffers and enqueue them
// then, we encode the graph into the command buffers in parallel // then, we encode the graph into the command buffers in parallel
const int n_nodes = gf->n_nodes; const int n_nodes = gf->n_nodes;
const int n_cb = ctx->n_cb; const int n_cb = ctx->n_cb;
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
@ -3039,8 +3039,7 @@ static enum ggml_status ggml_metal_graph_compute(
if (status != MTLCommandBufferStatusCompleted) { if (status != MTLCommandBufferStatusCompleted) {
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
if (status == MTLCommandBufferStatusError) { if (status == MTLCommandBufferStatusError) {
NSString * error_code = [command_buffer error].localizedDescription; GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
} }
return GGML_STATUS_FAILED; return GGML_STATUS_FAILED;

View File

@ -1,5 +1,5 @@
#include "ggml-rpc.h" #include "ggml-rpc.h"
#include "ggml.h" #include "ggml-impl.h"
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
#include <cinttypes> #include <cinttypes>

View File

@ -33,7 +33,7 @@
#include <sycl/half_type.hpp> #include <sycl/half_type.hpp>
#include "ggml-sycl.h" #include "ggml-sycl.h"
#include "ggml.h" #include "ggml-impl.h"
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
#include "ggml-sycl/backend.hpp" #include "ggml-sycl/backend.hpp"
@ -5137,13 +5137,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_SQR: case GGML_OP_SQR:
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
return true;
case GGML_OP_CONT: case GGML_OP_CONT:
return op->src[0]->type != GGML_TYPE_BF16;
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
return true; return true;
case GGML_OP_ROPE: case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]); return ggml_is_contiguous(op->src[0]);
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
// TODO: add support for the new F32 operations
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
case GGML_OP_SUM_ROWS: case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:

View File

@ -21,7 +21,7 @@
#include <memory> #include <memory>
#include <mutex> #include <mutex>
#include "ggml.h" #include "ggml-impl.h"
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
#include "ggml-vulkan-shaders.hpp" #include "ggml-vulkan-shaders.hpp"

View File

@ -287,6 +287,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
#define GGML_DEBUG 0 #define GGML_DEBUG 0
#define GGML_GELU_FP16 #define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16 #define GGML_GELU_QUICK_FP16
#define GGML_N_TASKS_MAX (-1)
#define GGML_SOFT_MAX_UNROLL 4 #define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL 2 #define GGML_VEC_DOT_UNROLL 2
@ -1120,21 +1121,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
#define GGML_F32x4_ADD vaddq_f32 #define GGML_F32x4_ADD vaddq_f32
#define GGML_F32x4_MUL vmulq_f32 #define GGML_F32x4_MUL vmulq_f32
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
#define GGML_F32x4_REDUCE(res, x) \ #define GGML_F32x4_REDUCE(res, x) \
{ \ { \
int offset = GGML_F32_ARR >> 1; \ int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f32(x[i], x[offset+i]); \ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
} \ } \
offset >>= 1; \ offset >>= 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f32(x[i], x[offset+i]); \ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
} \ } \
offset >>= 1; \ offset >>= 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f32(x[i], x[offset+i]); \ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
} \ } \
res = GGML_F32x4_REDUCE_ONE(x[0]); \ (res) = GGML_F32x4_REDUCE_ONE((x)[0]); \
} }
#define GGML_F32_VEC GGML_F32x4 #define GGML_F32_VEC GGML_F32x4
@ -1161,30 +1162,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_F16x8_ADD vaddq_f16 #define GGML_F16x8_ADD vaddq_f16
#define GGML_F16x8_MUL vmulq_f16 #define GGML_F16x8_MUL vmulq_f16
#define GGML_F16x8_REDUCE(res, x) \ #define GGML_F16x8_REDUCE(res, x) \
do { \ do { \
int offset = GGML_F16_ARR >> 1; \ int offset = GGML_F16_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f16(x[i], x[offset+i]); \ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \ } \
offset >>= 1; \ offset >>= 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f16(x[i], x[offset+i]); \ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \ } \
offset >>= 1; \ offset >>= 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f16(x[i], x[offset+i]); \ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \ } \
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
} while (0) } while (0)
#define GGML_F16_VEC GGML_F16x8 #define GGML_F16_VEC GGML_F16x8
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1 #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i]) #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
#define GGML_F16_VEC_FMA GGML_F16x8_FMA #define GGML_F16_VEC_FMA GGML_F16x8_FMA
#define GGML_F16_VEC_ADD GGML_F16x8_ADD #define GGML_F16_VEC_ADD GGML_F16x8_ADD
#define GGML_F16_VEC_MUL GGML_F16x8_MUL #define GGML_F16_VEC_MUL GGML_F16x8_MUL
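The extra parentheses around the macro parameters are the usual C macro hygiene fix: when an argument is an expression rather than a plain identifier, operator precedence can silently change the expansion. A tiny standalone illustration (hypothetical macros, unrelated to the SIMD code above):

    #include <cstdio>

    #define FIRST_BAD(a)  *a        // FIRST_BAD(p + 1)  expands to  *p + 1
    #define FIRST_GOOD(a) (*(a))    // FIRST_GOOD(p + 1) expands to  *(p + 1)

    int main() {
        float vals[2] = { 1.0f, 5.0f };
        float * p = vals;
        printf("%f vs %f\n", FIRST_BAD(p + 1), FIRST_GOOD(p + 1));   // 2.000000 vs 5.000000
        return 0;
    }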
@ -1893,6 +1894,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
#endif #endif
//
// ggml object
//
struct ggml_object {
size_t offs;
size_t size;
struct ggml_object * next;
enum ggml_object_type type;
char padding[4];
};
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
// //
// ggml context // ggml context
// //
@ -19161,6 +19179,34 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
ggml_hash_set_reset(&cgraph->visited_hash_set); ggml_hash_set_reset(&cgraph->visited_hash_set);
} }
int ggml_graph_size(struct ggml_cgraph * cgraph) {
return cgraph->size;
}
struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
if (i < 0) {
GGML_ASSERT(cgraph->n_nodes + i >= 0);
return cgraph->nodes[cgraph->n_nodes + i];
}
GGML_ASSERT(i < cgraph->n_nodes);
return cgraph->nodes[i];
}
struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
return cgraph->nodes;
}
int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
return cgraph->n_nodes;
}
void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
GGML_ASSERT(cgraph->size > cgraph->n_nodes);
cgraph->nodes[cgraph->n_nodes] = tensor;
cgraph->n_nodes++;
}
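A short sketch of how callers are now expected to traverse a graph through these accessors instead of touching ggml_cgraph fields directly; gf is assumed to be a graph already built via ggml_build_forward_expand, and negative indices count from the end, as implemented above.

    // iterate all nodes and report the final output tensor
    for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
        struct ggml_tensor * node = ggml_graph_node(gf, i);
        printf("%3d: %-32s %s\n", i, node->name, ggml_op_name(node->op));
    }
    printf("output: %s\n", ggml_graph_node(gf, -1)->name);   // negative index counts from the end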
// Android's libc implementation "bionic" does not support setting affinity // Android's libc implementation "bionic" does not support setting affinity
#if defined(__gnu_linux__) #if defined(__gnu_linux__)
static void set_numa_thread_affinity(int thread_n) { static void set_numa_thread_affinity(int thread_n) {
@ -23242,6 +23288,14 @@ int ggml_cpu_has_arm_fma(void) {
#endif #endif
} }
int ggml_cpu_has_riscv_v(void) {
#if defined(__riscv_v_intrinsic)
return 1;
#else
return 0;
#endif
}
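Like the other ggml_cpu_has_* helpers, this exposes a compile-time flag as a runtime query; a trivial usage sketch:

    // returns 1 only when the build used the RISC-V vector intrinsics (__riscv_v_intrinsic)
    if (ggml_cpu_has_riscv_v()) {
        printf("RISC-V V extension code paths are enabled\n");
    }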
int ggml_cpu_has_metal(void) { int ggml_cpu_has_metal(void) {
#if defined(GGML_USE_METAL) #if defined(GGML_USE_METAL)
return 1; return 1;

View File

@ -1056,6 +1056,9 @@ extern "C" {
LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i); LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain); LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
// after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i);
// available samplers: // available samplers:
LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void); LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
@ -1127,6 +1130,10 @@ extern "C" {
int32_t n_logit_bias, int32_t n_logit_bias,
const llama_logit_bias * logit_bias); const llama_logit_bias * logit_bias);
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
/// @details Sample and accept a token from the idx-th output of the last evaluation /// @details Sample and accept a token from the idx-th output of the last evaluation
// //
// Shorthand for: // Shorthand for:

View File

@ -8,6 +8,7 @@
#include <cstring> #include <cstring>
#include <ctime> #include <ctime>
#include <cfloat> #include <cfloat>
#include <chrono>
#include <cmath> #include <cmath>
#include <numeric> #include <numeric>
#include <random> #include <random>
@ -162,6 +163,19 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
cur_p->size = k; cur_p->size = k;
} }
static uint32_t get_rng_seed(uint32_t seed) {
if (seed == LLAMA_DEFAULT_SEED) {
// use system clock if std::random_device is not a true RNG
static bool is_rd_prng = std::random_device().entropy() == 0;
if (is_rd_prng) {
return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count();
}
std::random_device rd;
return rd();
}
return seed;
}
// llama_sampler API // llama_sampler API
const char * llama_sampler_name(const struct llama_sampler * smpl) { const char * llama_sampler_name(const struct llama_sampler * smpl) {
@ -335,13 +349,26 @@ void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler
struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) { struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) {
const auto * p = (const llama_sampler_chain *) chain->ctx; const auto * p = (const llama_sampler_chain *) chain->ctx;
if (i < 0 || i >= (int32_t) p->samplers.size()) { if (i < 0 || (size_t) i >= p->samplers.size()) {
return nullptr; return nullptr;
} }
return p->samplers[i]; return p->samplers[i];
} }
struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) {
auto * p = (llama_sampler_chain *) chain->ctx;
if (i < 0 || (size_t) i >= p->samplers.size()) {
return nullptr;
}
auto * result = p->samplers[i];
p->samplers.erase(p->samplers.begin() + i);
return result;
}
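A sketch of the ownership rule documented in llama.h: after removal the chain no longer owns the sampler, so the caller frees it. The chain construction below uses the existing llama_sampler_chain_* and llama_sampler_init_* API; the top-k value is illustrative.

    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    struct llama_sampler * top_k = llama_sampler_chain_remove(chain, 0);
    llama_sampler_free(top_k);   // removed samplers are no longer freed by the chain
    llama_sampler_free(chain);   // frees the chain and the samplers it still owns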
int llama_sampler_chain_n(const struct llama_sampler * chain) { int llama_sampler_chain_n(const struct llama_sampler * chain) {
const auto * p = (const llama_sampler_chain *) chain->ctx; const auto * p = (const llama_sampler_chain *) chain->ctx;
@ -387,6 +414,7 @@ struct llama_sampler * llama_sampler_init_greedy() {
struct llama_sampler_dist { struct llama_sampler_dist {
const uint32_t seed; const uint32_t seed;
uint32_t seed_cur;
std::mt19937 rng; std::mt19937 rng;
}; };
@ -416,7 +444,8 @@ static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sample
static void llama_sampler_dist_reset(struct llama_sampler * smpl) { static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_dist *) smpl->ctx; auto * ctx = (llama_sampler_dist *) smpl->ctx;
ctx->rng = std::mt19937(ctx->seed); ctx->seed_cur = get_rng_seed(ctx->seed);
ctx->rng.seed(ctx->seed_cur);
} }
static void llama_sampler_dist_free(struct llama_sampler * smpl) { static void llama_sampler_dist_free(struct llama_sampler * smpl) {
@ -433,11 +462,13 @@ static struct llama_sampler_i llama_sampler_dist_i = {
}; };
struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler { return new llama_sampler {
/* .iface = */ &llama_sampler_dist_i, /* .iface = */ &llama_sampler_dist_i,
/* .ctx = */ new llama_sampler_dist { /* .ctx = */ new llama_sampler_dist {
/* .seed = */ seed, /* .seed = */ seed,
/* .rng = */ std::mt19937(seed), /* .seed_cur = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),
}, },
}; };
} }
@ -1032,6 +1063,7 @@ struct llama_sampler_mirostat {
const int32_t n_vocab; const int32_t n_vocab;
const uint32_t seed; const uint32_t seed;
uint32_t seed_cur;
const float tau; const float tau;
const float eta; const float eta;
@ -1100,7 +1132,8 @@ static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sa
static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) { static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_mirostat *) smpl->ctx; auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
ctx->mu = 2.0f*ctx->tau; ctx->mu = 2.0f*ctx->tau;
ctx->rng = std::mt19937(ctx->seed); ctx->seed_cur = get_rng_seed(ctx->seed);
ctx->rng.seed(ctx->seed_cur);
} }
static void llama_sampler_mirostat_free(struct llama_sampler * smpl) { static void llama_sampler_mirostat_free(struct llama_sampler * smpl) {
@ -1117,16 +1150,18 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
}; };
struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) { struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler { return new llama_sampler {
/* .iface = */ &llama_sampler_mirostat_i, /* .iface = */ &llama_sampler_mirostat_i,
/* .ctx = */ new llama_sampler_mirostat { /* .ctx = */ new llama_sampler_mirostat {
/* .n_vocab = */ n_vocab, /* .n_vocab = */ n_vocab,
/* .seed = */ seed, /* .seed = */ seed,
/* .tau = */ tau, /* .seed_cur = */ seed_cur,
/* .eta = */ eta, /* .tau = */ tau,
/* .m = */ m, /* .eta = */ eta,
/* .mu = */ 2.0f*tau, /* .m = */ m,
/* .rng = */ std::mt19937(seed), /* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed_cur),
}, },
}; };
} }
@ -1135,6 +1170,7 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
struct llama_sampler_mirostat_v2 { struct llama_sampler_mirostat_v2 {
const uint32_t seed; const uint32_t seed;
uint32_t seed_cur;
const float tau; const float tau;
const float eta; const float eta;
@ -1179,7 +1215,8 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) { static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
ctx->mu = 2.0f*ctx->tau; ctx->mu = 2.0f*ctx->tau;
ctx->rng = std::mt19937(ctx->seed); ctx->seed_cur = get_rng_seed(ctx->seed);
ctx->rng.seed(ctx->seed_cur);
} }
static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) { static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) {
@ -1212,14 +1249,16 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
}; };
struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) { struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler { return new llama_sampler {
/* .iface = */ &llama_sampler_mirostat_v2_i, /* .iface = */ &llama_sampler_mirostat_v2_i,
/* .ctx = */ new llama_sampler_mirostat_v2 { /* .ctx = */ new llama_sampler_mirostat_v2 {
/* .seed = */ seed, /* .seed = */ seed,
/* .tau = */ tau, /* .seed_cur = */ seed_cur,
/* .eta = */ eta, /* .tau = */ tau,
/* .mu = */ 2.0f*tau, /* .eta = */ eta,
/* .rng = */ std::mt19937(seed), /* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed_cur),
}, },
}; };
} }
@ -1505,6 +1544,8 @@ struct llama_sampler * llama_sampler_init_penalties(
ignore_eos = false; ignore_eos = false;
} }
penalty_last_n = std::max(penalty_last_n, 0);
return new llama_sampler { return new llama_sampler {
/* .iface = */ &llama_sampler_penalties_i, /* .iface = */ &llama_sampler_penalties_i,
/* .ctx = */ new llama_sampler_penalties { /* .ctx = */ new llama_sampler_penalties {
@ -1568,6 +1609,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
} }
} }
} }
static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) { static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx; const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx;
return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data()); return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data());
@ -1599,3 +1641,31 @@ struct llama_sampler * llama_sampler_init_logit_bias(
}, },
}; };
} }
// utils
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
if (smpl->iface == &llama_sampler_dist_i) {
return ((const llama_sampler_dist *) smpl->ctx)->seed_cur;
}
if (smpl->iface == &llama_sampler_mirostat_i) {
return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur;
}
if (smpl->iface == &llama_sampler_mirostat_v2_i) {
return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur;
}
if (smpl->iface == &llama_sampler_chain_i) {
const auto * ctx = (const llama_sampler_chain *) smpl->ctx;
for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) {
const uint32_t seed = llama_sampler_get_seed(*it);
if (seed != LLAMA_DEFAULT_SEED) {
return seed;
}
}
}
return LLAMA_DEFAULT_SEED;
}
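A sketch of the intended use: when the caller passes LLAMA_DEFAULT_SEED, this reports the seed that was actually drawn (a chain delegates to its last seeded sampler, as implemented above).

    struct llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    const uint32_t seed = llama_sampler_get_seed(smpl);   // resolved seed of the dist sampler
    printf("sampling seed: %u\n", seed);
    llama_sampler_free(smpl);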

View File

@ -2156,6 +2156,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
if (host_buffer) { if (host_buffer) {
buft = ggml_backend_sycl_host_buffer_type(); buft = ggml_backend_sycl_host_buffer_type();
} }
#elif defined(GGML_USE_CANN)
if (host_buffer) {
buft = ggml_backend_cann_host_buffer_type();
}
#elif defined(GGML_USE_CPU_HBM) #elif defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type(); buft = ggml_backend_cpu_hbm_buffer_type();
#elif defined(GGML_USE_VULKAN) #elif defined(GGML_USE_VULKAN)
@ -9258,7 +9262,7 @@ static struct ggml_tensor * llm_build_copy_mask_state(
// FIXME: zero-out NANs? // FIXME: zero-out NANs?
states = ggml_mul(ctx, states, state_mask); states = ggml_mul(ctx, states, state_mask);
// copy states which won't be changed further (between n_seqs and n_rs) // copy states which won't be changed further (between n_seqs and n_kv)
ggml_build_forward_expand(graph, ggml_build_forward_expand(graph,
ggml_cpy(ctx, ggml_cpy(ctx,
ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)), ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)),
@ -9877,8 +9881,8 @@ struct llm_build_context {
struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
// find result_norm tensor for input // find result_norm tensor for input
struct ggml_tensor * inp = nullptr; struct ggml_tensor * inp = nullptr;
for (int i = gf->n_nodes - 1; i >= 0; --i) { for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
inp = gf->nodes[i]; inp = ggml_graph_node(gf, i);
if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
break; break;
} else { } else {
@ -16076,19 +16080,21 @@ static int llama_decode_internal(
return -1; return -1;
} }
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
return -1;
}
}
const auto & model = lctx.model; const auto & model = lctx.model;
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams; const auto & cparams = lctx.cparams;
GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
if (batch_all.token) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
return -1;
}
}
}
GGML_ASSERT(n_tokens_all <= cparams.n_batch); GGML_ASSERT(n_tokens_all <= cparams.n_batch);
GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@ -16205,8 +16211,8 @@ static int llama_decode_internal(
ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
// the output is always the last tensor in the graph // the output is always the last tensor in the graph
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * res = ggml_graph_node(gf, -1);
struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2]; struct ggml_tensor * embd = ggml_graph_node(gf, -2);
if (lctx.n_outputs == 0) { if (lctx.n_outputs == 0) {
// no output // no output
@ -16215,9 +16221,9 @@ static int llama_decode_internal(
} else if (cparams.embeddings) { } else if (cparams.embeddings) {
res = nullptr; // do not extract logits for embedding case res = nullptr; // do not extract logits for embedding case
embd = nullptr; embd = nullptr;
for (int i = gf->n_nodes - 1; i >= 0; --i) { for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) { if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
embd = gf->nodes[i]; embd = ggml_graph_node(gf, i);
break; break;
} }
} }
@ -16375,19 +16381,21 @@ static int llama_encode_internal(
return -1; return -1;
} }
for (uint32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
return -1;
}
}
const auto & model = lctx.model; const auto & model = lctx.model;
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams; const auto & cparams = lctx.cparams;
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
if (batch.token) {
for (uint32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
return -1;
}
}
}
// micro-batching is not possible for non-causal encoding, so we process the batch in a single shot // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
@ -16432,15 +16440,15 @@ static int llama_encode_internal(
// there are two cases here // there are two cases here
if (llama_model_has_decoder(&lctx.model)) { if (llama_model_has_decoder(&lctx.model)) {
// first case is an encoder-decoder T5 model where embeddings are passed to decoder // first case is an encoder-decoder T5 model where embeddings are passed to decoder
embd = gf->nodes[gf->n_nodes - 1]; embd = ggml_graph_node(gf, -1);
GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
} else { } else {
// second case is an encoder-only T5 model // second case is an encoder-only T5 model
if (cparams.embeddings) { if (cparams.embeddings) {
// only output embeddings if required // only output embeddings if required
embd = gf->nodes[gf->n_nodes - 1]; embd = ggml_graph_node(gf, -1);
if (strcmp(embd->name, "result_embd_pooled") != 0) { if (strcmp(embd->name, "result_embd_pooled") != 0) {
embd = gf->nodes[gf->n_nodes - 2]; embd = ggml_graph_node(gf, -2);
} }
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
} }
@ -17530,6 +17538,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
quantize &= name.find("time_mix_first.weight") == std::string::npos; quantize &= name.find("time_mix_first.weight") == std::string::npos;
quantize &= name.find("time_mix_w1.weight") == std::string::npos; quantize &= name.find("time_mix_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_w2.weight") == std::string::npos; quantize &= name.find("time_mix_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
// do not quantize relative position bias (T5) // do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos; quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@ -18486,7 +18496,7 @@ struct llama_context * llama_new_context_with_model(
// note: the number of splits during measure is higher than during inference due to the kv shift // note: the number of splits during measure is higher than during inference due to the kv shift
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes); LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, ggml_graph_n_nodes(gf));
LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits); LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
} }
} }
@ -20666,6 +20676,7 @@ const char * llama_print_system_info(void) {
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";

View File

@ -519,7 +519,7 @@ struct test_case {
// add sentinels as graph nodes so that they are checked in the callback // add sentinels as graph nodes so that they are checked in the callback
for (ggml_tensor * sentinel : sentinels) { for (ggml_tensor * sentinel : sentinels) {
gf->nodes[gf->n_nodes++] = sentinel; ggml_graph_add_node(gf, sentinel);
} }
// randomize tensors // randomize tensors
@ -679,9 +679,9 @@ struct test_case {
// duplicate the op // duplicate the op
size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1; int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
for (int i = 1; i < n_runs; i++) { for (int i = 1; i < n_runs; i++) {
gf->nodes[gf->n_nodes++] = out; ggml_graph_add_node(gf, out);
} }
// calculate memory // calculate memory
@ -696,11 +696,11 @@ struct test_case {
} }
return size; return size;
}; };
for (int i = 0; i < gf->n_nodes; i++) { for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) { if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
continue; continue;
} }
mem += tensor_op_size(gf->nodes[i]); mem += tensor_op_size(ggml_graph_node(gf, i));
} }
// run // run
@ -804,7 +804,7 @@ struct test_case {
ggml_graph_cpy(gf, gb); ggml_graph_cpy(gf, gb);
ggml_build_backward_expand(ctx, gf, gb, false); ggml_build_backward_expand(ctx, gf, gb, false);
if (expect.size() != 1 || expect[0] != 0.0f) { if (expect.size() != 1 || expect[0] != 0.0f) {
GGML_ASSERT(gb->n_nodes > gf->n_nodes); GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE); GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
} }