mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 10:54:36 +00:00
Merge branch 'master' into compilade/fix-mpt-pretok
This commit is contained in:
commit
afa6119850
4
.github/labeler.yml
vendored
4
.github/labeler.yml
vendored
@ -16,7 +16,9 @@ SYCL:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-sycl.h
|
||||
- ggml/src/ggml-sycl.cpp
|
||||
- README-sycl.md
|
||||
- ggml/src/ggml-sycl/**
|
||||
- docs/backend/SYCL.md
|
||||
- examples/sycl/**
|
||||
Nvidia GPU:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
|
@ -50,9 +50,6 @@ endif()
|
||||
# option list
|
||||
#
|
||||
|
||||
# general
|
||||
option(LLAMA_CCACHE "llama: use ccache if available" ON)
|
||||
|
||||
# debug
|
||||
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
|
||||
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
|
||||
@ -77,7 +74,6 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
|
||||
# override ggml options
|
||||
set(GGML_CCACHE ${LLAMA_CCACHE})
|
||||
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
|
||||
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
|
||||
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
|
||||
@ -115,7 +111,10 @@ llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
|
||||
# build the library
|
||||
#
|
||||
|
||||
add_subdirectory(ggml)
|
||||
if (NOT TARGET ggml)
|
||||
add_subdirectory(ggml)
|
||||
# ... otherwise assume ggml is added by a parent CMakeLists.txt
|
||||
endif()
|
||||
add_subdirectory(src)
|
||||
|
||||
#
|
||||
|
100
Makefile
100
Makefile
@ -64,10 +64,14 @@ TEST_TARGETS = \
|
||||
tests/test-tokenizer-1-spm
|
||||
|
||||
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
|
||||
LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
|
||||
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
|
||||
|
||||
# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
|
||||
# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
|
||||
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune
|
||||
|
||||
# Deprecation aliases
|
||||
ifdef LLAMA_CUBLAS
|
||||
$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
|
||||
@ -193,7 +197,7 @@ ifdef GGML_RPC
|
||||
BUILD_TARGETS += rpc-server
|
||||
endif
|
||||
|
||||
default: $(BUILD_TARGETS)
|
||||
default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
@failures=0; \
|
||||
@ -228,7 +232,7 @@ test: $(TEST_TARGETS)
|
||||
fi
|
||||
@echo 'All tests passed.'
|
||||
|
||||
all: $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)
|
||||
|
||||
ifdef RISCV_CROSS_COMPILE
|
||||
CC := riscv64-unknown-linux-gnu-gcc
|
||||
@ -245,17 +249,22 @@ MK_CFLAGS = -std=c11 -fPIC
|
||||
MK_CXXFLAGS = -std=c++11 -fPIC
|
||||
MK_NVCCFLAGS = -std=c++11
|
||||
|
||||
ifndef LLAMA_NO_CCACHE
|
||||
ifdef LLAMA_NO_CCACHE
|
||||
GGML_NO_CCACHE := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifndef GGML_NO_CCACHE
|
||||
CCACHE := $(shell which ccache)
|
||||
ifdef CCACHE
|
||||
export CCACHE_SLOPPINESS = time_macros
|
||||
$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
|
||||
$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
|
||||
CC := $(CCACHE) $(CC)
|
||||
CXX := $(CCACHE) $(CXX)
|
||||
else
|
||||
$(info I ccache not found. Consider installing it for faster compilation.)
|
||||
endif # CCACHE
|
||||
endif # LLAMA_NO_CCACHE
|
||||
endif # GGML_NO_CCACHE
|
||||
|
||||
# clock_gettime came in POSIX.1b (1993)
|
||||
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
|
||||
@ -545,7 +554,7 @@ endif # GGML_BLIS
|
||||
|
||||
ifndef GGML_NO_LLAMAFILE
|
||||
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
||||
OBJ_GGML += ggml/src/sgemm.o
|
||||
OBJ_GGML += ggml/src/llamafile/sgemm.o
|
||||
endif
|
||||
|
||||
ifdef GGML_RPC
|
||||
@ -826,7 +835,8 @@ OBJ_GGML += \
|
||||
ggml/src/ggml.o \
|
||||
ggml/src/ggml-alloc.o \
|
||||
ggml/src/ggml-backend.o \
|
||||
ggml/src/ggml-quants.o
|
||||
ggml/src/ggml-quants.o \
|
||||
ggml/src/ggml-aarch64.o
|
||||
|
||||
OBJ_LLAMA = \
|
||||
src/llama.o \
|
||||
@ -926,6 +936,7 @@ $(info - LLAMA_NO_LLAMAFILE)
|
||||
$(info - LLAMA_NO_ACCELERATE)
|
||||
$(info - LLAMA_NO_OPENMP)
|
||||
$(info - LLAMA_NO_METAL)
|
||||
$(info - LLAMA_NO_CCACHE)
|
||||
$(info )
|
||||
endif
|
||||
|
||||
@ -959,15 +970,22 @@ ggml/src/ggml-quants.o: \
|
||||
ggml/src/ggml-common.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml/src/ggml-aarch64.o: \
|
||||
ggml/src/ggml-aarch64.c \
|
||||
ggml/include/ggml.h \
|
||||
ggml/src/ggml-aarch64.h \
|
||||
ggml/src/ggml-common.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml/src/ggml-blas.o: \
|
||||
ggml/src/ggml-blas.cpp \
|
||||
ggml/include/ggml-blas.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
ifndef GGML_NO_LLAMAFILE
|
||||
ggml/src/sgemm.o: \
|
||||
ggml/src/sgemm.cpp \
|
||||
ggml/src/sgemm.h \
|
||||
ggml/src/llamafile/sgemm.o: \
|
||||
ggml/src/llamafile/sgemm.cpp \
|
||||
ggml/src/llamafile/sgemm.h \
|
||||
ggml/include/ggml.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif # GGML_NO_LLAMAFILE
|
||||
@ -1092,7 +1110,7 @@ clean:
|
||||
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
|
||||
rm -rvf $(BUILD_TARGETS)
|
||||
rm -rvf $(TEST_TARGETS)
|
||||
rm -rvf $(LEGACY_TARGETS)
|
||||
rm -rvf $(LEGACY_TARGETS_CLEAN)
|
||||
find examples pocs -type f -name "*.o" -delete
|
||||
|
||||
#
|
||||
@ -1488,3 +1506,61 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
|
||||
$(OBJ_GGML)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
#
|
||||
# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
|
||||
#
|
||||
# Mark legacy binary targets as .PHONY so that they are always checked.
|
||||
.PHONY: main quantize perplexity embedding server finetune
|
||||
|
||||
# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
|
||||
# Eventually we will want to remove these target from building all the time.
|
||||
main: examples/deprecation-warning/deprecation-warning.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
|
||||
|
||||
server: examples/deprecation-warning/deprecation-warning.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
|
||||
|
||||
quantize: examples/deprecation-warning/deprecation-warning.cpp
|
||||
ifneq (,$(wildcard quantize))
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@echo "#########"
|
||||
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
|
||||
@echo " Remove the 'quantize' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
|
||||
perplexity: examples/deprecation-warning/deprecation-warning.cpp
|
||||
ifneq (,$(wildcard perplexity))
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@echo "#########"
|
||||
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
|
||||
@echo " Remove the 'perplexity' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
|
||||
embedding: examples/deprecation-warning/deprecation-warning.cpp
|
||||
ifneq (,$(wildcard embedding))
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@echo "#########"
|
||||
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
|
||||
@echo " Remove the 'embedding' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
|
||||
finetune: examples/deprecation-warning/deprecation-warning.cpp
|
||||
ifneq (,$(wildcard finetune))
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@echo "#########"
|
||||
@echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
|
||||
@echo " Remove the 'finetune' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
|
@ -10,6 +10,7 @@ var sources = [
|
||||
"ggml/src/ggml-alloc.c",
|
||||
"ggml/src/ggml-backend.c",
|
||||
"ggml/src/ggml-quants.c",
|
||||
"ggml/src/ggml-aarch64.c",
|
||||
]
|
||||
|
||||
var resources: [Resource] = []
|
||||
|
@ -96,8 +96,9 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
||||
- [x] [OLMo](https://allenai.org/olmo)
|
||||
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
|
||||
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
|
||||
|
||||
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
|
||||
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
|
||||
|
||||
**Multimodal models:**
|
||||
|
||||
@ -452,7 +453,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
|
||||
- [How to build](./docs/build.md)
|
||||
- [Running on Docker](./docs/docker.md)
|
||||
- [Build on Android](./docs/android.md)
|
||||
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
|
||||
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
|
||||
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
|
||||
|
||||
**Seminal papers and background on the models**
|
||||
|
@ -1,3 +1,7 @@
|
||||
#if defined(_MSC_VER)
|
||||
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
||||
#endif
|
||||
|
||||
#include "common.h"
|
||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
|
@ -630,7 +630,7 @@ inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
|
||||
buf << "[ ";
|
||||
|
||||
bool first = true;
|
||||
for (const auto &token : tokens)
|
||||
for (const auto & token : tokens)
|
||||
{
|
||||
if (!first) {
|
||||
buf << ", ";
|
||||
|
@ -282,8 +282,6 @@ static llama_token llama_sampling_sample_impl(
|
||||
GGML_ASSERT(!original_logits.empty());
|
||||
}
|
||||
llama_token id = 0;
|
||||
// Get a pointer to the logits
|
||||
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||
|
||||
if (temp < 0.0) {
|
||||
// greedy sampling, with probs
|
||||
@ -324,6 +322,9 @@ static llama_token llama_sampling_sample_impl(
|
||||
}
|
||||
|
||||
if (ctx_sampling->grammar != NULL && !is_resampling) {
|
||||
// Get a pointer to the logits
|
||||
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||
|
||||
// Create an array with a single token data element for the sampled id
|
||||
llama_token_data single_token_data = {id, logits[id], 0.0f};
|
||||
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
|
||||
@ -377,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
|
||||
if (ctx_sampling->grammar != NULL && !apply_grammar) {
|
||||
GGML_ASSERT(original_logits != NULL);
|
||||
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
|
||||
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
|
||||
*original_logits = {logits, logits + n_vocab};
|
||||
}
|
||||
|
||||
// apply params.logit_bias map
|
||||
@ -390,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
|
||||
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
|
||||
}
|
||||
|
||||
cur.clear();
|
||||
cur.resize(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
|
||||
|
@ -1400,7 +1400,7 @@ class LlamaModel(Model):
|
||||
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self. _set_vocab_sentencepiece()
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
try:
|
||||
self._set_vocab_llama_hf()
|
||||
@ -2187,6 +2187,9 @@ class InternLM2Model(Model):
|
||||
toktype = SentencePieceTokenTypes.UNUSED
|
||||
elif tokenizer.IsByte(token_id):
|
||||
toktype = SentencePieceTokenTypes.BYTE
|
||||
# take care of ununsed raw token
|
||||
if piece.startswith('[UNUSED'):
|
||||
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
@ -2202,6 +2205,47 @@ class InternLM2Model(Model):
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
|
||||
|
||||
chat_eos_token = '<|im_end|>'
|
||||
chat_eos_token_id = None
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
|
||||
for token_id, foken_data in added_tokens_decoder.items():
|
||||
token_id = int(token_id)
|
||||
token = foken_data["content"]
|
||||
if token == chat_eos_token:
|
||||
chat_eos_token_id = token_id
|
||||
token = token.encode("utf-8")
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||
assert(tokens[token_id] == token)
|
||||
tokens[token_id] = token
|
||||
scores[token_id] = -1000.0
|
||||
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||
if foken_data.get("special"):
|
||||
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||
|
||||
tokenizer_file = self.dir_model / 'tokenizer.json'
|
||||
if tokenizer_file.is_file():
|
||||
with open(tokenizer_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
added_tokens = tokenizer_json.get("added_tokens", [])
|
||||
for foken_data in added_tokens:
|
||||
token_id = int(foken_data["id"])
|
||||
token = foken_data["content"]
|
||||
if token == chat_eos_token:
|
||||
chat_eos_token_id = token_id
|
||||
token = token.encode("utf-8")
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||
assert(tokens[token_id] == token)
|
||||
tokens[token_id] = token
|
||||
scores[token_id] = -1000.0
|
||||
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||
if foken_data.get("special"):
|
||||
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
@ -2211,28 +2255,16 @@ class InternLM2Model(Model):
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
old_eos = special_vocab.special_token_ids["eos"]
|
||||
if "chat" in os.path.basename(self.dir_model.absolute()):
|
||||
if chat_eos_token_id is not None:
|
||||
# For the chat model, we replace the eos with '<|im_end|>'.
|
||||
# TODO: this is a hack, should be fixed
|
||||
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
|
||||
special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
|
||||
logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
|
||||
in chat mode so that the conversation can end normally.")
|
||||
special_vocab.special_token_ids["eos"] = chat_eos_token_id
|
||||
logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
|
||||
" in chat mode so that the conversation can end normally.")
|
||||
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _try_get_sft_eos(self, tokenizer):
|
||||
unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
|
||||
im_end_list = tokenizer.Encode('<|im_end|>')
|
||||
eos_token = None
|
||||
assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
|
||||
if len(unused_145_list) == 1:
|
||||
eos_token = unused_145_list[0]
|
||||
if len(im_end_list) == 1:
|
||||
eos_token = im_end_list[0]
|
||||
assert eos_token
|
||||
return eos_token
|
||||
|
||||
def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
n_head = n_head_kv
|
||||
@ -2251,6 +2283,10 @@ in chat mode so that the conversation can end normally.")
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
num_heads = self.hparams["num_attention_heads"]
|
||||
|
@ -28,6 +28,7 @@ In order to build llama.cpp you have four different options.
|
||||
```
|
||||
|
||||
- Notes:
|
||||
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
|
||||
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
|
||||
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
||||
- For debug builds, run `make LLAMA_DEBUG=1`
|
||||
@ -41,6 +42,7 @@ In order to build llama.cpp you have four different options.
|
||||
|
||||
**Notes**:
|
||||
|
||||
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
|
||||
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
|
||||
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
||||
- For debug builds, there are two cases:
|
||||
|
51
examples/deprecation-warning/README.md
Normal file
51
examples/deprecation-warning/README.md
Normal file
@ -0,0 +1,51 @@
|
||||
# Migration notice for binary filenames
|
||||
|
||||
> [!IMPORTANT]
|
||||
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
|
||||
|
||||
This migration was important, but it is a breaking change that may not always be immediately obvious to users.
|
||||
|
||||
Please update all scripts and workflows to use the new binary names.
|
||||
|
||||
| Old Filename | New Filename |
|
||||
| ---- | ---- |
|
||||
| main | llama-cli |
|
||||
| server | llama-server |
|
||||
| llama-bench | llama-bench |
|
||||
| embedding | llama-embedding |
|
||||
| finetune | llama-finetune |
|
||||
| quantize | llama-quantize |
|
||||
| tokenize | llama-tokenize |
|
||||
| export-lora | llama-export-lora |
|
||||
| libllava.a | libllava.a |
|
||||
| baby-llama | llama-baby-llama |
|
||||
| batched | llama-batched |
|
||||
| batched-bench | llama-batched-bench |
|
||||
| benchmark-matmult | llama-benchmark-matmult |
|
||||
| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
|
||||
| eval-callback | llama-eval-callback |
|
||||
| gbnf-validator | llama-gbnf-validator |
|
||||
| gguf | llama-gguf |
|
||||
| gguf-split | llama-gguf-split |
|
||||
| gritlm | llama-gritlm |
|
||||
| imatrix | llama-imatrix |
|
||||
| infill | llama-infill |
|
||||
| llava-cli | llama-llava-cli |
|
||||
| lookahead | llama-lookahead |
|
||||
| lookup | llama-lookup |
|
||||
| lookup-create | llama-lookup-create |
|
||||
| lookup-merge | llama-lookup-merge |
|
||||
| lookup-stats | llama-lookup-stats |
|
||||
| parallel | llama-parallel |
|
||||
| passkey | llama-passkey |
|
||||
| perplexity | llama-perplexity |
|
||||
| q8dot | llama-q8dot |
|
||||
| quantize-stats | llama-quantize-stats |
|
||||
| retrieval | llama-retrieval |
|
||||
| save-load-state | llama-save-load-state |
|
||||
| simple | llama-simple |
|
||||
| speculative | llama-speculative |
|
||||
| train-text-from-scratch | llama-train-text-from-scratch |
|
||||
| vdot | llama-vdot |
|
||||
| tests/test-c.o | tests/test-c.o |
|
||||
|
35
examples/deprecation-warning/deprecation-warning.cpp
Normal file
35
examples/deprecation-warning/deprecation-warning.cpp
Normal file
@ -0,0 +1,35 @@
|
||||
// Warns users that this filename was deprecated, and provides a link for more information.
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
// Main
|
||||
int main(int argc, char** argv) {
|
||||
std::string filename = "main";
|
||||
if (argc >= 1) {
|
||||
filename = argv[0];
|
||||
}
|
||||
|
||||
// Get only the program name from the full path
|
||||
auto pos = filename.find_last_of('/');
|
||||
if (pos != std::string::npos) {
|
||||
filename = filename.substr(pos+1);
|
||||
}
|
||||
|
||||
// Append "llama-" to the beginning of filename to get the replacemnt filename
|
||||
auto replacement_filename = "llama-" + filename;
|
||||
|
||||
// The exception is if the filename is "main", then our replacement filename is "llama-cli"
|
||||
if (filename == "main") {
|
||||
replacement_filename = "llama-cli";
|
||||
}
|
||||
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
|
||||
fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
|
||||
fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
|
||||
fprintf(stdout, "\n");
|
||||
|
||||
return EXIT_FAILURE;
|
||||
}
|
@ -204,21 +204,17 @@ int main(int argc, char ** argv) {
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
bool suff_rm_leading_spc = params.escape;
|
||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
||||
params.input_suffix.erase(0, 1);
|
||||
suff_rm_leading_spc = false;
|
||||
}
|
||||
std::vector<llama_token> embd_inp;
|
||||
std::vector<llama_token> embd_end;
|
||||
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
|
||||
const int space_token = 29871;
|
||||
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
|
||||
inp_sfx.erase(inp_sfx.begin());
|
||||
}
|
||||
|
||||
GGML_ASSERT(llama_token_prefix(model) >= 0);
|
||||
GGML_ASSERT(llama_token_suffix(model) >= 0);
|
||||
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
||||
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
@ -516,19 +512,14 @@ int main(int argc, char ** argv) {
|
||||
string_process_escapes(params.input_prefix);
|
||||
string_process_escapes(params.input_suffix);
|
||||
}
|
||||
suff_rm_leading_spc = params.escape;
|
||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
||||
params.input_suffix.erase(0, 1);
|
||||
suff_rm_leading_spc = false;
|
||||
}
|
||||
|
||||
// tokenize new prefix and suffix
|
||||
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
|
||||
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
|
||||
inp_sfx.erase(inp_sfx.begin());
|
||||
}
|
||||
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
||||
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
|
@ -46,6 +46,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
|
||||
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
|
||||
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
|
@ -884,7 +884,8 @@ struct server_context {
|
||||
|
||||
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
|
||||
slot_params default_params;
|
||||
llama_sampling_params default_sparams;
|
||||
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
|
||||
llama_sampling_params default_sparams = params.sparams;
|
||||
auto & data = task.data;
|
||||
|
||||
if (data.count("__oaicompat") != 0) {
|
||||
|
20
flake.lock
20
flake.lock
@ -5,11 +5,11 @@
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1717285511,
|
||||
"narHash": "sha256-iKzJcpdXih14qYVcZ9QC9XuZYnPc6T8YImb6dX166kw=",
|
||||
"lastModified": 1719994518,
|
||||
"narHash": "sha256-pQMhCCHyQGRzdfAkdJ4cIWiw+JNuWsTX7f0ZYSyz0VY=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "2a55567fcf15b1b1c7ed712a2c6fadaec7412ea8",
|
||||
"rev": "9227223f6d922fee3c7b190b2cc238a99527bbb7",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1719506693,
|
||||
"narHash": "sha256-C8e9S7RzshSdHB7L+v9I51af1gDM5unhJ2xO1ywxNH8=",
|
||||
"lastModified": 1720031269,
|
||||
"narHash": "sha256-rwz8NJZV+387rnWpTYcXaRNvzUSnnF9aHONoJIYmiUQ=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "b2852eb9365c6de48ffb0dc2c9562591f652242a",
|
||||
"rev": "9f4128e00b0ae8ec65918efeba59db998750ead6",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -36,14 +36,14 @@
|
||||
},
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"lastModified": 1717284937,
|
||||
"narHash": "sha256-lIbdfCsf8LMFloheeE6N31+BMIeixqyQWbSr2vk79EQ=",
|
||||
"lastModified": 1719876945,
|
||||
"narHash": "sha256-Fm2rDDs86sHy0/1jxTOKB1118Q0O3Uc7EC0iXvXKpbI=",
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz"
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
|
||||
},
|
||||
"original": {
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz"
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
|
@ -104,7 +104,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
|
||||
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
|
||||
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
|
||||
"ggml: BLAS library vendor")
|
||||
option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF)
|
||||
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF)
|
||||
|
||||
option(GGML_CUDA "ggml: use CUDA" OFF)
|
||||
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||
|
@ -383,6 +383,9 @@ extern "C" {
|
||||
GGML_TYPE_F64 = 28,
|
||||
GGML_TYPE_IQ1_M = 29,
|
||||
GGML_TYPE_BF16 = 30,
|
||||
GGML_TYPE_Q4_0_4_4 = 31,
|
||||
GGML_TYPE_Q4_0_4_8 = 32,
|
||||
GGML_TYPE_Q4_0_8_8 = 33,
|
||||
GGML_TYPE_COUNT,
|
||||
};
|
||||
|
||||
@ -424,6 +427,9 @@ extern "C" {
|
||||
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
@ -2406,6 +2412,12 @@ extern "C" {
|
||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||
typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr,
|
||||
int64_t k, int64_t bx);
|
||||
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
|
||||
typedef struct {
|
||||
const char * type_name;
|
||||
@ -2418,6 +2430,11 @@ extern "C" {
|
||||
ggml_vec_dot_t vec_dot;
|
||||
enum ggml_type vec_dot_type;
|
||||
int64_t nrows; // number of rows to process simultaneously;
|
||||
int64_t ncols; // number of columns to process simultaneously;
|
||||
int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
|
||||
ggml_from_float_to_mat_t from_float_to_mat;
|
||||
ggml_gemv_t gemv;
|
||||
ggml_gemm_t gemm;
|
||||
} ggml_type_traits_t;
|
||||
|
||||
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||
|
@ -238,12 +238,12 @@ if (GGML_BLAS)
|
||||
endif()
|
||||
|
||||
if (GGML_LLAMAFILE)
|
||||
message(STATUS "Using ggml SGEMM")
|
||||
message(STATUS "Using llamafile")
|
||||
|
||||
add_compile_definitions(GGML_USE_LLAMAFILE)
|
||||
|
||||
set(GGML_HEADERS_LLAMAFILE sgemm.h)
|
||||
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
|
||||
set(GGML_HEADERS_LLAMAFILE llamafile/sgemm.h)
|
||||
set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA)
|
||||
@ -1153,6 +1153,7 @@ add_library(ggml
|
||||
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
|
||||
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
|
||||
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
|
||||
ggml-aarch64.c ggml-aarch64.h
|
||||
)
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
|
2187
ggml/src/ggml-aarch64.c
Normal file
2187
ggml/src/ggml-aarch64.c
Normal file
File diff suppressed because it is too large
Load Diff
39
ggml/src/ggml-aarch64.h
Normal file
39
ggml/src/ggml-aarch64.h
Normal file
@ -0,0 +1,39 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
|
||||
#pragma once
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML internal header
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Quantization
|
||||
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t interleave_blcksize);
|
||||
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// GEMV
|
||||
void ggml_gemv_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
// GEMM
|
||||
void ggml_gemm_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -199,6 +199,30 @@ typedef struct {
|
||||
} block_q8_1;
|
||||
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[4]; // deltas for 4 q4_0 blocks
|
||||
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
|
||||
} block_q4_0x4;
|
||||
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[8]; // deltas for 8 q4_0 blocks
|
||||
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
|
||||
} block_q4_0x8;
|
||||
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[4]; // deltas for 4 q8_0 blocks
|
||||
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
|
||||
} block_q8_0x4;
|
||||
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[8]; // deltas for 8 q8_0 blocks
|
||||
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
|
||||
} block_q8_0x8;
|
||||
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
|
||||
|
||||
//
|
||||
// Super-block quantization structures
|
||||
//
|
||||
|
@ -29,6 +29,7 @@
|
||||
#include "ggml-cuda/tsembd.cuh"
|
||||
#include "ggml-cuda/unary.cuh"
|
||||
#include "ggml-cuda/upscale.cuh"
|
||||
#include "ggml-cuda/conv-transpose-1d.cuh"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
@ -2261,6 +2262,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_IM2COL:
|
||||
ggml_cuda_op_im2col(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
ggml_cuda_op_conv_transpose_1d(ctx,dst);
|
||||
break;
|
||||
case GGML_OP_POOL_2D:
|
||||
ggml_cuda_op_pool2d(ctx, dst);
|
||||
break;
|
||||
@ -2804,6 +2808,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
{
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
ggml_type src1_type = op->src[1]->type;
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
|
87
ggml/src/ggml-cuda/conv-transpose-1d.cu
Normal file
87
ggml/src/ggml-cuda/conv-transpose-1d.cu
Normal file
@ -0,0 +1,87 @@
|
||||
#include "conv-transpose-1d.cuh"
|
||||
|
||||
static __global__ void conv_transpose_1d_kernel(
|
||||
const int s0, const int p0, const int d0, const int output_size,
|
||||
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
|
||||
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
|
||||
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
|
||||
const float * src0, const float * src1, float * dst) {
|
||||
int global_index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (global_index >= output_size) {
|
||||
return;
|
||||
}
|
||||
|
||||
int out_index = global_index / dst_ne0;
|
||||
|
||||
float accumulator = 0;
|
||||
|
||||
for (int c = 0; c < src0_ne2; c++) {
|
||||
int idx = global_index % dst_ne0;
|
||||
|
||||
int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
|
||||
int input_offset = src1_ne0 * c;
|
||||
|
||||
for (int i = 0; i < src1_ne0; i++) {
|
||||
if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
|
||||
continue;
|
||||
}
|
||||
int weight_idx = idx - i*s0;
|
||||
|
||||
float kernel_weight = src0[kernel_offset + weight_idx];
|
||||
float input_value = src1[input_offset+i];
|
||||
|
||||
accumulator += kernel_weight * input_value;
|
||||
}
|
||||
}
|
||||
dst[global_index] = accumulator;
|
||||
}
|
||||
|
||||
static void conv_transpose_1d_f32_f32_cuda(
|
||||
const int s0, const int p0, const int d0, const int output_size,
|
||||
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
|
||||
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
|
||||
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
|
||||
const float * src0, const float * src1, float * dst,
|
||||
cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (output_size + CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE;
|
||||
conv_transpose_1d_kernel<<<num_blocks,CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 0, stream>>>(
|
||||
s0,p0,d0,output_size,
|
||||
src0_ne0, src0_ne1, src0_ne2, src0_ne3,
|
||||
src1_ne0, src1_ne1, src1_ne2, src1_ne3,
|
||||
dst_ne0, dst_ne1, dst_ne2, dst_ne3,
|
||||
src0,src1, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
const float * src1_d = (const float *)src1->data;
|
||||
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
|
||||
const int32_t * opts = (const int32_t *)dst->op_params;
|
||||
|
||||
const int s0 = opts[0];
|
||||
const int p0 = 0;//opts[3];
|
||||
const int d0 = 1;//opts[4];
|
||||
|
||||
const int64_t kernel_size = ggml_nelements(src0);
|
||||
const int64_t input_size = ggml_nelements(src1);
|
||||
const int64_t output_size = ggml_nelements(dst);
|
||||
|
||||
conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
src0_d, src1_d, dst_d, stream);
|
||||
}
|
5
ggml/src/ggml-cuda/conv-transpose-1d.cuh
Normal file
5
ggml/src/ggml-cuda/conv-transpose-1d.cuh
Normal file
@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
@ -609,6 +609,10 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
|
||||
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
// precomputed f32 table for f16 (256 KB)
|
||||
// defined in ggml.c, initialized in ggml_init()
|
||||
extern float ggml_table_f32_f16[1 << 16];
|
||||
|
@ -3814,43 +3814,47 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
|
||||
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
|
||||
if (svcntb() == QK8_0) {
|
||||
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
|
||||
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
|
||||
|
||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||
|
||||
assert(nb % 2 == 0); // TODO: handle odd nb
|
||||
assert(nb % 2 == 0); // TODO: handle odd nb
|
||||
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_q4_0 * restrict x0 = &x[i + 0];
|
||||
const block_q4_0 * restrict x1 = &x[i + 1];
|
||||
const block_q8_0 * restrict y0 = &y[i + 0];
|
||||
const block_q8_0 * restrict y1 = &y[i + 1];
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_q4_0 * restrict x0 = &x[i + 0];
|
||||
const block_q4_0 * restrict x1 = &x[i + 1];
|
||||
const block_q8_0 * restrict y0 = &y[i + 0];
|
||||
const block_q8_0 * restrict y1 = &y[i + 1];
|
||||
|
||||
// load x
|
||||
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
|
||||
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
||||
// load x
|
||||
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
|
||||
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
|
||||
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
|
||||
// 4-bit -> 8-bit
|
||||
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
|
||||
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
|
||||
|
||||
// sub 8
|
||||
const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
|
||||
const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
|
||||
// sub 8
|
||||
const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
|
||||
const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
|
||||
|
||||
// load y
|
||||
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||
// load y
|
||||
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||
|
||||
// dot product
|
||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
// dot product
|
||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||
return;
|
||||
}
|
||||
|
||||
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||
#elif defined(__ARM_NEON)
|
||||
#endif
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
@ -5422,31 +5426,35 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||
if (svcntb() == QK8_0) {
|
||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||
|
||||
assert(nb % 2 == 0); // TODO: handle odd nb
|
||||
assert(nb % 2 == 0); // TODO: handle odd nb
|
||||
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_q8_0 * restrict x0 = &x[i + 0];
|
||||
const block_q8_0 * restrict x1 = &x[i + 1];
|
||||
const block_q8_0 * restrict y0 = &y[i + 0];
|
||||
const block_q8_0 * restrict y1 = &y[i + 1];
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_q8_0 * restrict x0 = &x[i + 0];
|
||||
const block_q8_0 * restrict x1 = &x[i + 1];
|
||||
const block_q8_0 * restrict y0 = &y[i + 0];
|
||||
const block_q8_0 * restrict y1 = &y[i + 1];
|
||||
|
||||
// load x
|
||||
const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
|
||||
const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
|
||||
// load x
|
||||
const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
|
||||
const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
|
||||
|
||||
// load y
|
||||
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||
// load y
|
||||
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||
|
||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||
return;
|
||||
}
|
||||
|
||||
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||
#elif defined(__ARM_NEON)
|
||||
#endif
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
@ -14760,6 +14768,16 @@ static bool validate_fp16(ggml_fp16_t f, size_t i) {
|
||||
} \
|
||||
}
|
||||
|
||||
#define VALIDATE_ROW_DATA_DVEC_F16_IMPL(type, data, nb, nr) \
|
||||
const type * q = (const type *) (data); \
|
||||
for (size_t i = 0; i < (nb); ++i) { \
|
||||
for (size_t j = 0; j < (nr); ++j) { \
|
||||
if (!validate_fp16(q[i].d[j], i)) { \
|
||||
return false; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
|
||||
if (type < 0 || type >= GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "%s: invalid type %d\n", __func__, type);
|
||||
@ -14977,6 +14995,16 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
|
||||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
{
|
||||
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
|
||||
} break;
|
||||
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
case GGML_TYPE_I32:
|
||||
|
@ -3658,6 +3658,10 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||
use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
|
||||
#endif // SYCL_USE_XMX
|
||||
|
||||
// mmvq path is faster in the CUDA backend.
|
||||
if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda)
|
||||
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
||||
|
||||
if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||
// KQ single-batch
|
||||
ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst);
|
||||
|
@ -346,4 +346,10 @@ inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
|
||||
return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr);
|
||||
}
|
||||
|
||||
// Helper for accessing pointers with no warnings
|
||||
template <typename Tp, int dim>
|
||||
static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
|
||||
return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
|
||||
}
|
||||
|
||||
#endif // GGML_SYCL_COMMON_HPP
|
||||
|
@ -158,7 +158,7 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
|
||||
sycl::range<3>(1, 1, 32),
|
||||
sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q4_K(vx, y, scale_local_acc.get_pointer(), item_ct1);
|
||||
dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -1835,10 +1835,10 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q4_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_qs_q4_0_acc_ct1.get_pointer(),
|
||||
tile_x_d_q4_0_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_qs_q4_0_acc_ct1),
|
||||
get_pointer(tile_x_d_q4_0_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -1870,10 +1870,10 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q4_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_qs_q4_0_acc_ct1.get_pointer(),
|
||||
tile_x_d_q4_0_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_qs_q4_0_acc_ct1),
|
||||
get_pointer(tile_x_d_q4_0_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -1950,10 +1950,10 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q4_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_qs_q4_1_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q4_1_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_qs_q4_1_acc_ct1),
|
||||
get_pointer(tile_x_dm_q4_1_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -1985,10 +1985,10 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q4_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_qs_q4_1_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q4_1_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_qs_q4_1_acc_ct1),
|
||||
get_pointer(tile_x_dm_q4_1_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2065,10 +2065,10 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q5_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q5_0_acc_ct1.get_pointer(),
|
||||
tile_x_d_q5_0_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q5_0_acc_ct1),
|
||||
get_pointer(tile_x_d_q5_0_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2100,10 +2100,10 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q5_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q5_0_acc_ct1.get_pointer(),
|
||||
tile_x_d_q5_0_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q5_0_acc_ct1),
|
||||
get_pointer(tile_x_d_q5_0_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2180,10 +2180,10 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q5_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q5_1_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q5_1_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q5_1_acc_ct1),
|
||||
get_pointer(tile_x_dm_q5_1_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2215,10 +2215,10 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q5_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q5_1_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q5_1_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q5_1_acc_ct1),
|
||||
get_pointer(tile_x_dm_q5_1_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2295,10 +2295,10 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q8_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_qs_q8_0_acc_ct1.get_pointer(),
|
||||
tile_x_d_q8_0_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_qs_q8_0_acc_ct1),
|
||||
get_pointer(tile_x_d_q8_0_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2330,10 +2330,10 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q8_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_qs_q8_0_acc_ct1.get_pointer(),
|
||||
tile_x_d_q8_0_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_qs_q8_0_acc_ct1),
|
||||
get_pointer(tile_x_d_q8_0_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2412,11 +2412,11 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q2_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q2_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q2_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q2_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q2_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q2_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q2_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2450,11 +2450,11 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q2_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q2_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q2_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q2_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q2_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q2_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q2_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2537,12 +2537,12 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q3_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q3_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q3_K_acc_ct1.get_pointer(),
|
||||
tile_x_qh_q3_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q3_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q3_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q3_K_acc_ct1),
|
||||
get_pointer(tile_x_qh_q3_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q3_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2578,12 +2578,12 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q3_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q3_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q3_K_acc_ct1.get_pointer(),
|
||||
tile_x_qh_q3_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q3_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q3_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q3_K_acc_ct1),
|
||||
get_pointer(tile_x_qh_q3_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q3_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2663,11 +2663,11 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q4_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q4_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q4_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q4_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q4_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q4_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q4_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2701,11 +2701,11 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q4_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q4_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q4_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q4_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q4_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q4_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q4_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2784,11 +2784,11 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q5_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q5_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q5_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q5_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q5_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q5_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q5_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2822,11 +2822,11 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q5_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_q5_K_acc_ct1.get_pointer(),
|
||||
tile_x_dm_q5_K_acc_ct1.get_pointer(),
|
||||
tile_x_sc_q5_K_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_q5_K_acc_ct1),
|
||||
get_pointer(tile_x_dm_q5_K_acc_ct1),
|
||||
get_pointer(tile_x_sc_q5_K_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2905,11 +2905,11 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q6_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_acc_ct1.get_pointer(),
|
||||
tile_x_dm_acc_ct1.get_pointer(),
|
||||
tile_x_sc_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_acc_ct1),
|
||||
get_pointer(tile_x_dm_acc_ct1),
|
||||
get_pointer(tile_x_sc_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -2943,11 +2943,11 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mul_mat_q6_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
tile_x_ql_acc_ct1.get_pointer(),
|
||||
tile_x_dm_acc_ct1.get_pointer(),
|
||||
tile_x_sc_acc_ct1.get_pointer(),
|
||||
tile_y_qs_acc_ct1.get_pointer(),
|
||||
tile_y_ds_acc_ct1.get_pointer());
|
||||
get_pointer(tile_x_ql_acc_ct1),
|
||||
get_pointer(tile_x_dm_acc_ct1),
|
||||
get_pointer(tile_x_sc_acc_ct1),
|
||||
get_pointer(tile_y_qs_acc_ct1),
|
||||
get_pointer(tile_y_ds_acc_ct1));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -218,7 +218,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -265,7 +265,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
group_norm_f32(x, dst, group_size, ne_elements,
|
||||
eps_ct4, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
@ -306,7 +306,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -55,7 +55,7 @@ static void rope_norm(
|
||||
const int i = row*ne0 + i0;
|
||||
const int i2 = row/p_delta_rows;
|
||||
|
||||
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
|
||||
const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
|
||||
|
||||
@ -98,7 +98,7 @@ static void rope_neox(
|
||||
const int i = row*ne0 + i0/2;
|
||||
const int i2 = row/p_delta_rows;
|
||||
|
||||
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
|
||||
const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
|
||||
|
||||
|
@ -136,7 +136,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, float *
|
||||
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
|
||||
nrows_y, scale, max_bias, m0,
|
||||
m1, n_head_log2, item_ct1,
|
||||
local_buf_acc.get_pointer());
|
||||
get_pointer(local_buf_acc));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
153
ggml/src/ggml.c
153
ggml/src/ggml.c
@ -4,7 +4,7 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include "ggml-aarch64.h"
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||
@ -37,12 +37,12 @@
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#ifdef __ARM_FEATURE_MATMUL_INT8
|
||||
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
#undef GGML_USE_LLAMAFILE
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_LLAMAFILE
|
||||
#include "sgemm.h"
|
||||
#include <llamafile/sgemm.h>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
@ -692,6 +692,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
#else
|
||||
.nrows = 1,
|
||||
#endif
|
||||
.from_float_to_mat = quantize_mat_q8_0,
|
||||
},
|
||||
[GGML_TYPE_Q8_1] = {
|
||||
.type_name = "q8_1",
|
||||
@ -889,6 +890,54 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
|
||||
.vec_dot_type = GGML_TYPE_BF16,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_4_4] = {
|
||||
.type_name = "q4_0_4x4",
|
||||
.blck_size = QK4_0,
|
||||
.type_size = sizeof(block_q4_0),
|
||||
.is_quantized = true,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 4,
|
||||
.interleave_blcksize = 4,
|
||||
.gemv = ggml_gemv_q4_0_4x4_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_4x4_q8_0,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_4_8] = {
|
||||
.type_name = "q4_0_4x8",
|
||||
.blck_size = QK4_0,
|
||||
.type_size = sizeof(block_q4_0),
|
||||
.is_quantized = true,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 4,
|
||||
.interleave_blcksize = 8,
|
||||
.gemv = ggml_gemv_q4_0_4x8_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_4x8_q8_0,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_8_8] = {
|
||||
.type_name = "q4_0_8x8",
|
||||
.blck_size = QK4_0,
|
||||
.type_size = sizeof(block_q4_0),
|
||||
.is_quantized = true,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 8,
|
||||
.interleave_blcksize = 8,
|
||||
.gemv = ggml_gemv_q4_0_8x8_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_8x8_q8_0,
|
||||
}
|
||||
};
|
||||
|
||||
@ -3188,6 +3237,9 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
||||
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
|
||||
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
||||
}
|
||||
@ -9432,6 +9484,9 @@ static void ggml_compute_forward_add(
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
ggml_compute_forward_add_q_f32(params, dst);
|
||||
} break;
|
||||
@ -9807,6 +9862,9 @@ static void ggml_compute_forward_add1(
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
ggml_compute_forward_add1_q_f32(params, dst);
|
||||
} break;
|
||||
@ -9932,6 +9990,9 @@ static void ggml_compute_forward_acc(
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
@ -12134,6 +12195,12 @@ static void ggml_compute_forward_mul_mat(
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
||||
int64_t const matmul_num_cols = type_traits[type].ncols;
|
||||
int64_t const interleave_blcksize = type_traits[type].interleave_blcksize;
|
||||
ggml_from_float_to_mat_t const from_float_to_mat
|
||||
= type_traits[vec_dot_type].from_float_to_mat;
|
||||
ggml_gemv_t const gemv = type_traits[type].gemv;
|
||||
ggml_gemm_t const gemm = type_traits[type].gemm;
|
||||
|
||||
GGML_ASSERT(ne0 == ne01);
|
||||
GGML_ASSERT(ne1 == ne11);
|
||||
@ -12192,7 +12259,16 @@ UseGgmlGemm1:;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
||||
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
|
||||
int64_t i11_processed = 0;
|
||||
if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
|
||||
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
|
||||
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
||||
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
||||
4, ne10, interleave_blcksize);
|
||||
}
|
||||
i11_processed = ne11 - ne11 % 4;
|
||||
}
|
||||
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
|
||||
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
||||
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
||||
ne10);
|
||||
@ -12273,6 +12349,28 @@ UseGgmlGemm2:;
|
||||
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
|
||||
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
|
||||
|
||||
if ((ggml_n_dims(src0) == 2) && gemv) {
|
||||
const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||
const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11;
|
||||
int64_t src0_start = (ith * ne01) / nth;
|
||||
int64_t src0_end = ((ith + 1) * ne01) / nth;
|
||||
src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
|
||||
src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
|
||||
if (src0_start >= src0_end) return;
|
||||
|
||||
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
|
||||
if (gemm && (ne11 > 3)) {
|
||||
gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01,
|
||||
(const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
|
||||
}
|
||||
for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) {
|
||||
gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
|
||||
(const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1,
|
||||
src0_end - src0_start);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// The first chunk comes from our thread_id, the rest will get auto-assigned.
|
||||
int current_chunk = ith;
|
||||
|
||||
@ -12318,6 +12416,8 @@ static void ggml_compute_forward_mul_mat_id(
|
||||
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||
int64_t const matmul_num_cols = type_traits[type].ncols;
|
||||
ggml_gemv_t const gemv = type_traits[type].gemv;
|
||||
|
||||
// we don't support permuted src0 or src1
|
||||
GGML_ASSERT(nb00 == ggml_type_size(type));
|
||||
@ -12403,6 +12503,34 @@ static void ggml_compute_forward_mul_mat_id(
|
||||
const int64_t nr0 = ne01; // src0 rows
|
||||
const int64_t nr1 = cne1; // src1 rows
|
||||
|
||||
if (((ggml_n_dims(src0) - 1) == 2) && gemv) {
|
||||
int64_t src0_cur_start = (ith * ne01) / nth;
|
||||
int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
|
||||
src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start;
|
||||
src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end;
|
||||
if (src0_cur_start >= src0_cur_end) return;
|
||||
|
||||
for (int ir1 = 0; ir1 < nr1; ir1++) {
|
||||
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
|
||||
const int id = row_mapping.i1; // selected expert index
|
||||
|
||||
const int64_t i11 = id % ne11;
|
||||
const int64_t i12 = row_mapping.i2; // row index in src1
|
||||
|
||||
const int64_t i1 = id; // selected expert index
|
||||
const int64_t i2 = i12; // row
|
||||
|
||||
const char * src1_col = (const char *) wdata +
|
||||
(src1_cont || src1->type != vec_dot_type
|
||||
? (i11 + i12 * ne11) * row_size
|
||||
: (i11 * nb11 + i12 * nb12));
|
||||
|
||||
gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
|
||||
(const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// distribute the thread work across the inner or outer loop based on which one is larger
|
||||
|
||||
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
|
||||
@ -12704,6 +12832,9 @@ static void ggml_compute_forward_out_prod(
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
ggml_compute_forward_out_prod_q_f32(params, dst);
|
||||
} break;
|
||||
@ -12889,6 +13020,9 @@ static void ggml_compute_forward_set(
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
@ -13148,6 +13282,9 @@ static void ggml_compute_forward_get_rows(
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
ggml_compute_forward_get_rows_q(params, dst);
|
||||
} break;
|
||||
@ -13734,6 +13871,9 @@ static void ggml_compute_forward_clamp(
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q8_K:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
case GGML_TYPE_I32:
|
||||
@ -20457,6 +20597,9 @@ size_t ggml_quantize_chunk(
|
||||
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
size_t elemsize = sizeof(ggml_fp16_t);
|
||||
@ -21759,8 +21902,6 @@ int ggml_cpu_has_neon(void) {
|
||||
|
||||
int ggml_cpu_has_sve(void) {
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
// TODO: Currently, SVE 256 bit is only supported.
|
||||
GGML_ASSERT(svcntb() == QK8_0);
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
|
@ -79,5 +79,4 @@ python -m twine upload dist/*
|
||||
```
|
||||
|
||||
## TODO
|
||||
- [ ] Add tests
|
||||
- [ ] Include conversion scripts as command line entry points in this package.
|
||||
|
@ -6,7 +6,6 @@ from typing import Any, Callable
|
||||
from collections import deque
|
||||
|
||||
import numpy as np
|
||||
from numpy._typing import _Shape
|
||||
from numpy.typing import DTypeLike
|
||||
|
||||
|
||||
@ -219,7 +218,7 @@ class LazyNumpyTensor(LazyBase):
|
||||
_tensor_type = np.ndarray
|
||||
|
||||
@classmethod
|
||||
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]:
|
||||
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
|
||||
# The initial idea was to use np.nan as the fill value,
|
||||
# but non-float types like np.int16 can't use that.
|
||||
# So zero it is.
|
||||
|
@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.9.0"
|
||||
version = "0.9.1"
|
||||
description = "Read and write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
|
@ -4,7 +4,7 @@ GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.
|
||||
|
||||
## Background
|
||||
|
||||
[Bakus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
|
||||
[Backus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
|
||||
|
||||
## Basics
|
||||
|
||||
|
@ -162,6 +162,9 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
@ -63,6 +63,7 @@ while read c; do
|
||||
src/ggml*.metal \
|
||||
src/ggml*.cu \
|
||||
src/ggml-cuda/* \
|
||||
src/ggml-sycl/* \
|
||||
include/ggml*.h \
|
||||
tests/test-opt.cpp \
|
||||
tests/test-grad0.cpp \
|
||||
@ -113,6 +114,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
# src/ggml-quants.c -> ggml/src/ggml-quants.c
|
||||
# src/ggml-quants.h -> ggml/src/ggml-quants.h
|
||||
# src/ggml-rpc.cpp -> ggml/src/ggml-rpc.cpp
|
||||
# src/ggml-sycl/* -> ggml/src/ggml-sycl/
|
||||
# src/ggml-sycl.cpp -> ggml/src/ggml-sycl.cpp
|
||||
# src/ggml-vulkan.cpp -> ggml/src/ggml-vulkan.cpp
|
||||
#
|
||||
@ -153,6 +155,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \
|
||||
|
@ -1 +1 @@
|
||||
5378ea0d3c2f25bcd330ecb226ad2db454be86d0
|
||||
e3b3846976c94163f2b3dd128cc959782653edbb
|
||||
|
@ -18,6 +18,7 @@ cp -rpv ../ggml/src/ggml-metal.metal ./ggml/src/ggml-metal.metal
|
||||
cp -rpv ../ggml/src/ggml-quants.c ./ggml/src/ggml-quants.c
|
||||
cp -rpv ../ggml/src/ggml-quants.h ./ggml/src/ggml-quants.h
|
||||
cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml/src/ggml-rpc.cpp
|
||||
cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/
|
||||
cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml/src/ggml-sycl.cpp
|
||||
cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml/src/ggml-vulkan.cpp
|
||||
|
||||
|
@ -57,6 +57,12 @@
|
||||
#include <io.h>
|
||||
#endif
|
||||
|
||||
#if __cplusplus >= 202000L
|
||||
#define LU8(x) (const char*)(u8##x)
|
||||
#else
|
||||
#define LU8(x) u8##x
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
@ -3782,6 +3788,9 @@ struct llama_model_loader {
|
||||
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
||||
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
||||
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
||||
case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
|
||||
case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
|
||||
case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
@ -4475,6 +4484,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
@ -13201,6 +13213,8 @@ struct llm_build_context {
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
} else {
|
||||
GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
|
||||
|
||||
struct ggml_tensor * embd_enc = llm_build_inp_embd_enc();
|
||||
struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
|
||||
|
||||
@ -17759,6 +17773,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
|
||||
new_type == GGML_TYPE_Q4_0_8_8) {
|
||||
new_type = GGML_TYPE_Q4_0;
|
||||
}
|
||||
}
|
||||
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
|
||||
@ -18071,6 +18089,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
|
||||
|
||||
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||
}
|
||||
@ -18381,6 +18402,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
f32_data = (float *) f32_conv_buf.data();
|
||||
}
|
||||
|
||||
int chunk_size_multiplier = 1;
|
||||
if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
|
||||
if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
|
||||
else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
|
||||
if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
|
||||
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
|
||||
fflush(stdout);
|
||||
|
||||
@ -18393,7 +18422,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
static const int64_t min_chunk_size = 32 * 512;
|
||||
const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
|
||||
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
|
||||
chunk_size_multiplier;
|
||||
|
||||
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
||||
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
||||
@ -21508,12 +21538,12 @@ static int32_t llama_chat_apply_template_internal(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) {
|
||||
} else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) {
|
||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
if (role == "user") {
|
||||
ss << u8"<用户>";
|
||||
ss << LU8("<用户>");
|
||||
ss << trim(message->content);
|
||||
ss << "<AI>";
|
||||
} else {
|
||||
@ -21529,7 +21559,7 @@ static int32_t llama_chat_apply_template_internal(
|
||||
} else if (role == "user") {
|
||||
ss << "User: " << message->content << "\n\n";
|
||||
} else if (role == "assistant") {
|
||||
ss << "Assistant: " << message->content << u8"<|end▁of▁sentence|>";
|
||||
ss << "Assistant: " << message->content << LU8("<|end▁of▁sentence|>");
|
||||
}
|
||||
}
|
||||
if (add_ass) {
|
||||
|
@ -1,3 +1,7 @@
|
||||
#if defined(_MSC_VER)
|
||||
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
||||
#endif
|
||||
|
||||
#include "unicode.h"
|
||||
#include "unicode-data.h"
|
||||
|
||||
|
@ -1266,6 +1266,32 @@ struct test_pool2d : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_CONV_TRANSPOSE_1D
|
||||
struct test_conv_transpose_1d : public test_case {
|
||||
const std::array<int64_t, 4> ne_input;
|
||||
const std::array<int64_t, 4> ne_kernel;
|
||||
|
||||
const int s0; // stride
|
||||
const int p0; // padding
|
||||
const int d0; // dilation
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
|
||||
}
|
||||
|
||||
test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
|
||||
std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
|
||||
int s0 = 1, int p0 = 0, int d0 = 1)
|
||||
: ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
|
||||
ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
|
||||
ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_IM2COL
|
||||
struct test_im2col : public test_case {
|
||||
const ggml_type type_input;
|
||||
@ -1279,7 +1305,7 @@ struct test_im2col : public test_case {
|
||||
// padding
|
||||
const int p0;
|
||||
const int p1;
|
||||
// dilatation
|
||||
// dilation
|
||||
const int d0;
|
||||
const int d1;
|
||||
// mode
|
||||
@ -2098,6 +2124,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
|
||||
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
|
||||
|
||||
test_cases.emplace_back(new test_conv_transpose_1d());
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
|
||||
|
||||
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
|
||||
|
Loading…
Reference in New Issue
Block a user