Merge remote-tracking branch 'origin/master' into tool-call

This commit is contained in:
ochafik 2024-12-14 15:08:00 +00:00
commit 7bfcd0a8dd
116 changed files with 12724 additions and 1362 deletions

View File

@ -8,11 +8,11 @@ arg1="$1"
shift shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert_hf_to_gguf.py "$@" exec python3 ./convert_hf_to_gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./llama-quantize "$@" exec ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./llama-cli "$@" exec ./llama-cli "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..." echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do for i in `ls $1/$2/ggml-model-f16.bin*`; do
@ -20,11 +20,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}" echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..." echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
./llama-quantize "$i" "${i/f16/q4_0}" q4_0 exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
fi fi
done done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./llama-server "$@" exec ./llama-server "$@"
else else
echo "Unknown command: $arg1" echo "Unknown command: $arg1"
echo "Available commands: " echo "Available commands: "

View File

@ -662,6 +662,8 @@ jobs:
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'msvc-arm64' - build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'llvm-arm64-opencl-adreno'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
steps: steps:
- name: Clone - name: Clone
@ -703,6 +705,28 @@ jobs:
run: | run: |
choco install ninja choco install ninja
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
run: |
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
mkdir build && cd build
cmake .. `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build . --target install
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build-arm64-release && cd build-arm64-release
cmake .. `
-A arm64 `
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build . --target install --config release
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
@ -732,7 +756,7 @@ jobs:
- name: Test - name: Test
id: cmake_test id: cmake_test
# not all machines have native AVX-512 # not all machines have native AVX-512
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: | run: |
cd build cd build
ctest -L main -C Release --verbose --timeout 900 ctest -L main -C Release --verbose --timeout 900

View File

@ -79,7 +79,7 @@ jobs:
# Setup nodejs (to be used for verifying bundled index.html) # Setup nodejs (to be used for verifying bundled index.html)
- uses: actions/setup-node@v4 - uses: actions/setup-node@v4
with: with:
node-version: 22 node-version: '22.11.0'
- name: Verify bundled index.html - name: Verify bundled index.html
id: verify_server_index_html id: verify_server_index_html

View File

@ -1,3 +1,5 @@
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
ci/ @ggerganov /ci/ @ggerganov
/.devops/ @ngxson
/examples/server/ @ngxson

View File

@ -22,6 +22,7 @@ BUILD_TARGETS = \
llama-infill \ llama-infill \
llama-llava-cli \ llama-llava-cli \
llama-minicpmv-cli\ llama-minicpmv-cli\
llama-qwen2vl-cli\
llama-lookahead \ llama-lookahead \
llama-lookup \ llama-lookup \
llama-lookup-create \ llama-lookup-create \
@ -1411,6 +1412,14 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
$(OBJ_ALL) $(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift swift: examples/batched.swift
(cd examples/batched.swift; make build) (cd examples/batched.swift; make build)

View File

@ -110,6 +110,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM) - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny) - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
</details> </details>
@ -433,6 +434,20 @@ To learn more about model quantization, [read this documentation](examples/quant
</details> </details>
## [`llama-run`](examples/run)
#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
- <details>
<summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
```bash
llama-run granite-code
```
</details>
[^3]: [https://github.com/containers/ramalama](RamaLama)
## [`llama-simple`](examples/simple) ## [`llama-simple`](examples/simple)

View File

@ -84,7 +84,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url # Use curl to download model url
if (LLAMA_CURL) if (LLAMA_CURL)
find_package(CURL REQUIRED) find_package(CURL REQUIRED)
add_definitions(-DLLAMA_USE_CURL) target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS}) include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED) find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})

View File

@ -145,6 +145,35 @@ static void common_params_handle_model_default(common_params & params) {
} }
} }
const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_BF16,
GGML_TYPE_Q8_0,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_IQ4_NL,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
};
static ggml_type kv_cache_type_from_str(const std::string & s) {
for (const auto & type : kv_cache_types) {
if (ggml_type_name(type) == s) {
return type;
}
}
throw std::runtime_error("Unsupported cache type: " + s);
}
static std::string get_all_kv_cache_types() {
std::ostringstream msg;
for (const auto & type : kv_cache_types) {
msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
}
return msg.str();
}
// //
// CLI argument parsing functions // CLI argument parsing functions
// //
@ -591,7 +620,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) { [](common_params & params) {
params.ctx_shift = false; params.ctx_shift = false;
} }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
add_opt(common_arg( add_opt(common_arg(
{"--chunks"}, "N", {"--chunks"}, "N",
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@ -1174,18 +1203,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
add_opt(common_arg( add_opt(common_arg(
{"-ctk", "--cache-type-k"}, "TYPE", {"-ctk", "--cache-type-k"}, "TYPE",
string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), string_format(
"KV cache data type for K\n"
"allowed values: %s\n"
"(default: %s)",
get_all_kv_cache_types().c_str(),
ggml_type_name(params.cache_type_k)
),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
// TODO: get the type right here params.cache_type_k = kv_cache_type_from_str(value);
params.cache_type_k = value;
} }
).set_env("LLAMA_ARG_CACHE_TYPE_K")); ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
add_opt(common_arg( add_opt(common_arg(
{"-ctv", "--cache-type-v"}, "TYPE", {"-ctv", "--cache-type-v"}, "TYPE",
string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), string_format(
"KV cache data type for V\n"
"allowed values: %s\n"
"(default: %s)",
get_all_kv_cache_types().c_str(),
ggml_type_name(params.cache_type_v)
),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
// TODO: get the type right here params.cache_type_v = kv_cache_type_from_str(value);
params.cache_type_v = value;
} }
).set_env("LLAMA_ARG_CACHE_TYPE_V")); ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
add_opt(common_arg( add_opt(common_arg(
@ -1711,6 +1750,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.public_path = value; params.public_path = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
add_opt(common_arg(
{"--no-webui"},
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
[](common_params & params) {
params.webui = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
add_opt(common_arg( add_opt(common_arg(
{"--embedding", "--embeddings"}, {"--embedding", "--embeddings"},
string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@ -2111,35 +2157,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) { [](common_params & params, int value) {
params.speculative.n_max = value; params.speculative.n_max = value;
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
add_opt(common_arg( add_opt(common_arg(
{"--draft-min", "--draft-n-min"}, "N", {"--draft-min", "--draft-n-min"}, "N",
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
[](common_params & params, int value) { [](common_params & params, int value) {
params.speculative.n_min = value; params.speculative.n_min = value;
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
add_opt(common_arg( add_opt(common_arg(
{"--draft-p-split"}, "P", {"--draft-p-split"}, "P",
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.speculative.p_split = std::stof(value); params.speculative.p_split = std::stof(value);
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
add_opt(common_arg( add_opt(common_arg(
{"--draft-p-min"}, "P", {"--draft-p-min"}, "P",
string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.speculative.p_min = std::stof(value); params.speculative.p_min = std::stof(value);
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
add_opt(common_arg( add_opt(common_arg(
{"-cd", "--ctx-size-draft"}, "N", {"-cd", "--ctx-size-draft"}, "N",
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
[](common_params & params, int value) { [](common_params & params, int value) {
params.speculative.n_ctx = value; params.speculative.n_ctx = value;
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
add_opt(common_arg( add_opt(common_arg(
{"-devd", "--device-draft"}, "<dev1,dev2,..>", {"-devd", "--device-draft"}, "<dev1,dev2,..>",
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@ -2159,14 +2205,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
} }
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
add_opt(common_arg( add_opt(common_arg(
{"-md", "--model-draft"}, "FNAME", {"-md", "--model-draft"}, "FNAME",
"draft model for speculative decoding (default: unused)", "draft model for speculative decoding (default: unused)",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.speculative.model = value; params.speculative.model = value;
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
return ctx_arg; return ctx_arg;
} }

View File

@ -1054,38 +1054,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
return mparams; return mparams;
} }
static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "f32") {
return GGML_TYPE_F32;
}
if (s == "f16") {
return GGML_TYPE_F16;
}
if (s == "bf16") {
return GGML_TYPE_BF16;
}
if (s == "q8_0") {
return GGML_TYPE_Q8_0;
}
if (s == "q4_0") {
return GGML_TYPE_Q4_0;
}
if (s == "q4_1") {
return GGML_TYPE_Q4_1;
}
if (s == "iq4_nl") {
return GGML_TYPE_IQ4_NL;
}
if (s == "q5_0") {
return GGML_TYPE_Q5_0;
}
if (s == "q5_1") {
return GGML_TYPE_Q5_1;
}
throw std::runtime_error("Unsupported cache type: " + s);
}
struct llama_context_params common_context_params_to_llama(const common_params & params) { struct llama_context_params common_context_params_to_llama(const common_params & params) {
auto cparams = llama_context_default_params(); auto cparams = llama_context_default_params();
@ -1120,8 +1088,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK; cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
} }
cparams.type_k = kv_cache_type_from_str(params.cache_type_k); cparams.type_k = params.cache_type_k;
cparams.type_v = kv_cache_type_from_str(params.cache_type_v); cparams.type_v = params.cache_type_v;
return cparams; return cparams;
} }
@ -1147,12 +1115,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#define CURL_MAX_RETRY 3 #define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2 #define CURL_RETRY_DELAY_SECONDS 2
static bool starts_with(const std::string & str, const std::string & prefix) {
// While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) { static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts; int remaining_attempts = max_attempts;

View File

@ -43,9 +43,9 @@ using llama_tokens = std::vector<llama_token>;
// build info // build info
extern int LLAMA_BUILD_NUMBER; extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT; extern const char * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER; extern const char * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET; extern const char * LLAMA_BUILD_TARGET;
struct common_control_vector_load_info; struct common_control_vector_load_info;
@ -293,8 +293,8 @@ struct common_params {
bool warmup = true; // warmup run bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data bool check_tensors = false; // validate tensor data
std::string cache_type_k = "f16"; // KV cache data type for the K ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
// multimodal models (see examples/llava) // multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector // NOLINT std::string mmproj = ""; // path to multimodal projector // NOLINT
@ -446,6 +446,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
return parts; return parts;
} }
static bool string_starts_with(const std::string & str,
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides); bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input); void string_process_escapes(std::string & input);

View File

@ -2001,6 +2001,29 @@ class Qwen2Model(Model):
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
@Model.register("Qwen2VLForConditionalGeneration")
class Qwen2VLModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN2VL
def set_gguf_parameters(self):
super().set_gguf_parameters()
mrope_section = self.hparams["rope_scaling"]["mrope_section"]
mrope_section += [0] * max(0, 4 - len(mrope_section))
self.gguf_writer.add_rope_dimension_sections(mrope_section)
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_gpt2()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
for name, data in super().get_tensors():
if name.startswith("visual."):
continue
yield name, data
@Model.register("Qwen2MoeForCausalLM") @Model.register("Qwen2MoeForCausalLM")
class Qwen2MoeModel(Model): class Qwen2MoeModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN2MOE model_arch = gguf.MODEL_ARCH.QWEN2MOE

View File

@ -20,7 +20,12 @@ else()
add_subdirectory(batched) add_subdirectory(batched)
add_subdirectory(embedding) add_subdirectory(embedding)
add_subdirectory(eval-callback) add_subdirectory(eval-callback)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(gbnf-validator) add_subdirectory(gbnf-validator)
endif()
add_subdirectory(gguf-hash) add_subdirectory(gguf-hash)
add_subdirectory(gguf-split) add_subdirectory(gguf-split)
add_subdirectory(gguf) add_subdirectory(gguf)
@ -46,12 +51,16 @@ else()
add_subdirectory(speculative) add_subdirectory(speculative)
add_subdirectory(speculative-simple) add_subdirectory(speculative-simple)
add_subdirectory(tokenize) add_subdirectory(tokenize)
add_subdirectory(gen-docs)
if (NOT GGML_BACKEND_DL) if (NOT GGML_BACKEND_DL)
# these examples use the backends directly and cannot be built with dynamic loading # these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(cvector-generator) add_subdirectory(cvector-generator)
add_subdirectory(export-lora) add_subdirectory(export-lora)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(quantize-stats) add_subdirectory(quantize-stats)
endif()
add_subdirectory(llava) add_subdirectory(llava)
if (GGML_RPC) if (GGML_RPC)
add_subdirectory(rpc) add_subdirectory(rpc)

View File

@ -287,7 +287,7 @@ struct split_strategy {
} }
void print_info() { void print_info() {
printf("n_split: %ld\n", ctx_outs.size()); printf("n_split: %zu\n", ctx_outs.size());
int i_split = 0; int i_split = 0;
for (auto & ctx_out : ctx_outs) { for (auto & ctx_out : ctx_outs) {
// re-calculate the real gguf size for each split (= metadata size + total size of all tensors) // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
@ -297,7 +297,7 @@ struct split_strategy {
total_size += ggml_nbytes(t); total_size += ggml_nbytes(t);
} }
total_size = total_size / 1000 / 1000; // convert to megabytes total_size = total_size / 1000 / 1000; // convert to megabytes
printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
i_split++; i_split++;
} }
} }

View File

@ -1521,7 +1521,7 @@ int main(int argc, char ** argv) {
for (const auto & inst : params_instances) { for (const auto & inst : params_instances) {
params_idx++; params_idx++;
if (params.progress) { if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count); fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
} }
// keep the same model between tests when possible // keep the same model between tests when possible
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@ -1573,14 +1573,14 @@ int main(int argc, char ** argv) {
// warmup run // warmup run
if (t.n_prompt > 0) { if (t.n_prompt > 0) {
if (params.progress) { if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count); fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
} }
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
} }
if (t.n_gen > 0) { if (t.n_gen > 0) {
if (params.progress) { if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count); fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
} }
test_gen(ctx, 1, t.n_threads); test_gen(ctx, 1, t.n_threads);
} }
@ -1592,14 +1592,14 @@ int main(int argc, char ** argv) {
if (t.n_prompt > 0) { if (t.n_prompt > 0) {
if (params.progress) { if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
i + 1, params.reps); i + 1, params.reps);
} }
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
} }
if (t.n_gen > 0) { if (t.n_gen > 0) {
if (params.progress) { if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
i + 1, params.reps); i + 1, params.reps);
} }
test_gen(ctx, t.n_gen, t.n_threads); test_gen(ctx, t.n_gen, t.n_threads);

View File

@ -43,3 +43,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17) target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TARGET llama-qwen2vl-cli)
add_executable(${TARGET} qwen2vl-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -102,7 +102,9 @@ static std::string format(const char * fmt, ...) {
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version" #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
#define KEY_USE_GELU "clip.use_gelu" #define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu"
#define KEY_N_EMBD "clip.%s.embedding_length" #define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length" #define KEY_N_FF "clip.%s.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count" #define KEY_N_BLOCK "clip.%s.block_count"
@ -129,7 +131,8 @@ static std::string format(const char * fmt, ...) {
#define TN_TOKEN_EMBD "%s.token_embd.weight" #define TN_TOKEN_EMBD "%s.token_embd.weight"
#define TN_POS_EMBD "%s.position_embd.weight" #define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd" #define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias" #define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
@ -163,6 +166,7 @@ enum projector_type {
PROJECTOR_TYPE_LDP, PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_LDPV2, PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_RESAMPLER, PROJECTOR_TYPE_RESAMPLER,
PROJECTOR_TYPE_MERGER,
PROJECTOR_TYPE_UNKNOWN, PROJECTOR_TYPE_UNKNOWN,
}; };
@ -171,6 +175,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_LDP, "ldp" }, { PROJECTOR_TYPE_LDP, "ldp" },
{ PROJECTOR_TYPE_LDPV2, "ldpv2"}, { PROJECTOR_TYPE_LDPV2, "ldpv2"},
{ PROJECTOR_TYPE_RESAMPLER, "resampler"}, { PROJECTOR_TYPE_RESAMPLER, "resampler"},
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
}; };
@ -463,7 +468,8 @@ struct clip_vision_model {
// embeddings // embeddings
struct ggml_tensor * class_embedding; struct ggml_tensor * class_embedding;
struct ggml_tensor * patch_embeddings; struct ggml_tensor * patch_embeddings_0;
struct ggml_tensor * patch_embeddings_1; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
struct ggml_tensor * patch_bias; struct ggml_tensor * patch_bias;
struct ggml_tensor * position_embeddings; struct ggml_tensor * position_embeddings;
@ -553,6 +559,7 @@ struct clip_ctx {
bool has_vision_encoder = false; bool has_vision_encoder = false;
bool has_llava_projector = false; bool has_llava_projector = false;
bool has_minicpmv_projector = false; bool has_minicpmv_projector = false;
bool has_qwen2vl_merger = false;
int minicpmv_version = 2; int minicpmv_version = 2;
struct clip_vision_model vision_model; struct clip_vision_model vision_model;
@ -561,6 +568,7 @@ struct clip_ctx {
float image_mean[3]; float image_mean[3];
float image_std[3]; float image_std[3];
bool use_gelu = false; bool use_gelu = false;
bool use_silu = false;
int32_t ftype = 1; int32_t ftype = 1;
bool has_class_embedding = true; bool has_class_embedding = true;
@ -606,14 +614,26 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
image_size_height = imgs->data->ny; image_size_height = imgs->data->ny;
} }
} }
else if (ctx->has_qwen2vl_merger) {
// use the image's native resolution when image is avaible
if (is_inf) {
// if (imgs->data->nx && imgs->data->ny) {
image_size_width = imgs->data->nx;
image_size_height = imgs->data->ny;
}
}
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int patches_w = image_size_width / patch_size;
const int patches_h = image_size_height / patch_size;
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
const int hidden_size = hparams.hidden_size; const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head; const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head; const int d_head = hidden_size / n_head;
int n_layer = hparams.n_layer; int n_layer = hparams.n_layer;
const float eps = hparams.eps; const float eps = hparams.eps;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
const int batch_size = imgs->size; const int batch_size = imgs->size;
@ -634,10 +654,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ggml_set_name(inp_raw, "inp_raw"); ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw); ggml_set_input(inp_raw);
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
if (ctx->has_qwen2vl_merger) {
GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, patches_h, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
inp = ggml_reshape_3d(
ctx0, inp,
hidden_size, patches_w * patches_h, batch_size);
}
else {
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
}
if (ctx->has_patch_bias) { if (ctx->has_patch_bias) {
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
@ -659,12 +699,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
} }
} }
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions"); ggml_set_name(positions, "positions");
ggml_set_input(positions); ggml_set_input(positions);
if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding
embeddings = embeddings =
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
}
if (ctx->has_minicpmv_projector) { if (ctx->has_minicpmv_projector) {
int pos_w = image_size_width/patch_size; int pos_w = image_size_width/patch_size;
@ -688,7 +730,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
} }
// loop over layers // loop over layers
if (ctx->has_minicpmv_projector) { if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
// TODO: figure out why we doing thing in this way ???
n_layer += 1; n_layer += 1;
} }
for (int il = 0; il < n_layer - 1; il++) { for (int il = 0; il < n_layer - 1; il++) {
@ -710,8 +753,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
struct ggml_tensor * Q = struct ggml_tensor * Q =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
if (ctx->has_qwen2vl_merger) {
Q = ggml_rope_multi(
ctx0, Q, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
}
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
@ -719,6 +767,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
if (ctx->has_qwen2vl_merger) {
K = ggml_rope_multi(
ctx0, K, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
}
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
@ -758,6 +811,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
if (ctx->use_gelu) { if (ctx->use_gelu) {
cur = ggml_gelu_inplace(ctx0, cur); cur = ggml_gelu_inplace(ctx0, cur);
} else if (ctx->use_silu) {
cur = ggml_silu_inplace(ctx0, cur);
} else { } else {
cur = ggml_gelu_quick_inplace(ctx0, cur); cur = ggml_gelu_quick_inplace(ctx0, cur);
} }
@ -769,6 +824,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
cur = ggml_add(ctx0, embeddings, cur); cur = ggml_add(ctx0, embeddings, cur);
embeddings = cur; embeddings = cur;
} }
// post-layernorm // post-layernorm
@ -1030,6 +1086,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
GGML_ASSERT(false); GGML_ASSERT(false);
} }
} }
else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// GELU activation
embeddings = ggml_gelu(ctx0, embeddings);
// Second linear layer
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
}
// build the graph // build the graph
ggml_build_forward_expand(gf, embeddings); ggml_build_forward_expand(gf, embeddings);
@ -1206,6 +1275,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx); new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
} }
idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
if (idx != -1) {
new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
}
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
GGML_ASSERT(new_clip->has_vision_encoder); GGML_ASSERT(new_clip->has_vision_encoder);
@ -1214,6 +1287,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
idx = get_key_idx(ctx, KEY_USE_GELU); idx = get_key_idx(ctx, KEY_USE_GELU);
new_clip->use_gelu = gguf_get_val_bool(ctx, idx); new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
try {
idx = get_key_idx(ctx, KEY_USE_SILU);
new_clip->use_silu = gguf_get_val_bool(ctx, idx);
} catch (std::runtime_error & /*e*/) {
new_clip->use_silu = false;
}
if (verbosity >= 1) { if (verbosity >= 1) {
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
@ -1389,11 +1469,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
try { try {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
} catch(const std::exception& /*e*/) { } catch(const std::exception& /*e*/) {
LOG_ERR("%s: failed to load vision model tensors\n", __func__); LOG_ERR("%s: failed to load vision model tensors\n", __func__);
} }
try {
vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
} catch(const std::exception& /*e*/) {
new_clip->has_qwen2vl_merger = false;
}
// LLaVA projection // LLaVA projection
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
@ -1481,6 +1566,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight")); vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias")); vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
} }
else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
}
else { else {
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@ -1519,6 +1610,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
clip_image_f32_batch batch; clip_image_f32_batch batch;
batch.size = 1; batch.size = 1;
batch.data = nullptr;
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
ggml_gallocr_reserve(new_clip->compute_alloc, gf); ggml_gallocr_reserve(new_clip->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
@ -1532,6 +1624,10 @@ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size
ctx_clip->load_image_size = load_image_size; ctx_clip->load_image_size = load_image_size;
} }
struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
return ctx_clip->load_image_size;
}
struct clip_image_size * clip_image_size_init() { struct clip_image_size * clip_image_size_init() {
struct clip_image_size * load_image_size = new struct clip_image_size(); struct clip_image_size * load_image_size = new struct clip_image_size();
load_image_size->width = 448; load_image_size->width = 448;
@ -1984,6 +2080,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
} }
return true; return true;
} }
else if (ctx->has_qwen2vl_merger) {
clip_image_u8 * resized = clip_image_u8_init();
auto patch_size = clip_patch_size(ctx) * 2;
int nx = ceil((float)img->nx / patch_size) * patch_size;
int ny = ceil((float)img->ny / patch_size) * patch_size;
bicubic_resize(*img, *resized, nx, ny);
res_imgs->data = new clip_image_f32[1];
// clip_image_f32 * res = clip_image_f32_init();
normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
// res_imgs->data[0] = *res;
res_imgs->size = 1;
// clip_image_f32_free(res);
clip_image_u8_free(resized);
return true;
}
bool pad_to_square = true; bool pad_to_square = true;
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
@ -2173,6 +2286,13 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
} }
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
clip_image_f32 img;
img.nx = img_w;
img.ny = img_h;
return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
int32_t clip_image_size(const struct clip_ctx * ctx) { int32_t clip_image_size(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.image_size; return ctx->vision_model.hparams.image_size;
} }
@ -2194,6 +2314,13 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
} }
int clip_n_patches(const struct clip_ctx * ctx) { int clip_n_patches(const struct clip_ctx * ctx) {
clip_image_f32 img;
img.nx = ctx->vision_model.hparams.image_size;
img.ny = ctx->vision_model.hparams.image_size;
return clip_n_patches_by_img(ctx, &img);
}
int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->vision_model.hparams; const auto & params = ctx->vision_model.hparams;
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@ -2207,6 +2334,11 @@ int clip_n_patches(const struct clip_ctx * ctx) {
else if (ctx->minicpmv_version == 3) { else if (ctx->minicpmv_version == 3) {
n_patches = 64; n_patches = 64;
} }
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
int patch_size = params.patch_size * 2;
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
n_patches = x_patch * y_patch;
} }
return n_patches; return n_patches;
@ -2335,7 +2467,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int image_size = hparams.image_size; const int image_size = hparams.image_size;
int image_size_width = image_size; int image_size_width = image_size;
int image_size_height = image_size; int image_size_height = image_size;
if (ctx->has_minicpmv_projector) { if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
image_size_width = imgs->data[0].nx; image_size_width = imgs->data[0].nx;
image_size_height = imgs->data[0].ny; image_size_height = imgs->data[0].ny;
} }
@ -2355,7 +2487,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
for (size_t i = 0; i < imgs->size; i++) { for (size_t i = 0; i < imgs->size; i++) {
const int nx = imgs->data[i].nx; const int nx = imgs->data[i].nx;
const int ny = imgs->data[i].ny; const int ny = imgs->data[i].ny;
if (!ctx->has_minicpmv_projector) { if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
GGML_ASSERT(nx == image_size && ny == image_size); GGML_ASSERT(nx == image_size && ny == image_size);
} }
@ -2413,9 +2545,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
for(int i=0;i<pos_w * pos_h;++i){ for(int i=0;i < pos_w * pos_h; ++i){
for(int j=0;j<embed_dim;++j){ for(int j=0; j < embed_dim; ++j){
pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j]; pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
} }
} }
@ -2435,7 +2567,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
} }
if (ctx->has_qwen2vl_merger) {
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
const int pw = image_size_width / patch_size;
const int ph = image_size_height / patch_size;
int* positions_data = (int*)malloc(ggml_nbytes(positions));
int ptr = 0;
for (int y = 0; y < ph; y+=2)
{ {
for (int x = 0; x < pw; x+=2)
{
for (int dy = 0; dy < 2; dy++) {
for (int dx = 0; dx < 2; dx++) {
positions_data[ptr] = y + dy;
positions_data[num_patches + ptr] = x + dx;
positions_data[num_patches * 2 + ptr] = y + dy;
positions_data[num_patches * 3 + ptr] = x + dx;
ptr++;
}
}
}
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
else {
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions)); int* positions_data = (int*)malloc(ggml_nbytes(positions));
@ -2444,7 +2603,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data); free(positions_data);
}
{ {
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
@ -2456,6 +2614,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
free(patches_data); free(patches_data);
} }
} }
}
if (ggml_backend_is_cpu(ctx->backend)) { if (ggml_backend_is_cpu(ctx->backend)) {
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
@ -2626,6 +2785,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return 3584; return 3584;
} }
} }
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
return ctx->vision_model.mm_1_b->ne[0];
}
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@ -2637,3 +2799,21 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
} }
return 0; return 0;
} }
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
return ctx->has_qwen2vl_merger;
}
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
clip_image_f32 clip_img;
clip_img.buf.resize(h * w * 3);
for (int i = 0; i < h*w*3; i++)
{
clip_img.buf[i] = img[i];
}
clip_img.nx = w;
clip_img.ny = h;
clip_image_encode(ctx, n_threads, &clip_img, vec);
return true;
}

View File

@ -45,6 +45,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
@ -56,10 +57,12 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_u8 * clip_image_u8_init ();
@ -86,6 +89,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -259,18 +259,24 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
if (clip_is_minicpmv(ctx_clip)) { if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
std::vector<float *> image_embd_v; std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size); image_embd_v.resize(img_res_v.size);
struct clip_image_size * load_image_size = clip_image_size_init(); struct clip_image_size * load_image_size = clip_image_size_init();
for (size_t i = 0; i < img_res_v.size; i++) { for (size_t i = 0; i < img_res_v.size; i++) {
const int64_t t_img_enc_step_start_us = ggml_time_us(); const int64_t t_img_enc_step_start_us = ggml_time_us();
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
int patch_size=14; int patch_size=14;
load_image_size->width = img_res_v.data[i].nx; load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny; load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size); clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = false; bool encoded = false;
if (clip_is_qwen2vl(ctx_clip)) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
else {
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip); int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) { if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
@ -278,6 +284,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
else if (has_minicpmv_projector == 3) { else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
} }
}
if (!encoded) { if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false; return false;
@ -290,8 +298,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
int n_img_pos_out = 0; int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) { for (size_t i = 0; i < image_embd_v.size(); i++) {
std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip)); std::memcpy(
n_img_pos_out += clip_n_patches(ctx_clip); image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
image_embd_v[i],
clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
} }
*n_img_pos = n_img_pos_out; *n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) { for (size_t i = 0; i < image_embd_v.size(); i++) {
@ -387,7 +398,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
if (clip_is_minicpmv(ctx_clip)) { if (clip_is_minicpmv(ctx_clip)) {
num_max_patches = 10; num_max_patches = 10;
} }
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model float * image_embd;
if (clip_is_qwen2vl(ctx_clip)) {
// qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
} else {
image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
}
if (!image_embd) { if (!image_embd) {
LOG_ERR("Unable to allocate memory for image embeddings\n"); LOG_ERR("Unable to allocate memory for image embeddings\n");
return false; return false;

View File

@ -0,0 +1,158 @@
import argparse
from typing import Dict
import torch
import numpy as np
from gguf import *
from transformers import (
Qwen2VLForConditionalGeneration,
Qwen2VLProcessor,
AutoProcessor,
Qwen2VLConfig
)
VISION = "clip.vision"
def k(raw_key: str, arch: str) -> str:
return raw_key.format(arch=arch)
def to_gguf_name(name: str) -> str:
og = name
name = name.replace("text_model", "t").replace("vision_model", "v")
name = name.replace("blocks", "blk").replace("embeddings.", "")
name = name.replace("attn.", "attn_")
name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
# name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
name = name.replace("merger.mlp", 'mm')
print(f"[to_gguf_name] {og} --> {name}")
return name
def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
vision_model = qwen2vl.visual
tensor_map = {}
for name, ten in vision_model.state_dict().items():
ten = ten.numpy()
if 'qkv' in name:
if ten.ndim == 2: # weight
c3, _ = ten.shape
else: # bias
c3 = ten.shape[0]
assert c3 % 3 == 0
c = c3 // 3
wq = ten[:c]
wk = ten[c: c * 2]
wv = ten[c * 2:]
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
elif 'merger' in name:
if name.endswith("ln_q.weight"):
tensor_map['v.post_ln.weight'] = ten
elif name.endswith("ln_q.bias"):
tensor_map['v.post_ln.bias'] = ten
else:
# "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
tensor_map[to_gguf_name(name)] = ten
elif 'patch_embed.proj.weight' in name:
# NOTE: split Conv3D into Conv2Ds
c1, c2, kt, kh, kw = ten.shape
assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
else:
tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
for new_name, ten in tensor_map.items():
if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
tensor_map[new_name] = ten.astype(np.float32)
else:
tensor_map[new_name] = ten.astype(dtype)
tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder
return tensor_map
def main(args):
if args.data_type == 'fp32':
dtype = torch.float32
np_dtype = np.float32
ftype = 0
elif args.data_type == 'fp16':
dtype = torch.float32
np_dtype = np.float16
ftype = 1
else:
raise ValueError()
model_name = args.model_name
print("model_name: ", model_name)
qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
model_name, torch_dtype=dtype, device_map="cpu"
)
cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
vcfg = cfg.vision_config
if os.path.isdir(model_name):
if model_name.endswith(os.sep):
model_name = model_name[:-1]
model_name = os.path.basename(model_name)
fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
fout = GGUFWriter(path=fname_out, arch="clip")
fout.add_description("image encoder for Qwen2VL")
fout.add_file_type(ftype)
fout.add_bool("clip.has_text_encoder", False)
fout.add_bool("clip.has_vision_encoder", True)
fout.add_bool("clip.has_qwen2vl_merger", True)
fout.add_string("clip.projector_type", "qwen2vl_merger")
print(cfg.vision_config)
if 'silu' in cfg.vision_config.hidden_act.lower():
fout.add_bool("clip.use_silu", True)
fout.add_bool("clip.use_gelu", False)
elif 'gelu' in cfg.vision_config.hidden_act.lower():
fout.add_bool("clip.use_silu", False)
fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
else:
raise ValueError()
tensor_map = find_vision_tensors(qwen2vl, np_dtype)
for name, data in tensor_map.items():
fout.add_tensor(name, data)
fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2)
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0) # not sure what this does, put 0 here as a placeholder
fout.add_name(model_name)
"""
HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
"""
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]
fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()
print("save model as: ", fname_out)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
args = parser.parse_args()
main(args)

View File

@ -0,0 +1,581 @@
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "ggml.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef NDEBUG
#include "ggml-alloc.h"
#include "ggml-backend.h"
#endif
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
const int patch_size = 14 * 2;
const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
auto img_tokens = image_embed->n_image_pos;
// llama_pos mrope_pos[img_tokens * 4];
std::vector<llama_pos> mrope_pos;
mrope_pos.resize(img_tokens * 4);
for (int y = 0; y < ph; y++)
{
for (int x = 0; x < pw; x++)
{
int i = y * pw + x;
mrope_pos[i] = *st_pos_id;
mrope_pos[i + img_tokens] = *st_pos_id + y;
mrope_pos[i + img_tokens * 2] = *st_pos_id + x;
mrope_pos[i + img_tokens * 3] = 0;
}
}
*st_pos_id += std::max(pw, ph);
int processed = 0;
std::vector<llama_pos> batch_mrope_pos;
batch_mrope_pos.resize(img_tokens * 4);
for (int i = 0; i < img_tokens; i += n_batch) {
int n_eval = img_tokens - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
// llama_pos batch_mrope_pos[n_eval * 4];
std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0);
memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));
llama_batch batch = {
int32_t(n_eval), // n_tokens
nullptr, // token
(image_embed->embed+i*n_embd), // embed
batch_mrope_pos.data(), // pos
nullptr, // n_seq_id
nullptr, // seq_id
nullptr, // logits
};
if (llama_decode(ctx_llama, batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
processed += n_eval;
}
return true;
}
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
int N = (int) tokens.size();
std::vector<llama_pos> pos;
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
auto batch = llama_batch_get_one(&tokens[i], n_eval);
// TODO: add mrope pos ids somewhere else
pos.resize(batch.n_tokens * 4);
std::fill(pos.begin(), pos.end(), 0);
for (int j = 0; j < batch.n_tokens * 3; j ++) {
pos[j] = *st_pos_id + (j % batch.n_tokens);
}
batch.pos = pos.data();
if (llama_decode(ctx_llama, batch)) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
*st_pos_id += n_eval;
}
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past, st_pos_id);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id);
return true;
}
static const char * sample(struct common_sampler * smpl,
struct llama_context * ctx_llama,
int * n_past, int * st_pos_id) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past, st_pos_id);
return ret.c_str();
}
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char* IMG_BASE64_TAG_END = "\">";
static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
}
static bool prompt_contains_image(const std::string& prompt) {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
return (begin != std::string::npos);
}
// replaces the base64 image tag in the prompt with `replacement`
static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL;
}
auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
auto required_bytes = base64::required_encode_size(base64_str.size());
auto img_bytes = std::vector<unsigned char>(required_bytes);
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) {
LOG_ERR("%s: could not load image from base64 string.\n", __func__);
return NULL;
}
return embed;
}
static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
if (begin == std::string::npos || end == std::string::npos) {
return prompt;
}
auto pre = prompt.substr(0, begin);
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
return pre + replacement + post;
}
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void print_usage(int, char ** argv) {
LOG("\n example usage:\n");
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
// load and preprocess the image
llava_image_embed * embed = NULL;
auto prompt = params->prompt;
if (prompt_contains_image(prompt)) {
if (!params->image.empty()) {
LOG_INF("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
LOG_ERR("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
}
}
return embed;
}
static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
int n_past = 0;
int cur_pos_id = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
std::string system_prompt, user_prompt;
size_t image_pos = prompt.find("<|vision_start|>");
if (image_pos != std::string::npos) {
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<|vision_pad|>").length());
LOG_INF("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
LOG_INF("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
} else {
// llava-1.5 native mode
system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>";
user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n";
if (params->verbose_prompt) {
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
}
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true);
if (image_embed != nullptr) {
auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip);
qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size);
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false);
// generate the response
LOG("\n");
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
if (!smpl) {
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
std::string response = "";
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
LOG("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
fflush(stdout);
}
common_sampler_free(smpl);
LOG("\n");
}
static struct llama_model * llava_init(common_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = common_model_params_to_llama(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
}
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip;
ctx_llava->model = model;
return ctx_llava;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
llama_free(ctx_llava->ctx_llama);
llama_free_model(ctx_llava->model);
llama_backend_free();
}
#ifndef NDEBUG
static void debug_test_mrope_2d() {
// 1. Initialize backend
ggml_backend_t backend = NULL;
std::string backend_name = "";
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
backend_name = "cuda";
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#endif
// if there aren't GPU Backends fallback to CPU backend
if (!backend) {
backend = ggml_backend_cpu_init();
backend_name = "cpu";
}
// Calculate the size needed to allocate
size_t ctx_size = 0;
ctx_size += 2 * ggml_tensor_overhead(); // tensors
// no need to allocate anything else!
// 2. Allocate `ggml_context` to store tensor data
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
};
struct ggml_context * ctx = ggml_init(params);
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4);
ggml_set_name(pos, "pos");
ggml_set_input(pos);
std::vector<float> dummy_q;
dummy_q.resize(128 * 12 * 30);
std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
// memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
std::vector<int> pos_id;
pos_id.resize(30 * 4);
for (int i = 0; i < 30; i ++) {
pos_id[i] = i;
pos_id[i + 30] = i + 10;
pos_id[i + 60] = i + 20;
pos_id[i + 90] = i + 30;
}
int sections[4] = {32, 32, 0, 0};
// 4. Allocate a `ggml_backend_buffer` to store all tensors
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
// 5. Copy tensor data from main memory (RAM) to backend buffer
ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
// 6. Create a `ggml_cgraph` for mul_mat operation
struct ggml_cgraph * gf = NULL;
struct ggml_context * ctx_cgraph = NULL;
// create a temporally context to build the graph
struct ggml_init_params params0 = {
/*.mem_size =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
ctx_cgraph = ggml_init(params0);
gf = ggml_new_graph(ctx_cgraph);
struct ggml_tensor * result0 = ggml_rope_multi(
ctx_cgraph, inp_raw, pos, nullptr,
128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1,
0, 1, 32, 1);
// Add "result" tensor and all of its dependencies to the cgraph
ggml_build_forward_expand(gf, result0);
// 7. Create a `ggml_gallocr` for cgraph computation
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
ggml_gallocr_alloc_graph(allocr, gf);
// 9. Run the computation
int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
if (ggml_backend_is_cpu(backend)) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}
ggml_backend_graph_compute(backend, gf);
// 10. Retrieve results (output tensors)
// in this example, output tensor is always the last tensor in the graph
struct ggml_tensor * result = result0;
// struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
float * result_data = (float *)malloc(ggml_nbytes(result));
// because the tensor data is stored in device buffer, we need to copy it back to RAM
ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
const std::string bin_file = "mrope_2d_" + backend_name +".bin";
std::ofstream outFile(bin_file, std::ios::binary);
if (outFile.is_open()) {
outFile.write(reinterpret_cast<const char*>(result_data), ggml_nbytes(result));
outFile.close();
std::cout << "Data successfully written to " + bin_file << std::endl;
} else {
std::cerr << "Error opening file!" << std::endl;
}
free(result_data);
// 11. Free memory and exit
ggml_free(ctx_cgraph);
ggml_gallocr_free(allocr);
ggml_free(ctx);
ggml_backend_buffer_free(buffer);
ggml_backend_free(backend);
}
static void debug_dump_img_embed(struct llava_context * ctx_llava) {
int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
int ne = n_embd * 4;
float vals[56 * 56 * 3];
// float embd[ne];
std::vector<float> embd;
embd.resize(ne);
for (int i = 0; i < 56*56; i++)
{
for (int c = 0; c < 3; c++)
vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
}
clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());
std::ofstream outFile("img_embed.bin", std::ios::binary);
if (outFile.is_open()) {
outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
outFile.close();
std::cout << "Data successfully written to mrope.bin" << std::endl;
} else {
std::cerr << "Error opening file!" << std::endl;
}
}
#endif
int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
return 1;
}
common_init();
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}
auto * model = llava_init(&params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
return 1;
}
if (prompt_contains_image(params.prompt)) {
auto * ctx_llava = llava_init_context(&params, model);
auto * image_embed = load_image(ctx_llava, &params, "");
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_perf_context_print(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
#ifndef NDEBUG
} else if (params.image[0].empty()) {
auto ctx_llava = llava_init_context(&params, model);
debug_test_mrope_2d();
debug_dump_img_embed(ctx_llava);
llama_perf_context_print(ctx_llava->ctx_llama);
ctx_llava->model = NULL;
llava_free(ctx_llava);
#endif
} else {
for (auto & image : params.image) {
auto * ctx_llava = llava_init_context(&params, model);
auto * image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) {
LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
return 1;
}
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_perf_context_print(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
}
llama_free_model(model);
return 0;
}

View File

@ -81,7 +81,7 @@ Several quantization methods are supported. They differ in the resulting model d
- [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
- [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
- [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
- [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996) - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
- [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
- [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
- [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)

View File

@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator); std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
} }
LOG_INF("Number of chunks: %ld\n", chunks.size()); LOG_INF("Number of chunks: %zu\n", chunks.size());
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);

View File

@ -1,5 +1,5 @@
set(TARGET llama-run) set(TARGET llama-run)
add_executable(${TARGET} run.cpp) add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -3,5 +3,45 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models. The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
```bash ```bash
./llama-run Meta-Llama-3.1-8B-Instruct.gguf llama-run granite-code
...
```bash
llama-run -h
Description:
Runs a llm
Usage:
llama-run [options] model [prompt]
Options:
-c, --context-size <value>
Context size (default: 2048)
-n, --ngl <value>
Number of GPU layers (default: 0)
-h, --help
Show help message
Commands:
model
Model is a string with an optional prefix of
huggingface:// (hf://), ollama://, https:// or file://.
If no protocol is specified and a file exists in the specified
path, file:// is assumed, otherwise if a file does not exist in
the specified path, ollama:// is assumed. Models that are being
pulled are downloaded with .partial extension while being
downloaded and then renamed as the file without the .partial
extension when complete.
Examples:
llama-run llama3
llama-run ollama://granite-code
llama-run ollama://smollm:135m
llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
llama-run https://example.com/some-file1.gguf
llama-run some-file2.gguf
llama-run file://some-file3.gguf
llama-run --ngl 99 some-file4.gguf
llama-run --ngl 99 some-file5.gguf Hello World
... ...

View File

@ -1,113 +1,333 @@
#if defined(_WIN32) #if defined(_WIN32)
#include <windows.h> # include <windows.h>
#else #else
#include <unistd.h> # include <unistd.h>
#endif #endif
#include <climits> #if defined(LLAMA_USE_CURL)
# include <curl/curl.h>
#endif
#include <cstdarg>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <filesystem>
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "common.h"
#include "json.hpp"
#include "llama-cpp.h" #include "llama-cpp.h"
typedef std::unique_ptr<char[]> char_array_ptr; #define printe(...) \
do { \
fprintf(stderr, __VA_ARGS__); \
} while (0)
struct Argument { class Opt {
std::string flag;
std::string help_text;
};
struct Options {
std::string model_path, prompt_non_interactive;
int ngl = 99;
int n_ctx = 2048;
};
class ArgumentParser {
public: public:
ArgumentParser(const char * program_name) : program_name(program_name) {} int init(int argc, const char ** argv) {
construct_help_str_();
void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") { // Parse arguments
string_args[flag] = &var; if (parse(argc, argv)) {
arguments.push_back({flag, help_text}); printe("Error: Failed to parse arguments.\n");
help();
return 1;
} }
void add_argument(const std::string & flag, int & var, const std::string & help_text = "") { // If help is requested, show help and exit
int_args[flag] = &var; if (help_) {
arguments.push_back({flag, help_text}); help();
return 2;
}
return 0; // Success
}
std::string model_;
std::string user_;
int context_size_ = 2048, ngl_ = -1;
private:
std::string help_str_;
bool help_ = false;
void construct_help_str_() {
help_str_ =
"Description:\n"
" Runs a llm\n"
"\n"
"Usage:\n"
" llama-run [options] model [prompt]\n"
"\n"
"Options:\n"
" -c, --context-size <value>\n"
" Context size (default: " +
std::to_string(context_size_);
help_str_ +=
")\n"
" -n, --ngl <value>\n"
" Number of GPU layers (default: " +
std::to_string(ngl_);
help_str_ +=
")\n"
" -h, --help\n"
" Show help message\n"
"\n"
"Commands:\n"
" model\n"
" Model is a string with an optional prefix of \n"
" huggingface:// (hf://), ollama://, https:// or file://.\n"
" If no protocol is specified and a file exists in the specified\n"
" path, file:// is assumed, otherwise if a file does not exist in\n"
" the specified path, ollama:// is assumed. Models that are being\n"
" pulled are downloaded with .partial extension while being\n"
" downloaded and then renamed as the file without the .partial\n"
" extension when complete.\n"
"\n"
"Examples:\n"
" llama-run llama3\n"
" llama-run ollama://granite-code\n"
" llama-run ollama://smollm:135m\n"
" llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n"
" llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n"
" llama-run https://example.com/some-file1.gguf\n"
" llama-run some-file2.gguf\n"
" llama-run file://some-file3.gguf\n"
" llama-run --ngl 99 some-file4.gguf\n"
" llama-run --ngl 99 some-file5.gguf Hello World\n";
} }
int parse(int argc, const char ** argv) { int parse(int argc, const char ** argv) {
int positional_args_i = 0;
for (int i = 1; i < argc; ++i) { for (int i = 1; i < argc; ++i) {
std::string arg = argv[i]; if (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0) {
if (string_args.count(arg)) { if (i + 1 >= argc) {
if (i + 1 < argc) {
*string_args[arg] = argv[++i];
} else {
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
print_usage();
return 1; return 1;
} }
} else if (int_args.count(arg)) {
if (i + 1 < argc) { context_size_ = std::atoi(argv[++i]);
if (parse_int_arg(argv[++i], *int_args[arg]) != 0) { } else if (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0) {
fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]); if (i + 1 >= argc) {
print_usage();
return 1; return 1;
} }
ngl_ = std::atoi(argv[++i]);
} else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
help_ = true;
return 0;
} else if (!positional_args_i) {
++positional_args_i;
model_ = argv[i];
} else if (positional_args_i == 1) {
++positional_args_i;
user_ = argv[i];
} else { } else {
fprintf(stderr, "error: missing value for %s\n", arg.c_str()); user_ += " " + std::string(argv[i]);
print_usage();
return 1;
}
} else {
fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
print_usage();
return 1;
} }
} }
if (string_args["-m"]->empty()) { return model_.empty(); // model_ is the only required value
fprintf(stderr, "error: -m is required\n"); }
print_usage();
void help() const { printf("%s", help_str_.c_str()); }
};
struct progress_data {
size_t file_size = 0;
std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
bool printed = false;
};
struct FileDeleter {
void operator()(FILE * file) const {
if (file) {
fclose(file);
}
}
};
typedef std::unique_ptr<FILE, FileDeleter> FILE_ptr;
#ifdef LLAMA_USE_CURL
class CurlWrapper {
public:
int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
const bool progress, std::string * response_str = nullptr) {
std::string output_file_partial;
curl = curl_easy_init();
if (!curl) {
return 1; return 1;
} }
progress_data data;
FILE_ptr out;
if (!output_file.empty()) {
output_file_partial = output_file + ".partial";
out.reset(fopen(output_file_partial.c_str(), "ab"));
}
set_write_options(response_str, out);
data.file_size = set_resume_point(output_file_partial);
set_progress_options(progress, data);
set_headers(headers);
perform(url);
if (!output_file.empty()) {
std::filesystem::rename(output_file_partial, output_file);
}
return 0; return 0;
} }
~CurlWrapper() {
if (chunk) {
curl_slist_free_all(chunk);
}
if (curl) {
curl_easy_cleanup(curl);
}
}
private: private:
const char * program_name; CURL * curl = nullptr;
std::unordered_map<std::string, std::string *> string_args; struct curl_slist * chunk = nullptr;
std::unordered_map<std::string, int *> int_args;
std::vector<Argument> arguments;
int parse_int_arg(const char * arg, int & value) { void set_write_options(std::string * response_str, const FILE_ptr & out) {
char * end; if (response_str) {
const long val = std::strtol(arg, &end, 10); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data);
if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) { curl_easy_setopt(curl, CURLOPT_WRITEDATA, response_str);
value = static_cast<int>(val); } else {
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.get());
}
}
size_t set_resume_point(const std::string & output_file) {
size_t file_size = 0;
if (std::filesystem::exists(output_file)) {
file_size = std::filesystem::file_size(output_file);
curl_easy_setopt(curl, CURLOPT_RESUME_FROM_LARGE, static_cast<curl_off_t>(file_size));
}
return file_size;
}
void set_progress_options(bool progress, progress_data & data) {
if (progress) {
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data);
curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback);
}
}
void set_headers(const std::vector<std::string> & headers) {
if (!headers.empty()) {
if (chunk) {
curl_slist_free_all(chunk);
chunk = 0;
}
for (const auto & header : headers) {
chunk = curl_slist_append(chunk, header.c_str());
}
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);
}
}
void perform(const std::string & url) {
CURLcode res;
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
res = curl_easy_perform(curl);
if (res != CURLE_OK) {
printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
}
}
static std::string human_readable_time(double seconds) {
int hrs = static_cast<int>(seconds) / 3600;
int mins = (static_cast<int>(seconds) % 3600) / 60;
int secs = static_cast<int>(seconds) % 60;
std::ostringstream out;
if (hrs > 0) {
out << hrs << "h " << std::setw(2) << std::setfill('0') << mins << "m " << std::setw(2) << std::setfill('0')
<< secs << "s";
} else if (mins > 0) {
out << mins << "m " << std::setw(2) << std::setfill('0') << secs << "s";
} else {
out << secs << "s";
}
return out.str();
}
static std::string human_readable_size(curl_off_t size) {
static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" };
char length = sizeof(suffix) / sizeof(suffix[0]);
int i = 0;
double dbl_size = size;
if (size > 1024) {
for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) {
dbl_size = size / 1024.0;
}
}
std::ostringstream out;
out << std::fixed << std::setprecision(2) << dbl_size << " " << suffix[i];
return out.str();
}
static int progress_callback(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
curl_off_t) {
progress_data * data = static_cast<progress_data *>(ptr);
if (total_to_download <= 0) {
return 0; return 0;
} }
return 1;
total_to_download += data->file_size;
const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size;
const curl_off_t percentage = (now_downloaded_plus_file_size * 100) / total_to_download;
const curl_off_t pos = (percentage / 5);
std::string progress_bar;
for (int i = 0; i < 20; ++i) {
progress_bar.append((i < pos) ? "" : " ");
} }
void print_usage() const { // Calculate download speed and estimated time to completion
printf("\nUsage:\n"); const auto now = std::chrono::steady_clock::now();
printf(" %s [OPTIONS]\n\n", program_name); const std::chrono::duration<double> elapsed_seconds = now - data->start_time;
printf("Options:\n"); const double speed = now_downloaded / elapsed_seconds.count();
for (const auto & arg : arguments) { const double estimated_time = (total_to_download - now_downloaded) / speed;
printf(" %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str()); printe("\r%ld%% |%s| %s/%s %.2f MB/s %s ", percentage, progress_bar.c_str(),
human_readable_size(now_downloaded).c_str(), human_readable_size(total_to_download).c_str(),
speed / (1024 * 1024), human_readable_time(estimated_time).c_str());
fflush(stderr);
data->printed = true;
return 0;
} }
printf("\n"); // Function to write data to a file
static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
FILE * out = static_cast<FILE *>(stream);
return fwrite(ptr, size, nmemb, out);
}
// Function to capture data into a string
static size_t capture_data(void * ptr, size_t size, size_t nmemb, void * stream) {
std::string * str = static_cast<std::string *>(stream);
str->append(static_cast<char *>(ptr), size * nmemb);
return size * nmemb;
} }
}; };
#endif
class LlamaData { class LlamaData {
public: public:
@ -115,14 +335,16 @@ class LlamaData {
llama_sampler_ptr sampler; llama_sampler_ptr sampler;
llama_context_ptr context; llama_context_ptr context;
std::vector<llama_chat_message> messages; std::vector<llama_chat_message> messages;
std::vector<std::string> msg_strs;
std::vector<char> fmtted;
int init(const Options & opt) { int init(Opt & opt) {
model = initialize_model(opt.model_path, opt.ngl); model = initialize_model(opt);
if (!model) { if (!model) {
return 1; return 1;
} }
context = initialize_context(model, opt.n_ctx); context = initialize_context(model, opt.context_size_);
if (!context) { if (!context) {
return 1; return 1;
} }
@ -132,14 +354,122 @@ class LlamaData {
} }
private: private:
// Initializes the model and returns a unique pointer to it #ifdef LLAMA_USE_CURL
llama_model_ptr initialize_model(const std::string & model_path, const int ngl) { int download(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
llama_model_params model_params = llama_model_default_params(); const bool progress, std::string * response_str = nullptr) {
model_params.n_gpu_layers = ngl; CurlWrapper curl;
if (curl.init(url, headers, output_file, progress, response_str)) {
return 1;
}
llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params)); return 0;
}
#else
int download(const std::string &, const std::vector<std::string> &, const std::string &, const bool,
std::string * = nullptr) {
printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return 1;
}
#endif
int huggingface_dl(const std::string & model, const std::vector<std::string> headers, const std::string & bn) {
// Find the second occurrence of '/' after protocol string
size_t pos = model.find('/');
pos = model.find('/', pos + 1);
if (pos == std::string::npos) {
return 1;
}
const std::string hfr = model.substr(0, pos);
const std::string hff = model.substr(pos + 1);
const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
return download(url, headers, bn, true);
}
int ollama_dl(std::string & model, const std::vector<std::string> headers, const std::string & bn) {
if (model.find('/') == std::string::npos) {
model = "library/" + model;
}
std::string model_tag = "latest";
size_t colon_pos = model.find(':');
if (colon_pos != std::string::npos) {
model_tag = model.substr(colon_pos + 1);
model = model.substr(0, colon_pos);
}
std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag;
std::string manifest_str;
const int ret = download(manifest_url, headers, "", false, &manifest_str);
if (ret) {
return ret;
}
nlohmann::json manifest = nlohmann::json::parse(manifest_str);
std::string layer;
for (const auto & l : manifest["layers"]) {
if (l["mediaType"] == "application/vnd.ollama.image.model") {
layer = l["digest"];
break;
}
}
std::string blob_url = "https://registry.ollama.ai/v2/" + model + "/blobs/" + layer;
return download(blob_url, headers, bn, true);
}
std::string basename(const std::string & path) {
const size_t pos = path.find_last_of("/\\");
if (pos == std::string::npos) {
return path;
}
return path.substr(pos + 1);
}
int remove_proto(std::string & model_) {
const std::string::size_type pos = model_.find("://");
if (pos == std::string::npos) {
return 1;
}
model_ = model_.substr(pos + 3); // Skip past "://"
return 0;
}
int resolve_model(std::string & model_) {
const std::string bn = basename(model_);
const std::vector<std::string> headers = { "--header",
"Accept: application/vnd.docker.distribution.manifest.v2+json" };
int ret = 0;
if (string_starts_with(model_, "file://") || std::filesystem::exists(bn)) {
remove_proto(model_);
} else if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
remove_proto(model_);
ret = huggingface_dl(model_, headers, bn);
} else if (string_starts_with(model_, "ollama://")) {
remove_proto(model_);
ret = ollama_dl(model_, headers, bn);
} else if (string_starts_with(model_, "https://")) {
download(model_, headers, bn, true);
} else {
ret = ollama_dl(model_, headers, bn);
}
model_ = bn;
return ret;
}
// Initializes the model and returns a unique pointer to it
llama_model_ptr initialize_model(Opt & opt) {
ggml_backend_load_all();
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers;
resolve_model(opt.model_);
llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params));
if (!model) { if (!model) {
fprintf(stderr, "%s: error: unable to load model\n", __func__); printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
} }
return model; return model;
@ -150,10 +480,9 @@ class LlamaData {
llama_context_params ctx_params = llama_context_default_params(); llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = n_ctx; ctx_params.n_ctx = n_ctx;
ctx_params.n_batch = n_ctx; ctx_params.n_batch = n_ctx;
llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params)); llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
if (!context) { if (!context) {
fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__); printe("%s: error: failed to create the llama_context\n", __func__);
} }
return context; return context;
@ -170,23 +499,22 @@ class LlamaData {
} }
}; };
// Add a message to `messages` and store its content in `owned_content` // Add a message to `messages` and store its content in `msg_strs`
static void add_message(const char * role, const std::string & text, LlamaData & llama_data, static void add_message(const char * role, const std::string & text, LlamaData & llama_data) {
std::vector<char_array_ptr> & owned_content) { llama_data.msg_strs.push_back(std::move(text));
char_array_ptr content(new char[text.size() + 1]); llama_data.messages.push_back({ role, llama_data.msg_strs.back().c_str() });
std::strcpy(content.get(), text.c_str());
llama_data.messages.push_back({role, content.get()});
owned_content.push_back(std::move(content));
} }
// Function to apply the chat template and resize `formatted` if needed // Function to apply the chat template and resize `formatted` if needed
static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) { static int apply_chat_template(LlamaData & llama_data, const bool append) {
int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), int result = llama_chat_apply_template(
llama_data.messages.size(), append, formatted.data(), formatted.size()); llama_data.model.get(), nullptr, llama_data.messages.data(), llama_data.messages.size(), append,
if (result > static_cast<int>(formatted.size())) { append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
formatted.resize(result); if (append && result > static_cast<int>(llama_data.fmtted.size())) {
llama_data.fmtted.resize(result);
result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
llama_data.messages.size(), append, formatted.data(), formatted.size()); llama_data.messages.size(), append, llama_data.fmtted.data(),
llama_data.fmtted.size());
} }
return result; return result;
@ -199,7 +527,8 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr
prompt_tokens.resize(n_prompt_tokens); prompt_tokens.resize(n_prompt_tokens);
if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
true) < 0) { true) < 0) {
GGML_ABORT("failed to tokenize the prompt\n"); printe("failed to tokenize the prompt\n");
return -1;
} }
return n_prompt_tokens; return n_prompt_tokens;
@ -211,7 +540,7 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch &
const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get()); const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
if (n_ctx_used + batch.n_tokens > n_ctx) { if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n"); printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n"); printe("context size exceeded\n");
return 1; return 1;
} }
@ -223,7 +552,8 @@ static int convert_token_to_string(const llama_model_ptr & model, const llama_to
char buf[256]; char buf[256];
int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true); int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
if (n < 0) { if (n < 0) {
GGML_ABORT("failed to convert token to piece\n"); printe("failed to convert token to piece\n");
return 1;
} }
piece = std::string(buf, n); piece = std::string(buf, n);
@ -238,19 +568,19 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st
// helper function to evaluate a prompt and generate a response // helper function to evaluate a prompt and generate a response
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) { static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
std::vector<llama_token> prompt_tokens; std::vector<llama_token> tokens;
const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens); if (tokenize_prompt(llama_data.model, prompt, tokens) < 0) {
if (n_prompt_tokens < 0) {
return 1; return 1;
} }
// prepare a batch for the prompt // prepare a batch for the prompt
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
llama_token new_token_id; llama_token new_token_id;
while (true) { while (true) {
check_context_size(llama_data.context, batch); check_context_size(llama_data.context, batch);
if (llama_decode(llama_data.context.get(), batch)) { if (llama_decode(llama_data.context.get(), batch)) {
GGML_ABORT("failed to decode\n"); printe("failed to decode\n");
return 1;
} }
// sample the next token, check is it an end of generation? // sample the next token, check is it an end of generation?
@ -273,22 +603,9 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
return 0; return 0;
} }
static int parse_arguments(const int argc, const char ** argv, Options & opt) {
ArgumentParser parser(argv[0]);
parser.add_argument("-m", opt.model_path, "model");
parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
parser.add_argument("-c", opt.n_ctx, "context_size");
parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
if (parser.parse(argc, argv)) {
return 1;
}
return 0;
}
static int read_user_input(std::string & user) { static int read_user_input(std::string & user) {
std::getline(std::cin, user); std::getline(std::cin, user);
return user.empty(); // Indicate an error or empty input return user.empty(); // Should have data in happy path
} }
// Function to generate a response based on the prompt // Function to generate a response based on the prompt
@ -296,7 +613,7 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
// Set response color // Set response color
printf("\033[33m"); printf("\033[33m");
if (generate(llama_data, prompt, response)) { if (generate(llama_data, prompt, response)) {
fprintf(stderr, "failed to generate response\n"); printe("failed to generate response\n");
return 1; return 1;
} }
@ -306,11 +623,10 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
} }
// Helper function to apply the chat template and handle errors // Helper function to apply the chat template and handle errors
static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted, static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) {
const bool is_user_input, int & output_length) { const int new_len = apply_chat_template(llama_data, append);
const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
if (new_len < 0) { if (new_len < 0) {
fprintf(stderr, "failed to apply the chat template\n"); printe("failed to apply the chat template\n");
return -1; return -1;
} }
@ -319,49 +635,56 @@ static int apply_chat_template_with_error_handling(const LlamaData & llama_data,
} }
// Helper function to handle user input // Helper function to handle user input
static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) { static int handle_user_input(std::string & user_input, const std::string & user_) {
if (!prompt_non_interactive.empty()) { if (!user_.empty()) {
user_input = prompt_non_interactive; user_input = user_;
return true; // No need for interactive input return 0; // No need for interactive input
} }
printf("\033[32m> \033[0m"); printf(
return !read_user_input(user_input); // Returns false if input ends the loop "\r "
"\r\033[32m> \033[0m");
return read_user_input(user_input); // Returns true if input ends the loop
} }
// Function to tokenize the prompt // Function to tokenize the prompt
static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) { static int chat_loop(LlamaData & llama_data, const std::string & user_) {
std::vector<char_array_ptr> owned_content;
std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
int prev_len = 0; int prev_len = 0;
llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
while (true) { while (true) {
// Get user input // Get user input
std::string user_input; std::string user_input;
if (!handle_user_input(user_input, prompt_non_interactive)) { while (handle_user_input(user_input, user_)) {
break;
} }
add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data, add_message("user", user_.empty() ? user_input : user_, llama_data);
owned_content);
int new_len; int new_len;
if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) { if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
return 1; return 1;
} }
std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len); std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len);
std::string response; std::string response;
if (generate_response(llama_data, prompt, response)) { if (generate_response(llama_data, prompt, response)) {
return 1; return 1;
} }
if (!user_.empty()) {
break;
} }
add_message("assistant", response, llama_data);
if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) {
return 1;
}
}
return 0; return 0;
} }
static void log_callback(const enum ggml_log_level level, const char * text, void *) { static void log_callback(const enum ggml_log_level level, const char * text, void *) {
if (level == GGML_LOG_LEVEL_ERROR) { if (level == GGML_LOG_LEVEL_ERROR) {
fprintf(stderr, "%s", text); printe("%s", text);
} }
} }
@ -382,17 +705,20 @@ static std::string read_pipe_data() {
} }
int main(int argc, const char ** argv) { int main(int argc, const char ** argv) {
Options opt; Opt opt;
if (parse_arguments(argc, argv, opt)) { const int ret = opt.init(argc, argv);
if (ret == 2) {
return 0;
} else if (ret) {
return 1; return 1;
} }
if (!is_stdin_a_terminal()) { if (!is_stdin_a_terminal()) {
if (!opt.prompt_non_interactive.empty()) { if (!opt.user_.empty()) {
opt.prompt_non_interactive += "\n\n"; opt.user_ += "\n\n";
} }
opt.prompt_non_interactive += read_pipe_data(); opt.user_ += read_pipe_data();
} }
llama_log_set(log_callback, nullptr); llama_log_set(log_callback, nullptr);
@ -401,7 +727,7 @@ int main(int argc, const char ** argv) {
return 1; return 1;
} }
if (chat_loop(llama_data, opt.prompt_non_interactive)) { if (chat_loop(llama_data, opt.user_)) {
return 1; return 1;
} }

View File

@ -62,8 +62,8 @@ The project is under active development, and we are [looking for feedback and co
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) | | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-dkvc, --dump-kv-cache` | verbose print of the KV cache | | `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) | | `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
@ -138,6 +138,7 @@ The project is under active development, and we are [looking for feedback and co
| -------- | ----------- | | -------- | ----------- |
| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | | `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
| `-sp, --special` | special tokens output enabled (default: false) | | `-sp, --special` | special tokens output enabled (default: false) |
| `--no-warmup` | skip warming up the model with an empty run |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) | | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) | | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
@ -146,6 +147,7 @@ The project is under active development, and we are [looking for feedback and co
| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) | | `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) | | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) | | `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) | | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) | | `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
@ -163,13 +165,13 @@ The project is under active development, and we are [looking for feedback and co
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) | | `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) | | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) | | `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) | | `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices | | `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model | | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) | | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
@ -302,23 +304,23 @@ mkdir llama-client
cd llama-client cd llama-client
``` ```
Create a index.js file and put this inside: Create an index.js file and put this inside:
```javascript ```javascript
const prompt = `Building a website can be done in 10 simple steps:`; const prompt = "Building a website can be done in 10 simple steps:"
async function Test() { async function test() {
let response = await fetch("http://127.0.0.1:8080/completion", { let response = await fetch("http://127.0.0.1:8080/completion", {
method: 'POST', method: "POST",
body: JSON.stringify({ body: JSON.stringify({
prompt, prompt,
n_predict: 512, n_predict: 64,
}) })
}) })
console.log((await response.json()).content) console.log((await response.json()).content)
} }
Test() test()
``` ```
And run it: And run it:
@ -380,7 +382,7 @@ Multiple prompts are also supported. In this case, the completion result will be
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token. `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. `stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`.
`stop`: Specify a JSON array of stopping strings. `stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]` These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
@ -441,11 +443,11 @@ These words will not be included in the completion, so make sure to add them to
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
**Response format** **Response format**
- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. - Note: In streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure: - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:

File diff suppressed because one or more lines are too long

View File

@ -1145,9 +1145,9 @@ struct server_slot {
SLT_INF(*this, SLT_INF(*this,
"\n" "\n"
"\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
"\r eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
"\r total time = %10.2f ms / %5d tokens\n", " total time = %10.2f ms / %5d tokens\n",
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
t_token_generation, n_decoded, t_gen, n_gen_second, t_token_generation, n_decoded, t_gen, n_gen_second,
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
@ -4000,6 +4000,9 @@ int main(int argc, char ** argv) {
// Router // Router
// //
if (!params.webui) {
LOG_INF("Web UI is disabled\n");
} else {
// register static assets routes // register static assets routes
if (!params.public_path.empty()) { if (!params.public_path.empty()) {
// Set the base directory for serving static files // Set the base directory for serving static files
@ -4015,6 +4018,7 @@ int main(int argc, char ** argv) {
return false; return false;
}); });
} }
}
// register API routes // register API routes
svr->Get ("/health", handle_health); // public endpoint (no API key check) svr->Get ("/health", handle_health); // public endpoint (no API key check)

View File

@ -1,4 +1,5 @@
import pytest import pytest
import requests
from utils import * from utils import *
server = ServerPreset.tinyllama2() server = ServerPreset.tinyllama2()
@ -76,3 +77,20 @@ def test_load_split_model():
}) })
assert res.status_code == 200 assert res.status_code == 200
assert match_regex("(little|girl)+", res.body["content"]) assert match_regex("(little|girl)+", res.body["content"])
def test_no_webui():
global server
# default: webui enabled
server.start()
url = f"http://{server.server_host}:{server.server_port}"
res = requests.get(url)
assert res.status_code == 200
assert "<html>" in res.text
server.stop()
# with --no-webui
server.no_webui = True
server.start()
res = requests.get(url)
assert res.status_code == 404

View File

@ -74,6 +74,7 @@ class ServerProcess:
disable_ctx_shift: int | None = False disable_ctx_shift: int | None = False
draft_min: int | None = None draft_min: int | None = None
draft_max: int | None = None draft_max: int | None = None
no_webui: bool | None = None
# session variables # session variables
process: subprocess.Popen | None = None process: subprocess.Popen | None = None
@ -164,6 +165,8 @@ class ServerProcess:
server_args.extend(["--draft-max", self.draft_max]) server_args.extend(["--draft-max", self.draft_max])
if self.draft_min: if self.draft_min:
server_args.extend(["--draft-min", self.draft_min]) server_args.extend(["--draft-min", self.draft_min])
if self.no_webui:
server_args.append("--no-webui")
args = [str(arg) for arg in [server_path, *server_args]] args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}") print(f"bench: starting server with: {' '.join(args)}")

View File

@ -15,7 +15,7 @@
<!-- sidebar --> <!-- sidebar -->
<div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64"> <div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
<label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label> <label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
<div class="flex flex-col bg-base-200 min-h-full max-w-[calc(100vw-2em)] py-4 px-4"> <div class="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
<div class="flex flex-row items-center justify-between mb-4 mt-4"> <div class="flex flex-row items-center justify-between mb-4 mt-4">
<h2 class="font-bold ml-4">Conversations</h2> <h2 class="font-bold ml-4">Conversations</h2>
@ -120,51 +120,25 @@
{{ messages.length === 0 ? 'Send a message to start' : '' }} {{ messages.length === 0 ? 'Send a message to start' : '' }}
</div> </div>
<div v-for="msg in messages" class="group"> <div v-for="msg in messages" class="group">
<div :class="{ <message-bubble
'chat': true, :config="config"
'chat-start': msg.role !== 'user', :msg="msg"
'chat-end': msg.role === 'user', :key="msg.id"
}"> :is-generating="isGenerating"
<div :class="{ :edit-user-msg-and-regenerate="editUserMsgAndRegenerate"
'chat-bubble markdown': true, :regenerate-msg="regenerateMsg"></message-bubble>
'chat-bubble-base-300': msg.role !== 'user',
}">
<!-- textarea for editing message -->
<template v-if="editingMsg && editingMsg.id === msg.id">
<textarea
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
v-model="msg.content"></textarea>
<br/>
<button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
<button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
</template>
<!-- render message as markdown -->
<vue-markdown v-else :source="msg.content" />
</div>
</div>
<!-- actions for each message -->
<div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
<!-- user message -->
<button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingMsg = msg" :disabled="isGenerating">
✍️ Edit
</button>
<!-- assistant message -->
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
🔄 Regenerate
</button>
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
📋 Copy
</button>
</div>
</div> </div>
<!-- pending (ongoing) assistant message --> <!-- pending (ongoing) assistant message -->
<div id="pending-msg" class="chat chat-start"> <div id="pending-msg" class="group">
<div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300"> <message-bubble
<span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span> v-if="pendingMsg"
<vue-markdown v-else :source="pendingMsg.content" /> :config="config"
</div> :msg="pendingMsg"
:key="pendingMsg.id"
:is-generating="isGenerating"
:edit-user-msg-and-regenerate="() => {}"
:regenerate-msg="() => {}"></message-bubble>
</div> </div>
</div> </div>
@ -227,6 +201,10 @@
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible"> <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Advanced config</summary> <summary class="collapse-title font-bold">Advanced config</summary>
<div class="collapse-content"> <div class="collapse-content">
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.showTokensPerSecond" />
<span class="ml-4">Show tokens per second</span>
</div>
<label class="form-control mb-2"> <label class="form-control mb-2">
<!-- Custom parameters input --> <!-- Custom parameters input -->
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div> <div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
@ -247,6 +225,66 @@
</div> </div>
<!-- Template to be used as message bubble -->
<template id="message-bubble">
<div :class="{
'chat': true,
'chat-start': msg.role !== 'user',
'chat-end': msg.role === 'user',
}">
<div :class="{
'chat-bubble markdown': true,
'chat-bubble-base-300': msg.role !== 'user',
}">
<!-- textarea for editing message -->
<template v-if="editingContent !== null">
<textarea
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
v-model="editingContent"></textarea>
<br/>
<button class="btn btn-ghost mt-2 mr-2" @click="editingContent = null">Cancel</button>
<button class="btn mt-2" @click="editMsg()">Submit</button>
</template>
<template v-else>
<!-- show loading dots for pending message -->
<span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
<!-- render message as markdown -->
<vue-markdown v-else :source="msg.content"></vue-markdown>
<!-- render timings if enabled -->
<div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
<div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>
<div class="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
<b>Prompt</b><br/>
- Tokens: {{ timings.prompt_n }}<br/>
- Time: {{ timings.prompt_ms }} ms<br/>
- Speed: {{ timings.prompt_per_second.toFixed(1) }} t/s<br/>
<b>Generation</b><br/>
- Tokens: {{ timings.predicted_n }}<br/>
- Time: {{ timings.predicted_ms }} ms<br/>
- Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s<br/>
</div>
</div>
</template>
</div>
</div>
<!-- actions for each message -->
<div :class="{'text-right': msg.role === 'user', 'opacity-0': isGenerating}" class="mx-4 mt-2 mb-2">
<!-- user message -->
<button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingContent = msg.content" :disabled="isGenerating">
✍️ Edit
</button>
<!-- assistant message -->
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
🔄 Regenerate
</button>
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg()" :disabled="isGenerating">
📋 Copy
</button>
</div>
</template>
<!-- Template to be used by settings modal --> <!-- Template to be used by settings modal -->
<template id="settings-modal-short-input"> <template id="settings-modal-short-input">
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2"> <label class="input input-bordered join-item grow flex items-center gap-2 mb-2">

View File

@ -13,6 +13,7 @@
"markdown-it": "^14.1.0", "markdown-it": "^14.1.0",
"postcss": "^8.4.49", "postcss": "^8.4.49",
"tailwindcss": "^3.4.15", "tailwindcss": "^3.4.15",
"textlinestream": "^1.1.1",
"vite-plugin-singlefile": "^2.0.3", "vite-plugin-singlefile": "^2.0.3",
"vue": "^3.5.13" "vue": "^3.5.13"
}, },
@ -2677,6 +2678,12 @@
"node": ">=14.0.0" "node": ">=14.0.0"
} }
}, },
"node_modules/textlinestream": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/textlinestream/-/textlinestream-1.1.1.tgz",
"integrity": "sha512-iBHbi7BQxrFmwZUQJsT0SjNzlLLsXhvW/kg7EyOMVMBIrlnj/qYofwo1LVLZi+3GbUEo96Iu2eqToI2+lZoAEQ==",
"license": "MIT"
},
"node_modules/uc.micro": { "node_modules/uc.micro": {
"version": "2.1.0", "version": "2.1.0",
"resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz", "resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz",

View File

@ -17,6 +17,7 @@
"markdown-it": "^14.1.0", "markdown-it": "^14.1.0",
"postcss": "^8.4.49", "postcss": "^8.4.49",
"tailwindcss": "^3.4.15", "tailwindcss": "^3.4.15",
"textlinestream": "^1.1.1",
"vite-plugin-singlefile": "^2.0.3", "vite-plugin-singlefile": "^2.0.3",
"vue": "^3.5.13" "vue": "^3.5.13"
} }

View File

@ -1,225 +0,0 @@
const paramDefaults = {
stream: true,
temperature: 0.2,
};
let generation_settings = null;
export class CompletionError extends Error {
constructor(message, name, data) {
super(message);
this.name = name;
}
};
// Completes the prompt as a generator. Recommended for most use cases.
//
// Example:
//
// import { llama } from '/completion.js'
//
// const request = llama("Tell me a joke", {n_predict: 800})
// for await (const chunk of request) {
// document.write(chunk.data.content)
// }
//
export async function* llama(prompt, params = {}, config = {}) {
let controller = config.controller;
const api_url = config.api_url?.replace(/\/+$/, '') || "";
if (!controller) {
controller = new AbortController();
}
const completionParams = { ...paramDefaults, ...params, prompt };
const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
method: 'POST',
body: JSON.stringify(completionParams),
headers: {
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Accept': 'text/event-stream',
...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
},
signal: controller.signal,
});
const status = response.status;
if (status !== 200) {
try {
const body = await response.json();
if (body && body.error && body.error.message) {
throw new CompletionError(body.error.message, 'ServerError');
}
} catch (err) {
throw new CompletionError(err.message, 'ServerError');
}
}
const reader = response.body.getReader();
const decoder = new TextDecoder();
let content = "";
let leftover = ""; // Buffer for partially read lines
try {
let cont = true;
while (cont) {
const result = await reader.read();
if (result.done) {
break;
}
// Add any leftover data to the current chunk of data
const text = leftover + decoder.decode(result.value);
// Check if the last character is a line break
const endsWithLineBreak = text.endsWith('\n');
// Split the text into lines
let lines = text.split('\n');
// If the text doesn't end with a line break, then the last line is incomplete
// Store it in leftover to be added to the next chunk of data
if (!endsWithLineBreak) {
leftover = lines.pop();
} else {
leftover = ""; // Reset leftover if we have a line break at the end
}
// Parse all sse events and add them to result
const regex = /^(\S+):\s(.*)$/gm;
for (const line of lines) {
const match = regex.exec(line);
if (match) {
result[match[1]] = match[2];
if (result.data === '[DONE]') {
cont = false;
break;
}
// since we know this is llama.cpp, let's just decode the json in data
if (result.data) {
result.data = JSON.parse(result.data);
content += result.data.content;
// yield
yield result;
// if we got a stop token from server, we will break here
if (result.data.stop) {
if (result.data.generation_settings) {
generation_settings = result.data.generation_settings;
}
cont = false;
break;
}
}
if (result.error) {
try {
result.error = JSON.parse(result.error);
if (result.error.message.includes('slot unavailable')) {
// Throw an error to be caught by upstream callers
throw new Error('slot unavailable');
} else {
console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
}
} catch(e) {
console.error(`llama.cpp error ${result.error}`)
}
}
}
}
}
} catch (e) {
if (e.name !== 'AbortError') {
console.error("llama error: ", e);
}
throw e;
}
finally {
controller.abort();
}
return content;
}
// Call llama, return an event target that you can subscribe to
//
// Example:
//
// import { llamaEventTarget } from '/completion.js'
//
// const conn = llamaEventTarget(prompt)
// conn.addEventListener("message", (chunk) => {
// document.write(chunk.detail.content)
// })
//
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
const eventTarget = new EventTarget();
(async () => {
let content = "";
for await (const chunk of llama(prompt, params, config)) {
if (chunk.data) {
content += chunk.data.content;
eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
}
if (chunk.data.generation_settings) {
eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
}
if (chunk.data.timings) {
eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
}
}
eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
})();
return eventTarget;
}
// Call llama, return a promise that resolves to the completed text. This does not support streaming
//
// Example:
//
// llamaPromise(prompt).then((content) => {
// document.write(content)
// })
//
// or
//
// const content = await llamaPromise(prompt)
// document.write(content)
//
export const llamaPromise = (prompt, params = {}, config = {}) => {
return new Promise(async (resolve, reject) => {
let content = "";
try {
for await (const chunk of llama(prompt, params, config)) {
content += chunk.data.content;
}
resolve(content);
} catch (error) {
reject(error);
}
});
};
/**
* (deprecated)
*/
export const llamaComplete = async (params, controller, callback) => {
for await (const chunk of llama(params.prompt, params, { controller })) {
callback(chunk);
}
}
// Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async (config = {}) => {
if (!generation_settings) {
const api_url = config.api_url?.replace(/\/+$/, '') || "";
const props = await fetch(`${api_url}/props`).then(r => r.json());
generation_settings = props.default_generation_settings;
}
return generation_settings;
}

View File

@ -1,21 +1,25 @@
import './styles.css'; import './styles.css';
import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js'; import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
import { llama } from './completion.js';
import MarkdownIt from 'markdown-it'; import MarkdownIt from 'markdown-it';
import TextLineStream from 'textlinestream';
const isDev = import.meta.env.MODE === 'development';
// utility functions // utility functions
const isString = (x) => !!x.toLowerCase; const isString = (x) => !!x.toLowerCase;
const isNumeric = (n) => !isString(n) && !isNaN(n); const isBoolean = (x) => x === true || x === false;
const isNumeric = (n) => !isString(n) && !isNaN(n) && !isBoolean(n);
const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;'); const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
const copyStr = (str) => navigator.clipboard.writeText(str); const copyStr = (str) => navigator.clipboard.writeText(str);
// constants // constants
const BASE_URL = localStorage.getItem('base') // for debugging const BASE_URL = localStorage.getItem('base') // for debugging
|| (new URL('.', document.baseURI).href).toString(); // for production || (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
const CONFIG_DEFAULT = { const CONFIG_DEFAULT = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value. // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
apiKey: '', apiKey: '',
systemMessage: 'You are a helpful assistant.', systemMessage: 'You are a helpful assistant.',
showTokensPerSecond: false,
// make sure these default values are in sync with `common.h` // make sure these default values are in sync with `common.h`
samplers: 'dkypmxt', samplers: 'dkypmxt',
temperature: 0.8, temperature: 0.8,
@ -101,6 +105,48 @@ const SettingsModalShortInput = defineComponent({
}, },
}); });
// message bubble component
const MessageBubble = defineComponent({
components: {
VueMarkdown
},
template: document.getElementById('message-bubble').innerHTML,
props: {
config: Object,
msg: Object,
isGenerating: Boolean,
editUserMsgAndRegenerate: Function,
regenerateMsg: Function,
},
data() {
return {
editingContent: null,
};
},
computed: {
timings() {
if (!this.msg.timings) return null;
return {
...this.msg.timings,
prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
};
}
},
methods: {
copyMsg() {
copyStr(this.msg.content);
},
editMsg() {
this.editUserMsgAndRegenerate({
...this.msg,
content: this.editingContent,
});
this.editingContent = null;
},
},
});
// coversations is stored in localStorage // coversations is stored in localStorage
// format: { [convId]: { id: string, lastModified: number, messages: [...] } } // format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-' // convId is a string prefixed with 'conv-'
@ -192,10 +238,29 @@ const chatScrollToBottom = (requiresNearBottom) => {
} }
}; };
// wrapper for SSE
async function* sendSSEPostRequest(url, fetchOptions) {
const res = await fetch(url, fetchOptions);
const lines = res.body
.pipeThrough(new TextDecoderStream())
.pipeThrough(new TextLineStream());
for await (const line of lines) {
if (isDev) console.log({line});
if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
const data = JSON.parse(line.slice(5));
yield data;
} else if (line.startsWith('error:')) {
const data = JSON.parse(line.slice(6));
throw new Error(data.message || 'Unknown error');
}
}
};
const mainApp = createApp({ const mainApp = createApp({
components: { components: {
VueMarkdown, VueMarkdown,
SettingsModalShortInput, SettingsModalShortInput,
MessageBubble,
}, },
data() { data() {
return { return {
@ -209,7 +274,6 @@ const mainApp = createApp({
selectedTheme: StorageUtils.getTheme(), selectedTheme: StorageUtils.getTheme(),
config: StorageUtils.getConfig(), config: StorageUtils.getConfig(),
showConfigDialog: false, showConfigDialog: false,
editingMsg: null,
// const // const
themes: THEMES, themes: THEMES,
configDefault: {...CONFIG_DEFAULT}, configDefault: {...CONFIG_DEFAULT},
@ -226,6 +290,15 @@ const mainApp = createApp({
}); });
resizeObserver.observe(pendingMsgElem); resizeObserver.observe(pendingMsgElem);
}, },
watch: {
viewingConvId: function(val, oldVal) {
if (val != oldVal) {
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
}
}
},
methods: { methods: {
hideSidebar() { hideSidebar() {
document.getElementById('toggle-drawer').checked = false; document.getElementById('toggle-drawer').checked = false;
@ -237,18 +310,10 @@ const mainApp = createApp({
newConversation() { newConversation() {
if (this.isGenerating) return; if (this.isGenerating) return;
this.viewingConvId = StorageUtils.getNewConvId(); this.viewingConvId = StorageUtils.getNewConvId();
this.editingMsg = null;
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
}, },
setViewingConv(convId) { setViewingConv(convId) {
if (this.isGenerating) return; if (this.isGenerating) return;
this.viewingConvId = convId; this.viewingConvId = convId;
this.editingMsg = null;
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
}, },
deleteConv(convId) { deleteConv(convId) {
if (this.isGenerating) return; if (this.isGenerating) return;
@ -256,7 +321,6 @@ const mainApp = createApp({
StorageUtils.remove(convId); StorageUtils.remove(convId);
if (this.viewingConvId === convId) { if (this.viewingConvId === convId) {
this.viewingConvId = StorageUtils.getNewConvId(); this.viewingConvId = StorageUtils.getNewConvId();
this.editingMsg = null;
} }
this.fetchConversation(); this.fetchConversation();
this.fetchMessages(); this.fetchMessages();
@ -291,7 +355,6 @@ const mainApp = createApp({
this.fetchConversation(); this.fetchConversation();
this.fetchMessages(); this.fetchMessages();
this.inputMsg = ''; this.inputMsg = '';
this.editingMsg = null;
this.generateMessage(currConvId); this.generateMessage(currConvId);
chatScrollToBottom(); chatScrollToBottom();
}, },
@ -299,7 +362,6 @@ const mainApp = createApp({
if (this.isGenerating) return; if (this.isGenerating) return;
this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null }; this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
this.isGenerating = true; this.isGenerating = true;
this.editingMsg = null;
try { try {
const abortController = new AbortController(); const abortController = new AbortController();
@ -330,17 +392,21 @@ const mainApp = createApp({
dry_allowed_length: this.config.dry_allowed_length, dry_allowed_length: this.config.dry_allowed_length,
dry_penalty_last_n: this.config.dry_penalty_last_n, dry_penalty_last_n: this.config.dry_penalty_last_n,
max_tokens: this.config.max_tokens, max_tokens: this.config.max_tokens,
timings_per_token: !!this.config.showTokensPerSecond,
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}), ...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
}; };
const config = { const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
controller: abortController, method: 'POST',
api_url: BASE_URL, headers: {
endpoint: '/chat/completions', 'Content-Type': 'application/json',
}; 'Authorization': this.config.apiKey ? `Bearer ${this.config.apiKey}` : undefined,
for await (const chunk of llama(prompt, params, config)) { },
const stop = chunk.data.stop; body: JSON.stringify(params),
const addedContent = chunk.data.choices[0].delta.content; signal: abortController.signal,
});
for await (const chunk of chunks) {
const stop = chunk.stop;
const addedContent = chunk.choices[0].delta.content;
const lastContent = this.pendingMsg.content || ''; const lastContent = this.pendingMsg.content || '';
if (addedContent) { if (addedContent) {
this.pendingMsg = { this.pendingMsg = {
@ -349,6 +415,16 @@ const mainApp = createApp({
content: lastContent + addedContent, content: lastContent + addedContent,
}; };
} }
const timings = chunk.timings;
if (timings && this.config.showTokensPerSecond) {
// only extract what's really needed, to save some space
this.pendingMsg.timings = {
prompt_n: timings.prompt_n,
prompt_ms: timings.prompt_ms,
predicted_n: timings.predicted_n,
predicted_ms: timings.predicted_ms,
};
}
} }
StorageUtils.appendMsg(currConvId, this.pendingMsg); StorageUtils.appendMsg(currConvId, this.pendingMsg);
@ -387,14 +463,10 @@ const mainApp = createApp({
this.fetchMessages(); this.fetchMessages();
this.generateMessage(currConvId); this.generateMessage(currConvId);
}, },
copyMsg(msg) {
copyStr(msg.content);
},
editUserMsgAndRegenerate(msg) { editUserMsgAndRegenerate(msg) {
if (this.isGenerating) return; if (this.isGenerating) return;
const currConvId = this.viewingConvId; const currConvId = this.viewingConvId;
const newContent = msg.content; const newContent = msg.content;
this.editingMsg = null;
StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id); StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
StorageUtils.appendMsg(currConvId, { StorageUtils.appendMsg(currConvId, {
id: Date.now(), id: Date.now(),

View File

@ -394,7 +394,7 @@ int main(int raw_argc, char ** raw_argv) {
} }
if (show_token_count) { if (show_token_count) {
printf("Total number of tokens: %ld\n", tokens.size()); printf("Total number of tokens: %zu\n", tokens.size());
} }
// silence valgrind // silence valgrind
llama_free(ctx); llama_free(ctx);

View File

@ -32,6 +32,13 @@ else()
endif() endif()
endif() endif()
# remove the lib prefix on win32 mingw
if (WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX "")
set(CMAKE_SHARED_LIBRARY_PREFIX "")
set(CMAKE_SHARED_MODULE_PREFIX "")
endif()
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF) option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
@ -172,6 +179,11 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture") "ggml: sycl device architecture")
option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
# extra artifacts # extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE}) option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE}) option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})

View File

@ -228,6 +228,7 @@ extern "C" {
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries // Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void); GGML_API void ggml_backend_load_all(void);
GGML_API void ggml_backend_load_all_from_path(const char * dir_path);
// //
// Backend scheduler // Backend scheduler

View File

@ -0,0 +1,26 @@
#ifndef GGML_OPENCL_H
#define GGML_OPENCL_H
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
//
// backend API
//
GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);
#ifdef __cplusplus
}
#endif
#endif // GGML_OPENCL_H

View File

@ -238,6 +238,8 @@
#define GGML_EXIT_ABORTED 1 #define GGML_EXIT_ABORTED 1
#define GGML_ROPE_TYPE_NEOX 2 #define GGML_ROPE_TYPE_NEOX 2
#define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24
#define GGUF_MAGIC "GGUF" #define GGUF_MAGIC "GGUF"
@ -1443,6 +1445,22 @@ extern "C" {
float beta_fast, float beta_fast,
float beta_slow); float beta_slow);
GGML_API struct ggml_tensor * ggml_rope_multi(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[4],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow);
// in-place, returns view(a) // in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_ext_inplace( GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,

View File

@ -194,11 +194,6 @@ endif()
if (WIN32) if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS) add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
if (BUILD_SHARED_LIBS)
# TODO: should not use this
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
endif() endif()
# ggml # ggml
@ -313,6 +308,7 @@ ggml_add_backend(MUSA)
ggml_add_backend(RPC) ggml_add_backend(RPC)
ggml_add_backend(SYCL) ggml_add_backend(SYCL)
ggml_add_backend(Vulkan) ggml_add_backend(Vulkan)
ggml_add_backend(OpenCL)
foreach (target ggml-base ggml) foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>) target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

View File

@ -46,6 +46,10 @@
#include "ggml-vulkan.h" #include "ggml-vulkan.h"
#endif #endif
#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif
#ifdef GGML_USE_BLAS #ifdef GGML_USE_BLAS
#include "ggml-blas.h" #include "ggml-blas.h"
#endif #endif
@ -146,6 +150,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_VULKAN #ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg()); register_backend(ggml_backend_vk_reg());
#endif #endif
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif
#ifdef GGML_USE_CANN #ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg()); register_backend(ggml_backend_cann_reg());
#endif #endif
@ -449,11 +456,21 @@ static std::string backend_filename_suffix() {
#endif #endif
} }
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) { static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths // TODO: search system paths
std::vector<std::string> search_paths = { "./", get_executable_path() };
std::string file_prefix = backend_filename_prefix() + name + "-"; std::string file_prefix = backend_filename_prefix() + name + "-";
std::vector<std::string> search_paths;
if (user_search_path == nullptr) {
search_paths.push_back("./");
search_paths.push_back(get_executable_path());
} else {
#if defined(_WIN32)
search_paths.push_back(std::string(user_search_path) + "\\");
#else
search_paths.push_back(std::string(user_search_path) + "/");
#endif
}
int best_score = 0; int best_score = 0;
std::string best_path; std::string best_path;
@ -463,7 +480,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
if (!fs::exists(search_path)) { if (!fs::exists(search_path)) {
continue; continue;
} }
for (const auto & entry : fs::directory_iterator(search_path)) { fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file()) { if (entry.is_regular_file()) {
std::string filename = entry.path().filename().string(); std::string filename = entry.path().filename().string();
std::string ext = entry.path().extension().string(); std::string ext = entry.path().extension().string();
@ -509,21 +527,26 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
} }
void ggml_backend_load_all() { void ggml_backend_load_all() {
ggml_backend_load_all_from_path(nullptr);
}
void ggml_backend_load_all_from_path(const char * dir_path) {
#ifdef NDEBUG #ifdef NDEBUG
bool silent = true; bool silent = true;
#else #else
bool silent = false; bool silent = false;
#endif #endif
ggml_backend_load_best("blas", silent); ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent); ggml_backend_load_best("cann", silent, dir_path);
ggml_backend_load_best("cuda", silent); ggml_backend_load_best("cuda", silent, dir_path);
ggml_backend_load_best("hip", silent); ggml_backend_load_best("hip", silent, dir_path);
ggml_backend_load_best("kompute", silent); ggml_backend_load_best("kompute", silent, dir_path);
ggml_backend_load_best("metal", silent); ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent); ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent); ggml_backend_load_best("sycl", silent, dir_path);
ggml_backend_load_best("vulkan", silent); ggml_backend_load_best("vulkan", silent, dir_path);
ggml_backend_load_best("musa", silent); ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("cpu", silent); ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("cpu", silent, dir_path);
} }

View File

@ -1747,6 +1747,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
if (*ext_factor != 0) { if (*ext_factor != 0) {
return false; return false;
} }
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
return true; return true;
} }
case GGML_OP_UPSCALE: { case GGML_OP_UPSCALE: {

View File

@ -473,7 +473,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
GGML_TABLE_END() GGML_TABLE_END()
//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics //#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
GGML_TABLE_BEGIN(uint64_t, ksigns64, 128) GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff, 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff, 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,

View File

@ -122,7 +122,7 @@ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_ty
} }
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * data = aligned_alloc(TENSOR_ALIGNMENT, size); void * data = ggml_aligned_malloc(size);
if (data == NULL) { if (data == NULL) {
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL; return NULL;

View File

@ -126,8 +126,7 @@ struct ggml_arm_arch_features_type {
#endif #endif
#include <windows.h> #include <windows.h>
#if defined(_MSC_VER) && !defined(__clang__)
#if !defined(__clang__)
#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE)) #define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
typedef volatile LONG atomic_int; typedef volatile LONG atomic_int;
@ -469,7 +468,7 @@ const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
} \ } \
(res) = GGML_F32x4_REDUCE_ONE((x)[0]); \ (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
} }
#define GGML_F32_VEC GGML_F32x4 #define GGML_F32_VEC GGML_F32x4
@ -2396,7 +2395,7 @@ static void ggml_init_arm_arch_features(void) {
uint32_t hwcap2 = getauxval(AT_HWCAP2); uint32_t hwcap2 = getauxval(AT_HWCAP2);
ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD); ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
ggml_arm_arch_features.has_dotprod = !!(hwcap && HWCAP_ASIMDDP); ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE); ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
@ -9134,6 +9133,64 @@ static void ggml_rope_cache_init(
} }
} }
static void ggml_mrope_cache_init(
float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
float * cache, float sin_sign, float theta_scale) {
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
float theta_t = theta_base_t;
float theta_h = theta_base_h;
float theta_w = theta_base_w;
float theta_e = theta_base_e; // extra position id for vision encoder
int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
int sec_w = sections[1] + sections[0];
int sec_e = sections[2] + sec_w;
GGML_ASSERT(sect_dims <= ne0);
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
int sector = (i0 / 2) % sect_dims;
if (indep_sects) {
// compute theta independently for each dim sections
// (i.e. reset corresponding theta when `i0` go from one section to another)
if (sector == 0) {
theta_t = theta_base_t;
}
else if (sector == sections[0]) {
theta_h = theta_base_h;;
}
else if (sector == sec_w) {
theta_w = theta_base_w;
}
else if (sector == sec_e) {
theta_e = theta_base_e;
}
}
float theta = theta_t;
if (sector >= sections[0] && sector < sec_w) {
theta = theta_h;
}
else if (sector >= sec_w && sector < sec_w + sections[2]) {
theta = theta_w;
}
else if (sector >= sec_w + sections[2]) {
theta = theta_e;
}
rope_yarn(
theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
);
cache[i0 + 1] *= sin_sign;
theta_t *= theta_scale;
theta_w *= theta_scale;
theta_h *= theta_scale;
theta_e *= theta_scale;
}
}
static void ggml_compute_forward_rope_f32( static void ggml_compute_forward_rope_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor * dst, struct ggml_tensor * dst,
@ -9144,6 +9201,7 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_tensor * src2 = dst->src[2]; const struct ggml_tensor * src2 = dst->src[2];
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
int sections[4];
//const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
@ -9157,6 +9215,7 @@ static void ggml_compute_forward_rope_f32(
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
GGML_TENSOR_UNARY_OP_LOCALS GGML_TENSOR_UNARY_OP_LOCALS
@ -9189,6 +9248,16 @@ static void ggml_compute_forward_rope_f32(
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
if (is_mrope) {
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
}
if (is_vision) {
GGML_ASSERT(n_dims == ne0/2);
}
const float * freq_factors = NULL; const float * freq_factors = NULL;
if (src2 != NULL) { if (src2 != NULL) {
@ -9204,30 +9273,44 @@ static void ggml_compute_forward_rope_f32(
const int32_t * pos = (const int32_t *) src1->data; const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
for (int64_t i2 = 0; i2 < ne2; i2++) { for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
const int64_t p = pos[i2];
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
if (!is_mrope) {
const int64_t p = pos[i2];
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
else {
const int64_t p_t = pos[i2];
const int64_t p_h = pos[i2 + ne2];
const int64_t p_w = pos[i2 + ne2 * 2];
const int64_t p_e = pos[i2 + ne2 * 3];
ggml_mrope_cache_init(
p_t, p_h, p_w, p_e, sections, is_vision,
freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
for (int64_t i1 = 0; i1 < ne1; i1++) { for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
if (ir++ < ir0) continue; if (ir++ < ir0) continue;
if (ir > ir1) break; if (ir > ir1) break;
if (!is_neox) { if (is_neox || is_mrope) {
if (is_vision){
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const int64_t ic = i0/2;
const float cos_theta = cache[i0 + 0]; const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1]; const float sin_theta = cache[i0 + 1];
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = src[0]; const float x0 = src[0];
const float x1 = src[1]; const float x1 = src[n_dims];
dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[1] = x0*sin_theta + x1*cos_theta; dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
} }
} else { } else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
@ -9246,7 +9329,40 @@ static void ggml_compute_forward_rope_f32(
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
} }
} }
} else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[1];
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[1] = x0*sin_theta + x1*cos_theta;
}
}
if (is_vision) {
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const int64_t ic = i0/2;
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = src[0];
const float x1 = src[n_dims];
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
}
} else {
// fill the remain channels with data from src tensor
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@ -9257,6 +9373,7 @@ static void ggml_compute_forward_rope_f32(
} }
} }
} }
}
} }
// TODO: deduplicate f16/f32 code // TODO: deduplicate f16/f32 code
@ -9270,6 +9387,7 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_tensor * src2 = dst->src[2]; const struct ggml_tensor * src2 = dst->src[2];
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
int sections[4];
//const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
@ -9282,6 +9400,8 @@ static void ggml_compute_forward_rope_f16(
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
GGML_TENSOR_UNARY_OP_LOCALS GGML_TENSOR_UNARY_OP_LOCALS
@ -9314,6 +9434,16 @@ static void ggml_compute_forward_rope_f16(
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
if (is_mrope) {
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
}
if (is_vision) {
GGML_ASSERT(n_dims == ne0/2);
}
const float * freq_factors = NULL; const float * freq_factors = NULL;
if (src2 != NULL) { if (src2 != NULL) {
@ -9331,28 +9461,42 @@ static void ggml_compute_forward_rope_f16(
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) { for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t p = pos[i2];
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
if (!is_mrope) {
const int64_t p = pos[i2];
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
else {
const int64_t p_t = pos[i2];
const int64_t p_h = pos[i2 + ne2];
const int64_t p_w = pos[i2 + ne2 * 2];
const int64_t p_e = pos[i2 + ne2 * 3];
ggml_mrope_cache_init(
p_t, p_h, p_w, p_e, sections, is_vision,
freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
for (int64_t i1 = 0; i1 < ne1; i1++) { for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue; if (ir++ < ir0) continue;
if (ir > ir1) break; if (ir > ir1) break;
if (!is_neox) { if (is_neox || is_mrope) {
if (is_vision) {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const int64_t ic = i0/2;
const float cos_theta = cache[i0 + 0]; const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1]; const float sin_theta = cache[i0 + 1];
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = GGML_FP16_TO_FP32(src[0]); const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[1]); const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
} }
} else { } else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
@ -9371,7 +9515,39 @@ static void ggml_compute_forward_rope_f16(
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
} }
} }
} else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[1]);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
}
}
if (is_vision) {
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const int64_t ic = i0/2;
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
}
} else {
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@ -9382,6 +9558,7 @@ static void ggml_compute_forward_rope_f16(
} }
} }
} }
}
} }
static void ggml_compute_forward_rope( static void ggml_compute_forward_rope(
@ -12945,7 +13122,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
#include "windows.h" #include "windows.h"
// TODO: support > 64 CPUs // TODO: support > 64 CPUs
bool ggml_thread_apply_affinity(bool * mask) { static bool ggml_thread_apply_affinity(bool * mask) {
HANDLE h = GetCurrentThread(); HANDLE h = GetCurrentThread();
uint64_t bitmask = 0ULL; uint64_t bitmask = 0ULL;

View File

@ -41,28 +41,28 @@
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons #define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
#define CC_PASCAL 600 #define GGML_CUDA_CC_PASCAL 600
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA 700 #define GGML_CUDA_CC_VOLTA 700
#define CC_TURING 750 #define GGML_CUDA_CC_TURING 750
#define CC_AMPERE 800 #define GGML_CUDA_CC_AMPERE 800
#define CC_OFFSET_AMD 1000000 #define GGML_CUDA_CC_OFFSET_AMD 1000000
// GCN/CNDA, wave size is 64 // GCN/CNDA, wave size is 64
#define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16 #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
#define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
#define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
#define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers #define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
#define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing #define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing
#define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300 #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300
// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32 // RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000 #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
#define CC_QY1 210 #define GGML_CUDA_CC_QY1 210
#define CC_QY2 220 #define GGML_CUDA_CC_QY2 220
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
@ -131,36 +131,36 @@ typedef float dfloat; // dequantize float
typedef float2 dfloat2; typedef float2 dfloat2;
#endif // GGML_CUDA_F16 #endif // GGML_CUDA_F16
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#define FP16_AVAILABLE #define FP16_AVAILABLE
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE #define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#define FP16_MMA_AVAILABLE #define FP16_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define INT8_MMA_AVAILABLE #define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1) #if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
#define FLASH_ATTN_AVAILABLE #define FLASH_ATTN_AVAILABLE
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1) #endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
static constexpr bool fast_fp16_available(const int cc) { static constexpr bool fast_fp16_available(const int cc) {
return cc >= CC_PASCAL && cc != 610; return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
} }
static constexpr bool fp16_mma_available(const int cc) { static constexpr bool fp16_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_VOLTA; return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
} }
static constexpr bool int8_mma_available(const int cc) { static constexpr bool int8_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_TURING; return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
} }
[[noreturn]] [[noreturn]]
@ -187,7 +187,7 @@ static __device__ void no_device_code(
#endif // __CUDA_ARCH__ #endif // __CUDA_ARCH__
static __device__ __forceinline__ int warp_reduce_sum(int x) { static __device__ __forceinline__ int warp_reduce_sum(int x) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
return __reduce_add_sync(0xffffffff, x); return __reduce_add_sync(0xffffffff, x);
#else #else
#pragma unroll #pragma unroll
@ -195,7 +195,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
x += __shfl_xor_sync(0xffffffff, x, offset, 32); x += __shfl_xor_sync(0xffffffff, x, offset, 32);
} }
return x; return x;
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
} }
static __device__ __forceinline__ float warp_reduce_sum(float x) { static __device__ __forceinline__ float warp_reduce_sum(float x) {
@ -284,7 +284,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
} }
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#pragma unroll #pragma unroll
for (int offset = 16; offset > 0; offset >>= 1) { for (int offset = 16; offset > 0; offset >>= 1) {
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32)); x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
@ -293,7 +293,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#else #else
GGML_UNUSED(x); GGML_UNUSED(x);
NO_DEVICE_CODE; NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
} }
#if CUDART_VERSION < CUDART_HMASK #if CUDART_VERSION < CUDART_HMASK
@ -333,13 +333,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if __CUDA_ARCH__ >= MIN_CC_DP4A #if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
return __dp4a(a, b, c); return __dp4a(a, b, c);
#else // __CUDA_ARCH__ >= MIN_CC_DP4A #else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
const int8_t * a8 = (const int8_t *) &a; const int8_t * a8 = (const int8_t *) &a;
const int8_t * b8 = (const int8_t *) &b; const int8_t * b8 = (const int8_t *) &b;
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3]; return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
} }

View File

@ -94,7 +94,9 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, int n
} }
// non-contiguous kernel (slow) // non-contiguous kernel (slow)
static __global__ void concat_f32_non_cont( template <int dim>
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
concat_f32_non_cont(
const char * src0, const char * src0,
const char * src1, const char * src1,
char * dst, char * dst,
@ -121,22 +123,28 @@ static __global__ void concat_f32_non_cont(
uint64_t nb0, uint64_t nb0,
uint64_t nb1, uint64_t nb1,
uint64_t nb2, uint64_t nb2,
uint64_t nb3, uint64_t nb3){
int32_t dim) { static_assert(dim >= 0 && dim <= 3);
const int64_t i3 = blockIdx.z; const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y; const int64_t i2 = blockIdx.y;
const int64_t i1 = blockIdx.x; const int64_t i1 = blockIdx.x;
int64_t o[4] = {0, 0, 0, 0};
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
const float * x; const float * x;
for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) { for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00); x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
} else { } else {
x = (const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10); if constexpr (dim == 0) {
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
} else if constexpr (dim == 1) {
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
} else if constexpr (dim == 2) {
x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
} else if constexpr (dim == 3) {
x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
}
} }
float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@ -182,15 +190,32 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
} }
} else { } else {
dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]); dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>( auto launch_kernel = [&](auto dim) {
(const char *)src0->data, concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
(const char *)src1->data, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
( char *)dst->data,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim); dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
};
switch (dim) {
case 0:
launch_kernel(std::integral_constant<int, 0>{});
break;
case 1:
launch_kernel(std::integral_constant<int, 1>{});
break;
case 2:
launch_kernel(std::integral_constant<int, 2>{});
break;
case 3:
launch_kernel(std::integral_constant<int, 3>{});
break;
default:
GGML_ABORT("Invalid dim: %d", dim);
break;
}
} }
} }

View File

@ -26,7 +26,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
template <bool need_check> template <bool need_check>
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) { static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
#if __CUDA_ARCH__ >= CC_PASCAL #if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE; constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x; const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
@ -64,7 +64,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
GGML_UNUSED(y); GGML_UNUSED(y);
GGML_UNUSED(k); GGML_UNUSED(k);
NO_DEVICE_CODE; NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_PASCAL #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
} }
template<typename dst_t> template<typename dst_t>
@ -599,7 +599,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_1:
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>; return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) { if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_PASCAL) {
return dequantize_block_q8_0_f16_cuda; return dequantize_block_q8_0_f16_cuda;
} }
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>; return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;

View File

@ -304,7 +304,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV); const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
// On AMD the tile kernels perform poorly, use the vec kernel instead: // On AMD the tile kernels perform poorly, use the vec kernel instead:
if (cc >= CC_OFFSET_AMD) { if (cc >= GGML_CUDA_CC_OFFSET_AMD) {
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
} else { } else {

View File

@ -177,7 +177,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].smpb = prop.sharedMemPerBlock; info.devices[id].smpb = prop.sharedMemPerBlock;
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
info.devices[id].smpbo = prop.sharedMemPerBlock; info.devices[id].smpbo = prop.sharedMemPerBlock;
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
#else #else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor; info.devices[id].cc = 100*prop.major + 10*prop.minor;
@ -1081,7 +1081,7 @@ static void ggml_cuda_op_mul_mat_cublas(
const int compute_capability = ggml_cuda_info().devices[id].cc; const int compute_capability = ggml_cuda_info().devices[id].cc;
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { if (compute_capability >= GGML_CUDA_CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id)); ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
if (src0->type != GGML_TYPE_F16) { if (src0->type != GGML_TYPE_F16) {
@ -1108,7 +1108,7 @@ static void ggml_cuda_op_mul_mat_cublas(
const half beta_f16 = 0.0f; const half beta_f16 = 0.0f;
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) { if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
cu_compute_type = CUBLAS_COMPUTE_32F; cu_compute_type = CUBLAS_COMPUTE_32F;
} }
@ -1612,7 +1612,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
cudaDataType_t cu_data_type = CUDA_R_16F; cudaDataType_t cu_data_type = CUDA_R_16F;
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) { if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
cu_compute_type = CUBLAS_COMPUTE_32F; cu_compute_type = CUBLAS_COMPUTE_32F;
} }
@ -2357,7 +2357,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
std::vector<void *> ggml_cuda_cpy_fn_ptrs; std::vector<void *> ggml_cuda_cpy_fn_ptrs;
if (cuda_ctx->cuda_graph->graph == nullptr) { if (cuda_ctx->cuda_graph->graph == nullptr) {
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG #ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__); GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
@ -3028,7 +3028,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return true; return true;
} }
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; return cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
} }
case GGML_OP_CROSS_ENTROPY_LOSS: case GGML_OP_CROSS_ENTROPY_LOSS:
case GGML_OP_CROSS_ENTROPY_LOSS_BACK: case GGML_OP_CROSS_ENTROPY_LOSS_BACK:

View File

@ -171,7 +171,7 @@ struct mma_int_C_I16J8 {
__device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) { __device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) {
#ifdef INT8_MMA_AVAILABLE #ifdef INT8_MMA_AVAILABLE
#if __CUDA_ARCH__ >= CC_AMPERE #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};" asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3]) : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0])); : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0]));
@ -183,7 +183,7 @@ struct mma_int_C_I16J8 {
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};" asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[2]), "+r"(x[3]) : "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[1]), "r"(mma_B.x[0])); : "r"(mma_A.x[1]), "r"(mma_B.x[0]));
#endif // __CUDA_ARCH__ >= CC_AMPERE #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#else #else
GGML_UNUSED(mma_A); GGML_UNUSED(mma_A);
GGML_UNUSED(mma_B); GGML_UNUSED(mma_B);
@ -193,7 +193,7 @@ struct mma_int_C_I16J8 {
__device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) { __device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) {
#ifdef INT8_MMA_AVAILABLE #ifdef INT8_MMA_AVAILABLE
#if __CUDA_ARCH__ >= CC_AMPERE #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};" asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3]) : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_A.x[2]), "r"(mma_A.x[3]), "r"(mma_B.x[0]), "r"(mma_B.x[1])); : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_A.x[2]), "r"(mma_A.x[3]), "r"(mma_B.x[0]), "r"(mma_B.x[1]));
@ -211,7 +211,7 @@ struct mma_int_C_I16J8 {
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};" asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[2]), "+r"(x[3]) : "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[3]), "r"(mma_B.x[1])); : "r"(mma_A.x[3]), "r"(mma_B.x[1]));
#endif // __CUDA_ARCH__ >= CC_AMPERE #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#else #else
GGML_UNUSED(mma_A); GGML_UNUSED(mma_A);
GGML_UNUSED(mma_B); GGML_UNUSED(mma_B);

View File

@ -27,7 +27,7 @@ void ggml_cuda_op_mul_mat_q(
// The stream-k decomposition is only faster for recent NVIDIA GPUs. // The stream-k decomposition is only faster for recent NVIDIA GPUs.
// Also its fixup needs to allocate a temporary buffer in the memory pool. // Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11; const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
switch (src0->type) { switch (src0->type) {
@ -136,7 +136,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return true; return true;
} }
if (cc < MIN_CC_DP4A) { if (cc < GGML_CUDA_CC_DP4A) {
return false; return false;
} }
@ -144,9 +144,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return true; return true;
#endif //GGML_CUDA_FORCE_MMQ #endif //GGML_CUDA_FORCE_MMQ
if (cc < CC_OFFSET_AMD) { if (cc < GGML_CUDA_CC_OFFSET_AMD) {
return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
} }
return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; return (cc < GGML_CUDA_CC_RDNA3 && cc != GGML_CUDA_CC_CDNA && cc != GGML_CUDA_CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
} }

View File

@ -89,9 +89,9 @@ struct tile_x_sizes {
static constexpr int get_mmq_x_max_host(const int cc) { static constexpr int get_mmq_x_max_host(const int cc) {
return int8_mma_available(cc) ? 128 : return int8_mma_available(cc) ? 128 :
#ifdef GGML_CUDA_FORCE_MMQ #ifdef GGML_CUDA_FORCE_MMQ
cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64; cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64;
#else #else
cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64; cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
#endif // GGML_CUDA_FORCE_MMQ #endif // GGML_CUDA_FORCE_MMQ
} }
@ -104,23 +104,23 @@ static constexpr __device__ int get_mmq_x_max_device() {
return 128; return 128;
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if __CUDA_ARCH__ >= CC_VOLTA #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#ifdef GGML_CUDA_FORCE_MMQ #ifdef GGML_CUDA_FORCE_MMQ
return MMQ_DP4A_MAX_BATCH_SIZE; return MMQ_DP4A_MAX_BATCH_SIZE;
#else // GGML_CUDA_FORCE_MMQ #else // GGML_CUDA_FORCE_MMQ
return 128; return 128;
#endif // GGML_CUDA_FORCE_MMQ #endif // GGML_CUDA_FORCE_MMQ
#else // __CUDA_ARCH__ >= CC_VOLTA #else // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
return 64; return 64;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#endif // INT8_MMA_AVAILABLE #endif // INT8_MMA_AVAILABLE
} }
static constexpr int get_mmq_y_host(const int cc) { static constexpr int get_mmq_y_host(const int cc) {
return cc >= CC_OFFSET_AMD ? (cc == CC_RDNA1 ? 64 : 128) : (cc >= CC_VOLTA ? 128 : 64); return cc >= GGML_CUDA_CC_OFFSET_AMD ? (cc == GGML_CUDA_CC_RDNA1 ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
} }
static constexpr __device__ int get_mmq_y_device() { static constexpr __device__ int get_mmq_y_device() {
@ -131,11 +131,11 @@ static constexpr __device__ int get_mmq_y_device() {
return 128; return 128;
#endif // defined RDNA1 #endif // defined RDNA1
#else #else
#if __CUDA_ARCH__ >= CC_VOLTA #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
return 128; return 128;
#else #else
return 64; return 64;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
} }
@ -2574,11 +2574,11 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
__launch_bounds__(WARP_SIZE*nwarps, 2) __launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) #endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
#else #else
#if __CUDA_ARCH__ >= CC_VOLTA #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
__launch_bounds__(WARP_SIZE*nwarps, 1) __launch_bounds__(WARP_SIZE*nwarps, 1)
#else #else
__launch_bounds__(WARP_SIZE*nwarps, 2) __launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
static __global__ void mul_mat_q( static __global__ void mul_mat_q(
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
@ -2594,7 +2594,7 @@ static __global__ void mul_mat_q(
constexpr int mmq_y = get_mmq_y_device(); constexpr int mmq_y = get_mmq_y_device();
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
{ {
constexpr bool fixup = false; constexpr bool fixup = false;
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup> mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
@ -2602,7 +2602,7 @@ static __global__ void mul_mat_q(
blockIdx.x, blockIdx.y, 0, ne00/qk); blockIdx.x, blockIdx.y, 0, ne00/qk);
return; return;
} }
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
const int64_t blocks_per_ne00 = ne00 / qk; const int64_t blocks_per_ne00 = ne00 / qk;
constexpr int blocks_per_iter = MMQ_ITER_K / qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk;
@ -2825,7 +2825,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
const int mmq_x_max = get_mmq_x_max_host(cc); const int mmq_x_max = get_mmq_x_max_host(cc);
const int mmq_y = get_mmq_y_host(cc); const int mmq_y = get_mmq_y_host(cc);
const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y; const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD; const bool use_stream_k = cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;
int mmq_x_best = 0; int mmq_x_best = 0;
int nparts_best = INT_MAX; int nparts_best = INT_MAX;

View File

@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
int64_t nwarps = 1; int64_t nwarps = 1;
int64_t rows_per_cuda_block = 1; int64_t rows_per_cuda_block = 1;
if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_CDNA || ggml_cuda_info().devices[id].cc == GGML_CUDA_CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
switch(ncols_y) { switch(ncols_y) {
case 1: case 1:
nwarps = 4; nwarps = 4;

View File

@ -4,6 +4,11 @@ struct rope_corr_dims {
float v[2]; float v[2];
}; };
struct mrope_sections {
int v[4];
};
static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) { static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low); const float y = (i0 / 2 - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y)); return 1.0f - min(1.0f, max(0.0f, y));
@ -108,6 +113,105 @@ static __global__ void rope_neox(
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
} }
template<typename T, bool has_ff>
static __global__ void rope_multi(
const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
return;
}
const int row = blockDim.x*blockIdx.x + threadIdx.x;
if (i0 >= n_dims) {
const int i = row*ne0 + i0;
dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];
return;
}
const int i = row*ne0 + i0/2;
const int i2 = row/p_delta_rows;
int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
int sec_w = sections.v[1] + sections.v[0];
int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (sector < sections.v[0]) {
theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sections.v[0] && sector < sec_w) {
theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w + sections.v[2]) {
theta_base = pos[i2 + ne2 * 3]*powf(theta_scale, i0/2.0f);
}
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
float cos_theta;
float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[i + 0];
const float x1 = x[i + n_dims/2];
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
}
template<typename T, bool has_ff>
static __global__ void rope_vision(
const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
return;
}
const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int i = row*ne0 + i0/2;
const int i2 = row/p_delta_rows; // i2-th tokens
int sect_dims = sections.v[0] + sections.v[1];
int sec_w = sections.v[1] + sections.v[0];
int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (sector < sections.v[0]) {
const int p = sector;
theta_base = pos[i2]*powf(theta_scale, p);
}
else if (sector >= sections.v[0] && sector < sec_w) {
const int p = sector - sections.v[0];
theta_base = pos[i2 + ne2]*powf(theta_scale, p);
}
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
float cos_theta;
float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[i + 0];
const float x1 = x[i + n_dims];
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + n_dims] = x0*sin_theta + x1*cos_theta;
}
template<typename T> template<typename T>
static void rope_norm_cuda( static void rope_norm_cuda(
const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
@ -156,6 +260,56 @@ static void rope_neox_cuda(
} }
} }
template<typename T>
static void rope_multi_cuda(
const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nr, n_blocks_x, 1);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (freq_factors == nullptr) {
rope_multi<T, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors, sections
);
} else {
rope_multi<T, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors, sections
);
}
}
template<typename T>
static void rope_vision_cuda(
const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nr, n_blocks_x, 1);
// break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq)
// where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (freq_factors == nullptr) {
rope_vision<T, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors, sections
);
} else {
rope_vision<T, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors, sections
);
}
}
static void rope_norm_cuda_f16( static void rope_norm_cuda_f16(
const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
@ -185,6 +339,38 @@ static void rope_neox_cuda_f32(
rope_neox_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); rope_neox_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
} }
static void rope_multi_cuda_f16(
const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {
rope_multi_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}
static void rope_multi_cuda_f32(
const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {
rope_multi_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}
static void rope_vision_cuda_f16(
const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {
rope_vision_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}
static void rope_vision_cuda_f32(
const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {
rope_vision_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
@ -201,8 +387,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
GGML_ASSERT(src0->type == dst->type); GGML_ASSERT(src0->type == dst->type);
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0]; // head dims
const int64_t ne01 = src0->ne[1]; const int64_t ne01 = src0->ne[1]; // num heads
const int64_t ne02 = src0->ne[2]; // num heads
const int64_t nr = ggml_nrows(src0); const int64_t nr = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
@ -210,6 +397,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
//const int n_ctx = ((int32_t *) dst->op_params)[3]; //const int n_ctx = ((int32_t *) dst->op_params)[3];
const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
mrope_sections sections;
// RoPE alteration for extended context // RoPE alteration for extended context
float freq_base; float freq_base;
@ -225,8 +413,19 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
memcpy(&sections.v, (int32_t *) dst->op_params + 11, sizeof(int)*4);
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
if (is_mrope) {
GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
}
if (is_vision) {
GGML_ASSERT(n_dims == ne00/2);
}
const int32_t * pos = (const int32_t *) src1_d; const int32_t * pos = (const int32_t *) src1_d;
@ -253,6 +452,34 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
} else { } else {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
} else if (is_mrope && !is_vision) {
if (src0->type == GGML_TYPE_F32) {
rope_multi_cuda_f32(
(const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, sections, stream
);
} else if (src0->type == GGML_TYPE_F16) {
rope_multi_cuda_f16(
(const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, sections, stream
);
} else {
GGML_ABORT("fatal error");
}
} else if (is_vision) {
if (src0->type == GGML_TYPE_F32) {
rope_vision_cuda_f32(
(const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, sections, stream
);
} else if (src0->type == GGML_TYPE_F16) {
rope_vision_cuda_f16(
(const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, sections, stream
);
} else {
GGML_ABORT("fatal error");
}
} else { } else {
if (src0->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32) {
rope_norm_cuda_f32( rope_norm_cuda_f32(

View File

@ -3,8 +3,6 @@
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
#ifdef USE_CUB #ifdef USE_CUB
// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
// For this reason CUB must be included BEFORE anything else.
#include <cub/cub.cuh> #include <cub/cub.cuh>
using namespace cub; using namespace cub;
#endif // USE_CUB #endif // USE_CUB

View File

@ -74,8 +74,8 @@ static inline int ggml_up(int n, int m) {
// //
GGML_ATTRIBUTE_FORMAT(2, 3) GGML_ATTRIBUTE_FORMAT(2, 3)
void ggml_log_internal (enum ggml_log_level level, const char * format, ...); GGML_API void ggml_log_internal (enum ggml_log_level level, const char * format, ...);
void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data); GGML_API void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
#define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__) #define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
#define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) #define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
@ -304,8 +304,8 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
// Memory allocation // Memory allocation
void * ggml_aligned_malloc(size_t size); GGML_API void * ggml_aligned_malloc(size_t size);
void ggml_aligned_free(void * ptr, size_t size); GGML_API void ggml_aligned_free(void * ptr, size_t size);
// FP16 to FP32 conversion // FP16 to FP32 conversion

View File

@ -1419,8 +1419,18 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
case GGML_OP_NORM: case GGML_OP_NORM:
case GGML_OP_ROPE:
return true; return true;
case GGML_OP_ROPE:
{
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
return true;
}
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_CONT: case GGML_OP_CONT:

View File

@ -1125,8 +1125,18 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
return has_simdgroup_reduction && (op->ne[0] % 4 == 0); return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
case GGML_OP_ARGMAX: case GGML_OP_ARGMAX:
case GGML_OP_NORM: case GGML_OP_NORM:
case GGML_OP_ROPE:
return true; return true;
case GGML_OP_ROPE:
{
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
return true;
}
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16; return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_1D: case GGML_OP_POOL_1D:
@ -3026,7 +3036,9 @@ static void ggml_metal_encode_node(
} break; } break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
{ {
GGML_ASSERT(ne10 == ne02); // make sure we have one or more position id(ne10) per token(ne02)
GGML_ASSERT(ne10 % ne02 == 0);
GGML_ASSERT(ne10 >= ne02);
const int nth = MIN(1024, ne00); const int nth = MIN(1024, ne00);

View File

@ -0,0 +1,147 @@
find_package(OpenCL REQUIRED)
find_package(Python3 REQUIRED)
set(TARGET_NAME ggml-opencl)
ggml_add_backend_library(${TARGET_NAME}
ggml-opencl.cpp
../../include/ggml-opencl.h)
target_link_libraries(${TARGET_NAME} PRIVATE ${OpenCL_LIBRARIES})
target_include_directories(${TARGET_NAME} PRIVATE ${OpenCL_INCLUDE_DIRS})
if (GGML_OPENCL_PROFILING)
message(STATUS "OpenCL profiling enabled (increases CPU overhead)")
add_compile_definitions(GGML_OPENCL_PROFILING)
endif ()
add_compile_definitions(GGML_OPENCL_SOA_Q)
if (GGML_OPENCL_USE_ADRENO_KERNELS)
message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS)
endif ()
if (GGML_OPENCL_EMBED_KERNELS)
add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)
set(OPENCL_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl.cl.h")
set(OPENCL_MM_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mm.cl.h")
set(OPENCL_CVT_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_cvt.cl.h")
set(OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle.cl.h")
set(OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle_general.cl.h")
set(OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h")
set(OPENCL_TRANSPOSE_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_16.cl.h")
set(OPENCL_TRANSPOSE_32_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32.cl.h")
set(OPENCL_TRANSPOSE_32_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32_16.cl.h")
set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
include_directories("${CMAKE_BINARY_DIR}/autogenerated")
# Python must be accessible from command line
add_custom_command(
OUTPUT ${OPENCL_CL_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl.cl
${OPENCL_CL_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_MM_CL_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mm.cl
${OPENCL_MM_CL_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_mm.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_mm.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_CVT_CL_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_cvt.cl
${OPENCL_CVT_CL_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_cvt.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_cvt.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle.cl
${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_gemv_noshuffle.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle_general.cl
${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_gemv_noshuffle_general.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_mul_mat_Ab_Bi_8x4.cl.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_16.cl
${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_transpose_16.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_transpose_16.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32.cl
${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_transpose_32.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_transpose_32.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32_16.cl
${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_transpose_32_16.cl.h"
)
target_sources(${TARGET_NAME} PRIVATE
${OPENCL_CL_SOURCE_EMBED}
${OPENCL_MM_CL_SOURCE_EMBED}
${OPENCL_CVT_CL_SOURCE_EMBED}
${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED})
else ()
# copy ggml-opencl.cl to bin directory
configure_file(kernels/ggml-opencl.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl.cl COPYONLY)
configure_file(kernels/ggml-opencl_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mm.cl COPYONLY)
configure_file(kernels/ggml-opencl_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_cvt.cl COPYONLY)
configure_file(kernels/ggml-opencl_gemv_noshuffle.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle.cl COPYONLY)
configure_file(kernels/ggml-opencl_gemv_noshuffle_general.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle_general.cl COPYONLY)
configure_file(kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mul_mat_Ab_Bi_8x4.cl COPYONLY)
configure_file(kernels/ggml-opencl_transpose_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_16.cl COPYONLY)
configure_file(kernels/ggml-opencl_transpose_32.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32.cl COPYONLY)
configure_file(kernels/ggml-opencl_transpose_32_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32_16.cl COPYONLY)
endif ()

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,26 @@
#
import sys
import logging
logger = logging.getLogger("opencl-embed-kernel")
def main():
logging.basicConfig(level=logging.INFO)
if len(sys.argv) != 3:
logger.info("Usage: python embed_kernel.py <input_file> <output_file>")
sys.exit(1)
ifile = open(sys.argv[1], "r")
ofile = open(sys.argv[2], "w")
for i in ifile:
ofile.write('R"({})"\n'.format(i))
ifile.close()
ofile.close()
if __name__ == "__main__":
main()

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,106 @@
//------------------------------------------------------------------------------
// This file is contains additional kernels for data conversion.
// These kernels are used when loading the model, so its performance is less
// important.
//------------------------------------------------------------------------------
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#elif defined(cl_amd_fp16)
#pragma OPENCL EXTENSION cl_amd_fp16 : enable
#else
#error "Half precision floating point not supportedby OpenCL implementation on your device."
#endif
#ifdef cl_khr_subgroups
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#elif defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#error "Subgroup not supported on your device."
#endif
#ifdef cl_intel_required_subgroup_size
// Always use subgroup size of 32 on Intel.
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
// Always use subgroups size of 64 on Adreno.
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#else
// TODO: do not know how to choose subgroup size on other GPUs.
#error "Selecting subgroup size is not supported on your device."
#endif
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QR4_1 2
#define QK5_0 32
#define QR5_0 2
#define QK5_1 32
#define QR5_1 2
#define QK8_0 32
#define QR8_0 1
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
half d;
uint8_t qs[QK4_0 / 2];
};
//------------------------------------------------------------------------------
// mul_vec_q_n_f32_flat_noshuffle
//
// This variation uses flat arrays (struct of arrays, SOA) representation for
// quant tensors. It also uses non shuffled bit order for weights.
//
// The shuffled version is kept in the original file because moving it here
// seems to result in worse performance for adreno.
//------------------------------------------------------------------------------
kernel void kernel_convert_block_q4_0_noshuffle(
global struct block_q4_0 * src0,
global uchar * dst_q,
global half * dst_d
) {
global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
global half * d = (global half *) dst_d + get_global_id(0);
*d = b->d;
for (int i = 0; i < QK4_0/4; ++i) {
uchar x0 = b->qs[2*i + 0];
uchar x1 = b->qs[2*i + 1];
q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
#ifdef ADRENO_GPU
// Workaround for adreno - must have the following printf statement for
// the kernel to work properly. Otherwise it produces incorrect result.
// convert_uchar above also seems necessary.
// Compare against a large number so that it does not print anything.
// get_sub_group_local_id() also works.
if (get_global_id(0) == 65536*4096) {
printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
}
#endif
}
}

View File

@ -0,0 +1,265 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
// assume
#define QK4_0 32
#define N_SIMDGROUP 4
#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
float shared_y; \
shared_y = sub_group_broadcast(y.s0, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 0); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 0); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 0); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 0); \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 0); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 0); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 0); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 1); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 1); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 1); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 1); \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 1); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 1); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 1); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y.s0, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 2); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 2); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 2); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 2); \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 2); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 2); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 2); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 3); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 3); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 3); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 3); \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 3); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 3); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 3); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
float8 shared_y; \
shared_y = sub_group_broadcast(y, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
__attribute__((qcom_reqd_sub_group_size("full")))
__kernel void kernel_gemv_noshuffle(
__read_only image1d_buffer_t src0_q, // quantized A
global half2 * src0_d, // A scales
__read_only image1d_buffer_t src1, // B
ulong offset1, // offset to B (0)
global float * dst, // C
ulong offsetd, // offset to C (0)
uint K, // K
int ne01, // M
int ne02, // 1
int ne10, // K
int ne12, // 1
int ne0, // M
int ne1, // N
int r2, // 1
int r3)
{
uint groupId = get_local_id(1);
uint gid = get_global_id(0);
ushort slid = get_sub_group_local_id();
__private uint4 regA;
__private half2 regS;
__private float8 regB;
__private float2 totalSum = (float2)(0.0f);
// loop along K in block granularity, skip 4 blocks every iter
for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
// first 4 fibers in each wave load 8 B values to its private scope
if (slid < 4) {
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
}
// load half weights for two blocks in consecutive rows
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
}
// reduction in local memory, assumes #wave=4
__local float2 reduceLM[SIMDGROUP_WIDTH * 3];
if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
// 2 outputs per fiber in wave 0
if (groupId == 0) {
dst = (global float*)((global char*)dst + offsetd);
vstore2(totalSum, 0, &(dst[gid * 2]));
}
}

View File

@ -0,0 +1,271 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
// assume
#define QK4_0 32
#define N_SIMDGROUP 4
#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
float shared_y; \
shared_y = sub_group_broadcast(y.s0, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 0); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 0); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 0); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 0); \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 0); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 0); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 0); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 1); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 1); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 1); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 1); \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 1); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 1); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 1); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y.s0, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 2); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 2); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 2); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 2); \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 2); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 2); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 2); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 3); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 3); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 3); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 3); \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 3); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 3); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 3); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
float8 shared_y; \
shared_y = sub_group_broadcast(y, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
__attribute__((qcom_reqd_sub_group_size("full")))
__kernel void kernel_gemv_noshuffle(
__read_only image1d_buffer_t src0_q, // quantized A
global half2 * src0_d, // A scales
__read_only image1d_buffer_t src1, // B
ulong offset1, // offset to B (0)
global float * dst, // C
ulong offsetd, // offset to C (0)
int ne00, // K
int ne01, // M
int ne02, // 1
int ne10, // K
int ne12, // 1
int ne0, // M
int ne1, // N
int r2, // 1
int r3)
{
uint groupId = get_local_id(1);
uint gid = get_global_id(0);
ushort slid = get_sub_group_local_id();
uint K = ne00;
uint M = ne01;
uint LINE_STRIDE_A = M / 2;
uint BLOCK_STRIDE_A = N_SIMDGROUP * M;
__private uint4 regA;
__private half2 regS;
__private float8 regB;
__private float2 totalSum = (float2)(0.0f);
// loop along K in block granularity, skip 4 blocks every iter
for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
// first 4 fibers in each wave load 8 B values to its private scope
if (slid < 4) {
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
}
// load half weights for two blocks in consecutive rows
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
}
// reduction in local memory, assumes #wave=4
__local float2 reduceLM[SIMDGROUP_WIDTH * 3];
if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
// 2 outputs per fiber in wave 0
if (groupId == 0) {
dst = (global float*)((global char*)dst + offsetd);
vstore2(totalSum, 0, &(dst[gid * 2]));
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,130 @@
// src0_q, src0_d, src1 are transposed as a preprocessing step
// 4-bit weights are transposed in groups of 4 (unsigned short int)
// consider weights originally "next to each other", now "on top of each other"
// each fiber computes a 8x4 tile of output elements
// using unshuffled weights
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
__attribute__((qcom_reqd_sub_group_size("full")))
kernel void kernel_mul_mat_Ab_Bi_8x4(
global const ushort * src0_q, // quantized A
global const half * src0_d, // A scales
__read_only image1d_buffer_t src1, // B (1d image)
global float * dst, // C
int m, // M
int n, // N with padding
int k, // K
int n_no_padding // N without padding
) {
int m_4 = m >> 2;
int n_4 = n >> 2;
int gy = get_global_id(0);
int gx = get_global_id(1);
int gx_2 = gx << 2;
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // 8x4 output elements
half8 B; // registers for activations
half4 dequantized_weights; // registers for dequantized weights
__global const ushort* weight_ptr = src0_q + gx_2; // pointer for weights
__global const half* scale_ptr = src0_d + gx_2; // pointer for scales
for(int i=0; i<k; i+=4){ //loop through K dimension
B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1);
// keep (i/4) and (i/32) in parenthesis, rounds down
// load 4 consecutive groups of 4 weights
ushort4 bits4 = vload4(0, weight_ptr + (i/4)*(m)); // (i/4) because weights grouped in 4s
// load 4 consecutive scales
half4 scale = vload4(0, scale_ptr + (i/32)*(m));// (i/32) because 1 scale per 32 elements
// j=0
dequantized_weights.s0 = ((bits4.s0 & (0x000F)) - 8) * scale.s0; // dequantize a row of the 16 weights
dequantized_weights.s1 = ((bits4.s1 & (0x000F)) - 8) * scale.s1;
dequantized_weights.s2 = ((bits4.s2 & (0x000F)) - 8) * scale.s2;
dequantized_weights.s3 = ((bits4.s3 & (0x000F)) - 8) * scale.s3;
c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=1
B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1);
dequantized_weights.s0 = (((bits4.s0 & (0x00F0)) >> 4) - 8) * scale.s0; // dequantize a row of the 16 weights
dequantized_weights.s1 = (((bits4.s1 & (0x00F0)) >> 4) - 8) * scale.s1;
dequantized_weights.s2 = (((bits4.s2 & (0x00F0)) >> 4) - 8) * scale.s2;
dequantized_weights.s3 = (((bits4.s3 & (0x00F0)) >> 4) - 8) * scale.s3;
c0 += B * dequantized_weights.s0; //vector-scalar multiplication to accumulate
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=2
B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1);
dequantized_weights.s0 = (((bits4.s0 & (0x0F00)) >> 8) - 8) * scale.s0; // dequantize a row of the 16 weights
dequantized_weights.s1 = (((bits4.s1 & (0x0F00)) >> 8) - 8) * scale.s1;
dequantized_weights.s2 = (((bits4.s2 & (0x0F00)) >> 8) - 8) * scale.s2;
dequantized_weights.s3 = (((bits4.s3 & (0x0F00)) >> 8) - 8) * scale.s3;
c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=3
B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1);
dequantized_weights.s0 = (((bits4.s0 & (0xF000)) >> 12) - 8) * scale.s0; // dequantize a row of the 16 weights
dequantized_weights.s1 = (((bits4.s1 & (0xF000)) >> 12) - 8) * scale.s1;
dequantized_weights.s2 = (((bits4.s2 & (0xF000)) >> 12) - 8) * scale.s2;
dequantized_weights.s3 = (((bits4.s3 & (0xF000)) >> 12) - 8) * scale.s3;
c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
}
int idx = (gy<<3)*m + (gx<<2); // vectorized store 16 elements
// conditional check if store is to a valid location. Required when N is not a multiple of 8
// if statements allow registers to be reused for each store
// provides a performance boost due to reduced register footprint, which increases number of concurrent waves
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
}
}

View File

@ -0,0 +1,32 @@
// 16-bit transpose, loading/storing an 8x8 tile of elements
kernel void kernel_transpose_16(
__read_only image1d_buffer_t input,
__write_only image1d_buffer_t output,
const uint rows,
const uint cols
) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_3 = i<<3;
const int j_3 = j<<3;
ushort8 temp0 = as_ushort8(read_imagef(input, (j_3+0)*cols+i));
ushort8 temp1 = as_ushort8(read_imagef(input, (j_3+1)*cols+i));
ushort8 temp2 = as_ushort8(read_imagef(input, (j_3+2)*cols+i));
ushort8 temp3 = as_ushort8(read_imagef(input, (j_3+3)*cols+i));
ushort8 temp4 = as_ushort8(read_imagef(input, (j_3+4)*cols+i));
ushort8 temp5 = as_ushort8(read_imagef(input, (j_3+5)*cols+i));
ushort8 temp6 = as_ushort8(read_imagef(input, (j_3+6)*cols+i));
ushort8 temp7 = as_ushort8(read_imagef(input, (j_3+7)*cols+i));
write_imagef(output, (i_3+0)*rows+j, as_float4((ushort8)(temp0.s0, temp1.s0, temp2.s0, temp3.s0, temp4.s0, temp5.s0, temp6.s0, temp7.s0)));
write_imagef(output, (i_3+1)*rows+j, as_float4((ushort8)(temp0.s1, temp1.s1, temp2.s1, temp3.s1, temp4.s1, temp5.s1, temp6.s1, temp7.s1)));
write_imagef(output, (i_3+2)*rows+j, as_float4((ushort8)(temp0.s2, temp1.s2, temp2.s2, temp3.s2, temp4.s2, temp5.s2, temp6.s2, temp7.s2)));
write_imagef(output, (i_3+3)*rows+j, as_float4((ushort8)(temp0.s3, temp1.s3, temp2.s3, temp3.s3, temp4.s3, temp5.s3, temp6.s3, temp7.s3)));
write_imagef(output, (i_3+4)*rows+j, as_float4((ushort8)(temp0.s4, temp1.s4, temp2.s4, temp3.s4, temp4.s4, temp5.s4, temp6.s4, temp7.s4)));
write_imagef(output, (i_3+5)*rows+j, as_float4((ushort8)(temp0.s5, temp1.s5, temp2.s5, temp3.s5, temp4.s5, temp5.s5, temp6.s5, temp7.s5)));
write_imagef(output, (i_3+6)*rows+j, as_float4((ushort8)(temp0.s6, temp1.s6, temp2.s6, temp3.s6, temp4.s6, temp5.s6, temp6.s6, temp7.s6)));
write_imagef(output, (i_3+7)*rows+j, as_float4((ushort8)(temp0.s7, temp1.s7, temp2.s7, temp3.s7, temp4.s7, temp5.s7, temp6.s7, temp7.s7)));
}

View File

@ -0,0 +1,25 @@
// 32-bit transpose, loading/storing a 4x4 tile of elements
kernel void kernel_transpose_32(
__read_only image1d_buffer_t input,
__write_only image1d_buffer_t output,
const uint rows,
const uint cols
) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_2 = i<<2;
const int j_2 = j<<2;
float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
float4 temp3 = read_imagef(input, (j_2+3)*cols+i);
write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}

View File

@ -0,0 +1,35 @@
// 32-bit transpose, loading/storing a 4x4 tile of elements
// Only used for activations
// converts to FP16
// also adds zero padding for non multiple of 8 prompt lengths
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_2 = i<<2;
const int j_2 = j<<2;
half4 temp0 = {0,0,0,0}; // initialize outputs to 0
half4 temp1 = {0,0,0,0};
half4 temp2 = {0,0,0,0};
half4 temp3 = {0,0,0,0};
if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0
temp0 = read_imageh(input, (j_2+0)*cols+i);
}
if((j_2+1)*cols+i*4+3 < rows*cols*16){
temp1 = read_imageh(input, (j_2+1)*cols+i);
}
if((j_2+2)*cols+i*4+3 < rows*cols*16){
temp2 = read_imageh(input, (j_2+2)*cols+i);
}
if((j_2+3)*cols+i*4+3 < rows*cols*16){
temp3 = read_imageh(input, (j_2+3)*cols+i);
}
write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}

View File

@ -11,6 +11,7 @@
// //
#include "common.hpp" #include "common.hpp"
#include "ggml-impl.h"
int get_current_device_id() { int get_current_device_id() {
return dpct::dev_mgr::instance().current_device_id(); return dpct::dev_mgr::instance().current_device_id();
@ -28,11 +29,7 @@ void* ggml_sycl_host_malloc(size_t size) try {
if (err != 0) { if (err != 0) {
// clear the error // clear the error
fprintf( GGML_LOG_ERROR("WARNING: failed to allocate %.2f MB of pinned memory: %s\n", size / 1024.0 / 1024.0, "syclGetErrorString is not supported");
stderr,
"WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
size / 1024.0 / 1024.0,
"syclGetErrorString is not supported");
return nullptr; return nullptr;
} }
@ -66,18 +63,12 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst, const ggml_tensor *src1, ggml_tensor *dst,
const ggml_sycl_op_flatten_t op) try { const ggml_sycl_op_flatten_t op) try {
const int64_t nrows0 = ggml_nrows(src0);
const bool use_src1 = src1 != nullptr; const bool use_src1 = src1 != nullptr;
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
// dd = data device // dd = data device
float * src0_ddf = (float *) src0->data; float * src0_ddf = (float *) src0->data;
float * src1_ddf = use_src1 ? (float *) src1->data : nullptr; float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;

View File

@ -626,6 +626,7 @@ struct bin_bcast_sycl {
}); });
} }
} }
GGML_UNUSED(ctx);
} }
}; };

View File

@ -47,7 +47,7 @@ static void concat_f32_dim1(const float *x, const float *y, float *dst,
// operation // operation
int offset_dst = nidx + item_ct1.get_group(1) * ne0 + int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
if (item_ct1.get_group(1) < ne01) { // src0 if (item_ct1.get_group(1) < (size_t) ne01) { // src0
int offset_src = int offset_src =
nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01; nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01;
dst[offset_dst] = x[offset_src]; dst[offset_dst] = x[offset_src];
@ -70,7 +70,7 @@ static void concat_f32_dim2(const float *x, const float *y, float *dst,
// operation // operation
int offset_dst = nidx + item_ct1.get_group(1) * ne0 + int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
if (item_ct1.get_group(0) < ne02) { // src0 if (item_ct1.get_group(0) < (size_t) ne02) { // src0
int offset_src = nidx + item_ct1.get_group(1) * ne0 + int offset_src = nidx + item_ct1.get_group(1) * ne0 +
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
dst[offset_dst] = x[offset_src]; dst[offset_dst] = x[offset_src];

View File

@ -424,7 +424,7 @@ static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y,
const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2); const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
// make each work-item deal with more elements since sycl global range can not exceed max int // make each work-item deal with more elements since sycl global range can not exceed max int
const src_t * x = (src_t *) vx; const src_t * x = (const src_t *) vx;
for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) { for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
y[i] = x[i]; y[i] = x[i];
} }

View File

@ -1015,9 +1015,9 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
break; break;
} }
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_ddq_i; GGML_UNUSED(src1_ddq_i);
(void) src1_ncols; GGML_UNUSED(src1_ncols);
(void) src1_padded_row_size; GGML_UNUSED(src1_padded_row_size);
} }

View File

@ -1237,7 +1237,7 @@ namespace dpct
std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr) std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
{ {
auto it = m_map.upper_bound((byte_t *)ptr); auto it = m_map.upper_bound(const_cast<byte_t *>(reinterpret_cast<const byte_t *>(ptr)));
if (it == m_map.end()) if (it == m_map.end())
{ {
// Not a virtual pointer. // Not a virtual pointer.

View File

@ -237,7 +237,7 @@ void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
int i02 = i12 / sf2; int i02 = i12 / sf2;
int i03 = i13 / sf3; int i03 = i13 / sf3;
dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00); dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
} }
void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02, void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
@ -251,8 +251,7 @@ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const i
// operation // operation
int offset_dst = nidx + item_ct1.get_group(1) * ne0 + int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
if (nidx < ne00 && item_ct1.get_group(1) < ne01 && if (nidx < ne00 && item_ct1.get_group(1) < (size_t) ne01 && item_ct1.get_group(0) < (size_t) ne02) {
item_ct1.get_group(0) < ne02) {
int offset_src = nidx + item_ct1.get_group(1) * ne00 + int offset_src = nidx + item_ct1.get_group(1) * ne00 +
item_ct1.get_group(0) * ne00 * ne01; item_ct1.get_group(0) * ne00 * ne01;
dst[offset_dst] = x[offset_src]; dst[offset_dst] = x[offset_src];
@ -520,9 +519,10 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor
silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@ -535,9 +535,10 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor
gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst, const ggml_tensor *src1, ggml_tensor *dst,
@ -550,9 +551,10 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_
gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@ -564,9 +566,10 @@ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@ -579,9 +582,10 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor
relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -595,9 +599,10 @@ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml
hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -610,9 +615,10 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_t
hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -625,9 +631,10 @@ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor
exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -640,9 +647,10 @@ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor
log_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); log_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -655,9 +663,10 @@ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_ten
sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -670,9 +679,10 @@ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor
sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -685,9 +695,10 @@ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor
sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -700,9 +711,10 @@ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor
cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -715,9 +727,10 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor
step_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); step_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -730,9 +743,10 @@ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor
neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -749,9 +763,10 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_
leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream); leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@ -764,9 +779,10 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -787,9 +803,10 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_ten
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
main_stream); main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@ -805,9 +822,10 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[0], src0->ne[1], src0->ne[2],
dst->ne[0], dst->ne[1], dst->ne[2], main_stream); dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@ -827,7 +845,8 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor
acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream); acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
(void) dst; GGML_UNUSED(dst);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,

View File

@ -51,8 +51,8 @@ public:
const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab); const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab); const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab); const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
auto a_mem = dnnl::memory(a_in_md, eng, (void*)a); auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
auto b_mem = dnnl::memory(b_in_md, eng, (void*)b); auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md); auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c); auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
@ -79,8 +79,8 @@ public:
const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab); const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab); const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab); const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
auto a_mem = dnnl::memory(a_in_md, eng, (void*)a); auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
auto b_mem = dnnl::memory(b_in_md, eng, (void*)b); auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md); auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c); auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);

View File

@ -47,7 +47,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
info.device_count = dpct::dev_mgr::instance().device_count(); info.device_count = dpct::dev_mgr::instance().device_count();
if (info.device_count == 0) { if (info.device_count == 0) {
GGML_LOG_ERROR("%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__); GGML_LOG_ERROR("%s: failed to initialize: %s\n", GGML_SYCL_NAME, __func__);
return info; return info;
} }
@ -64,7 +64,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
#else #else
GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__); GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
#endif #endif
GGML_LOG_INFO("%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count); GGML_LOG_INFO("%s: found %d %s devices:\n", __func__, info.device_count, GGML_SYCL_NAME);
for (int i = 0; i < info.device_count; ++i) { for (int i = 0; i < info.device_count; ++i) {
info.devices[i].vmm = 0; info.devices[i].vmm = 0;
@ -137,7 +137,6 @@ void ggml_backend_sycl_print_sycl_devices() {
for (int id = 0; id < device_count; ++id) { for (int id = 0; id < device_count; ++id) {
sycl::device device = dpct::dev_mgr::instance().get_device(id); sycl::device device = dpct::dev_mgr::instance().get_device(id);
sycl::backend backend = device.get_backend();
std::string backend_type = get_device_backend_and_type(device); std::string backend_type = get_device_backend_and_type(device);
int type_id = DeviceNums[backend_type]++; int type_id = DeviceNums[backend_type]++;
std::stringstream device_type; std::stringstream device_type;
@ -420,14 +419,12 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
return true; return true;
} }
return false; return false;
} GGML_UNUSED(buffer);
catch (sycl::exception const &exc) { } catch (const sycl::exception & exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
<< ", line:" << __LINE__ << std::endl;
std::exit(1); std::exit(1);
} }
static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer, static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
uint8_t value) try { uint8_t value) try {
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
@ -1092,10 +1089,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {}; ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {};
size_t pool_size = 0; size_t pool_size = 0;
explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}
qptr(qptr_),
device(device_) {
}
~ggml_sycl_pool_leg() { ~ggml_sycl_pool_leg() {
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
@ -1238,7 +1232,7 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
zeros[i] = 0.f; zeros[i] = 0.f;
qzeros[i] = 0; qzeros[i] = 0;
} }
const TC xi = ix < kx ? *(TC *)&x[iy * kx + ix] : zeros; const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros;
float sum = xi[0]; float sum = xi[0];
float amax = sycl::fabs(xi[0]); float amax = sycl::fabs(xi[0]);
#pragma unroll #pragma unroll
@ -1799,6 +1793,9 @@ static void pool2d_nchw_kernel(
switch (op) { switch (op) {
case GGML_OP_POOL_AVG: res = 0; break; case GGML_OP_POOL_AVG: res = 0; break;
case GGML_OP_POOL_MAX: res = -FLT_MAX; break; case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
default:
res = (To) sycl::nan(uint32_t(0));
break;
} }
for (int i = bh; i < eh; i += 1) { for (int i = bh; i < eh; i += 1) {
@ -1817,6 +1814,9 @@ static void pool2d_nchw_kernel(
switch (op) { switch (op) {
case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break; case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break;
case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break; case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break;
default:
res = (To) sycl::nan(uint32_t(0));
break;
} }
} }
} }
@ -1855,7 +1855,8 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
}); });
(void) dst; GGML_UNUSED(dst);
GGML_UNUSED(ctx);
} }
template <typename src0_t> template <typename src0_t>
@ -1893,10 +1894,10 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
}); });
} }
(void) dst; GGML_UNUSED(dst);
GGML_UNUSED(ctx);
} }
static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx, static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
const int ky, const int kx_padded, const int ky, const int kx_padded,
queue_ptr stream) { queue_ptr stream) {
@ -2464,8 +2465,8 @@ static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tens
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream); ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
(void) src1; GGML_UNUSED(src1);
(void) src1_d; GGML_UNUSED(src1_d);
} }
@ -2484,17 +2485,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t ne10 = src1->ne[0]; const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
const int64_t row_diff = row_high - row_low; const int64_t row_diff = row_high - row_low;
int id; int id;
SYCL_CHECK( SYCL_CHECK(
CHECK_TRY_ERROR(id = get_current_device_id())); CHECK_TRY_ERROR(id = get_current_device_id()));
#if !GGML_SYCL_DNNL
const int64_t ne0 = dst->ne[0];
// the main device has a larger memory buffer to hold the results from all GPUs // the main device has a larger memory buffer to hold the results from all GPUs
// ldc == nrows of the matrix that cuBLAS writes into // ldc == nrows of the matrix that cuBLAS writes into
int ldc = id == ctx.device ? ne0 : row_diff; int ldc = id == ctx.device ? ne0 : row_diff;
#endif
#ifdef GGML_SYCL_F16 #ifdef GGML_SYCL_F16
bool use_fp16 = true; // TODO(Yu) SYCL capability check bool use_fp16 = true; // TODO(Yu) SYCL capability check
@ -2531,9 +2533,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
: src1_as_f16.get(); : src1_as_f16.get();
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols); ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
#if !GGML_SYCL_DNNL
const sycl::half alpha_f16 = 1.0f; const sycl::half alpha_f16 = 1.0f;
const sycl::half beta_f16 = 0.0f; const sycl::half beta_f16 = 0.0f;
#if !GGML_SYCL_DNNL
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm( SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
*stream, oneapi::mkl::transpose::trans, *stream, oneapi::mkl::transpose::trans,
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
@ -2570,9 +2572,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get(); const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get(); const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
#if !GGML_SYCL_DNNL
const float alpha = 1.0f; const float alpha = 1.0f;
const float beta = 0.0f; const float beta = 0.0f;
#if !GGML_SYCL_DNNL
# ifdef GGML_SYCL_NVIDIA # ifdef GGML_SYCL_NVIDIA
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans, oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans,
@ -2590,9 +2592,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>()); src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
#endif #endif
} }
(void) dst; GGML_UNUSED(dst);
(void) src1_ddq_i; GGML_UNUSED(src1_ddq_i);
(void) src1_padded_row_size; GGML_UNUSED(src1_padded_row_size);
} }
catch (sycl::exception const &exc) { catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@ -2638,8 +2640,9 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens
item_ct1); item_ct1);
}); });
(void) src1; GGML_UNUSED(src1);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -2654,9 +2657,10 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor
sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream); sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -2673,9 +2677,10 @@ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_te
sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -2694,9 +2699,10 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_ten
argsort_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream); argsort_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -2713,9 +2719,10 @@ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tens
argmax_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, main_stream); argmax_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@ -2735,9 +2742,10 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const gg
diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@ -2758,9 +2766,10 @@ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tenso
*/ */
SYCL_CHECK(0); SYCL_CHECK(0);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@ -2783,9 +2792,10 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tenso
*/ */
SYCL_CHECK(0); SYCL_CHECK(0);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }
static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) { static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
@ -2862,7 +2872,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src0_is_contiguous = ggml_is_contiguous(src0);
const bool src1_is_contiguous = ggml_is_contiguous(src1); const bool src1_is_contiguous = ggml_is_contiguous(src1);
@ -3289,7 +3298,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS
const int64_t ne_dst = ggml_nelements(dst);
SYCL_CHECK(ggml_sycl_set_device(ctx.device)); SYCL_CHECK(ggml_sycl_set_device(ctx.device));
queue_ptr main_stream = ctx.stream();; queue_ptr main_stream = ctx.stream();;
@ -3397,6 +3405,7 @@ catch (sycl::exception const &exc) {
inline bool ggml_sycl_supports_mmq(enum ggml_type type) { inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
// TODO: accuracy issues in MMQ // TODO: accuracy issues in MMQ
GGML_UNUSED(type);
return false; return false;
} }
@ -3772,7 +3781,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
(void) dst; GGML_UNUSED(dst);
} }
catch (sycl::exception const &exc) { catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@ -3783,7 +3792,7 @@ catch (sycl::exception const &exc) {
static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
// TODO: why do we pass dst as src1 here? // TODO: why do we pass dst as src1 here?
ggml_sycl_cpy(ctx, src0, dst, nullptr); ggml_sycl_cpy(ctx, src0, dst, nullptr);
(void) src1; GGML_UNUSED(src1);
} }
static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -3828,13 +3837,16 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor
} }
static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
(void) src0; GGML_UNUSED(src0);
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
GGML_UNUSED(ctx);
} }
void ggml_sycl_set_main_device(const int main_device) try { void ggml_sycl_set_main_device(const int main_device) try {
if (dpct::get_current_device_id() == main_device) return; if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
return;
}
check_allow_gpu_index(main_device); check_allow_gpu_index(main_device);
dpct::select_device(main_device); dpct::select_device(main_device);
@ -4202,6 +4214,7 @@ try
{ {
ggml_backend_sycl_context *sycl_ctx = ggml_backend_sycl_context *sycl_ctx =
(ggml_backend_sycl_context *)backend->context; (ggml_backend_sycl_context *)backend->context;
sycl::event *sycl_event = static_cast<sycl::event *>(event->context); sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0); const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0);
@ -4216,7 +4229,7 @@ catch (sycl::exception const &exc)
} }
static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try { static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
ggml_backend_sycl_context* sycl_ctx = static_cast<ggml_backend_sycl_context*>(backend->context);
sycl::event* sycl_event = static_cast<sycl::event*>(event->context); sycl::event* sycl_event = static_cast<sycl::event*>(event->context);
if (ggml_backend_is_sycl(backend)) { if (ggml_backend_is_sycl(backend)) {
@ -4475,7 +4488,16 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
return true; return true;
case GGML_OP_ROPE: case GGML_OP_ROPE:
{
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
return ggml_is_contiguous(op->src[0]); return ggml_is_contiguous(op->src[0]);
}
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
// TODO: add support for the new F32 operations // TODO: add support for the new F32 operations
return op->src[0]->type == GGML_TYPE_F16; return op->src[0]->type == GGML_TYPE_F16;
@ -4624,6 +4646,7 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons
// SYCL doesn't support registering host memory, left here for reference // SYCL doesn't support registering host memory, left here for reference
// "ggml_backend_register_host_buffer" // "ggml_backend_register_host_buffer"
// "ggml_backend_unregister_host_buffer" // "ggml_backend_unregister_host_buffer"
GGML_UNUSED(name);
return nullptr; return nullptr;
} }

View File

@ -120,6 +120,7 @@ void ggml_sycl_op_im2col(
im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
} }
(void) src0; GGML_UNUSED(src0);
(void) src0_dd; GGML_UNUSED(src0_dd);
GGML_UNUSED(ctx);
} }

View File

@ -813,7 +813,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
} }
const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 constexpr int blocks_per_tile_x_row = QI4_K > WARP_SIZE ? 1 : WARP_SIZE / QI4_K; // == 1 if QK_K == 256
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
#pragma unroll #pragma unroll
@ -961,7 +961,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
} }
const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 constexpr int blocks_per_tile_x_row = QI5_K > WARP_SIZE ? 1 : WARP_SIZE / QI5_K; // == 1 if QK_K == 256
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
#pragma unroll #pragma unroll
@ -1109,7 +1109,7 @@ load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql,
dpct::sub_sat()); dpct::sub_sat());
} }
const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 constexpr int blocks_per_tile_x_row = QI6_K > WARP_SIZE ? 1 : WARP_SIZE / QI6_K; // == 1 if QK_K == 256
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
float * x_dmf = (float *) x_dm; float * x_dmf = (float *) x_dm;
@ -3020,9 +3020,9 @@ void ggml_sycl_op_mul_mat_q(
break; break;
} }
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_ddf_i; GGML_UNUSED(src1_ddf_i);
} }
catch (sycl::exception const &exc) { catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ std::cerr << exc.what() << "Exception caught at file:" << __FILE__

View File

@ -753,11 +753,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{ {
stream->submit([&](sycl::handler & cgh) {
stream->submit([&](sycl::handler &cgh) {
auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
auto ksigns64_ptr_ct1 = &ksigns64[0];
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
@ -780,9 +776,6 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
auto ksigns64_ptr_ct1 = &ksigns64[0];
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
@ -805,9 +798,6 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
auto ksigns64_ptr_ct1 = &ksigns64[0];
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
@ -830,8 +820,6 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
@ -854,9 +842,6 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
{ {
stream->submit([&](sycl::handler &cgh) { stream->submit([&](sycl::handler &cgh) {
auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
auto ksigns64_ptr_ct1 = &ksigns64[0];
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
@ -954,7 +939,7 @@ void ggml_sycl_op_mul_mat_vec_q(
const size_t q8_1_bs = QK8_1; const size_t q8_1_bs = QK8_1;
// the main device has a larger memory buffer to hold the results from all GPUs // the main device has a larger memory buffer to hold the results from all GPUs
// nrows_dst == nrows of the matrix that the kernel writes into // nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne00 : row_diff;
for (int i = 0; i < src1_ncols; i++) for (int i = 0; i < src1_ncols; i++)
{ {
const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs; const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
@ -1023,7 +1008,8 @@ void ggml_sycl_op_mul_mat_vec_q(
break; break;
} }
} }
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_ddf_i; GGML_UNUSED(src1_ddf_i);
GGML_UNUSED(ctx);
} }

View File

@ -31,7 +31,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
*/ */
item_ct1.barrier(sycl::access::fence_space::local_space); item_ct1.barrier(sycl::access::fence_space::local_space);
mean_var = 0.f; mean_var = 0.f;
int nreduce = nwarps / WARP_SIZE; size_t nreduce = nwarps / WARP_SIZE;
for (size_t i = 0; i < nreduce; i += 1) for (size_t i = 0; i < nreduce; i += 1)
{ {
mean_var += s_sum[lane_id + i * WARP_SIZE]; mean_var += s_sum[lane_id + i * WARP_SIZE];
@ -55,7 +55,7 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
const int nthreads = item_ct1.get_local_range(2); const int nthreads = item_ct1.get_local_range(2);
const int nwarps = nthreads / WARP_SIZE; const int nwarps = nthreads / WARP_SIZE;
start += item_ct1.get_local_id(2); start += item_ct1.get_local_id(2);
int nreduce = nwarps / WARP_SIZE; size_t nreduce = nwarps / WARP_SIZE;
if (end >= ne_elements) { if (end >= ne_elements) {
end = ne_elements; end = ne_elements;
@ -163,7 +163,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
converged control flow. You may need to adjust the code. converged control flow. You may need to adjust the code.
*/ */
item_ct1.barrier(sycl::access::fence_space::local_space); item_ct1.barrier(sycl::access::fence_space::local_space);
int nreduce = nwarps / WARP_SIZE; size_t nreduce = nwarps / WARP_SIZE;
tmp = 0.f; tmp = 0.f;
for (size_t i = 0; i < nreduce; i += 1) for (size_t i = 0; i < nreduce; i += 1)
{ {
@ -352,6 +352,7 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor*
(void)src1; (void)src1;
(void)dst; (void)dst;
(void)src1_dd; (void)src1_dd;
GGML_UNUSED(ctx);
} }
void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,

View File

@ -269,7 +269,8 @@ void ggml_sycl_op_rope(
} }
} }
(void) src1; GGML_UNUSED(src1);
(void) dst; GGML_UNUSED(dst);
(void) src1_dd; GGML_UNUSED(src1_dd);
GGML_UNUSED(ctx);
} }

View File

@ -16,7 +16,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
const int nthreads = block_size; const int nthreads = block_size;
const int nwarps = nthreads / WARP_SIZE; const int nwarps = nthreads / WARP_SIZE;
int nreduce = nwarps / WARP_SIZE; size_t nreduce = nwarps / WARP_SIZE;
float slope = 1.0f; float slope = 1.0f;
// ALiBi // ALiBi
@ -53,9 +53,10 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
if (block_size > WARP_SIZE) { if (block_size > WARP_SIZE) {
if (warp_id == 0) { if (warp_id == 0) {
buf[lane_id] = -INFINITY; buf[lane_id] = -INFINITY;
for (size_t i = 1; i < nreduce; i += 1) for (size_t i = 1; i < nreduce; i += 1) {
buf[lane_id + i * WARP_SIZE] = -INFINITY; buf[lane_id + i * WARP_SIZE] = -INFINITY;
} }
}
item_ct1.barrier(sycl::access::fence_space::local_space); item_ct1.barrier(sycl::access::fence_space::local_space);
if (lane_id == 0) { if (lane_id == 0) {
@ -63,8 +64,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
} }
item_ct1.barrier(sycl::access::fence_space::local_space); item_ct1.barrier(sycl::access::fence_space::local_space);
max_val = buf[lane_id]; max_val = buf[lane_id];
for (size_t i = 1; i < nreduce; i += 1) for (size_t i = 1; i < nreduce; i += 1) {
{
max_val = std::max(max_val, buf[lane_id + i * WARP_SIZE]); max_val = std::max(max_val, buf[lane_id + i * WARP_SIZE]);
} }
max_val = warp_reduce_max(max_val, item_ct1); max_val = warp_reduce_max(max_val, item_ct1);
@ -89,9 +89,10 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
item_ct1.barrier(sycl::access::fence_space::local_space); item_ct1.barrier(sycl::access::fence_space::local_space);
if (warp_id == 0) { if (warp_id == 0) {
buf[lane_id] = 0.f; buf[lane_id] = 0.f;
for (size_t i = 1; i < nreduce; i += 1) for (size_t i = 1; i < nreduce; i += 1) {
buf[lane_id + i * WARP_SIZE] = 0.f; buf[lane_id + i * WARP_SIZE] = 0.f;
} }
}
item_ct1.barrier(sycl::access::fence_space::local_space); item_ct1.barrier(sycl::access::fence_space::local_space);
if (lane_id == 0) { if (lane_id == 0) {
@ -100,8 +101,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
item_ct1.barrier(sycl::access::fence_space::local_space); item_ct1.barrier(sycl::access::fence_space::local_space);
tmp = buf[lane_id]; tmp = buf[lane_id];
for (size_t i = 1; i < nreduce; i += 1) for (size_t i = 1; i < nreduce; i += 1) {
{
tmp += buf[lane_id + i * WARP_SIZE]; tmp += buf[lane_id + i * WARP_SIZE];
} }
tmp = warp_reduce_sum(tmp, item_ct1); tmp = warp_reduce_sum(tmp, item_ct1);

View File

@ -68,4 +68,5 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml
const int max_period = dst->op_params[1]; const int max_period = dst->op_params[1];
timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream); timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
GGML_UNUSED(src1);
} }

View File

@ -59,7 +59,7 @@ static void rwkv_wkv_f32_kernel(
float y = 0; float y = 0;
// Process in chunks of 4 for better vectorization // Process in chunks of 4 for better vectorization
sycl::float4 k4, r4, tf4, td4, s4, kv4; sycl::float4 k4, r4, tf4, td4, s4;
#pragma unroll #pragma unroll
for (int j = 0; j < head_size; j += 4) { for (int j = 0; j < head_size; j += 4) {
// Load data in vec4 chunks // Load data in vec4 chunks
@ -135,4 +135,7 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* s
); );
}); });
}); });
GGML_UNUSED(src0);
GGML_UNUSED(src1);
} }

View File

@ -1,11 +1,13 @@
#pragma once #pragma once
#include "ggml.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
void ggml_critical_section_start(void); GGML_API void ggml_critical_section_start(void);
void ggml_critical_section_end(void); GGML_API void ggml_critical_section_end(void);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -81,7 +81,7 @@ if (Vulkan_FOUND)
--target-cpp ${_ggml_vk_source} --target-cpp ${_ggml_vk_source}
--no-clean --no-clean
DEPENDS ${_ggml_vk_shader_deps} DEPENDS ${_ggml_vk_shader_deps} ${_ggml_vk_genshaders_cmd}
COMMENT "Generate vulkan shaders" COMMENT "Generate vulkan shaders"
) )

View File

@ -44,12 +44,6 @@
#define MAX_VK_BUFFERS 256 #define MAX_VK_BUFFERS 256
#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 1
#else
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif
#define VK_CHECK(err, msg) \ #define VK_CHECK(err, msg) \
do { \ do { \
vk::Result err_ = (err); \ vk::Result err_ = (err); \
@ -168,7 +162,12 @@ struct vk_device_struct {
uint32_t subgroup_size; uint32_t subgroup_size;
uint32_t shader_core_count; uint32_t shader_core_count;
bool uma; bool uma;
bool coopmat2; bool float_controls_rte_fp16;
bool subgroup_size_control;
uint32_t subgroup_min_size;
uint32_t subgroup_max_size;
bool subgroup_require_full_support;
bool coopmat_support; bool coopmat_support;
bool coopmat_acc_f32_support; bool coopmat_acc_f32_support;
@ -176,6 +175,7 @@ struct vk_device_struct {
uint32_t coopmat_m; uint32_t coopmat_m;
uint32_t coopmat_n; uint32_t coopmat_n;
uint32_t coopmat_k; uint32_t coopmat_k;
bool coopmat2;
size_t idx; size_t idx;
@ -754,8 +754,12 @@ static uint32_t compile_count = 0;
static std::mutex compile_count_mutex; static std::mutex compile_count_mutex;
static std::condition_variable compile_count_cond; static std::condition_variable compile_count_cond;
static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align, bool disable_robustness) { static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint,
VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")"); uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants,
uint32_t align, bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size <<
", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align <<
", " << disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
GGML_ASSERT(parameter_count > 0); GGML_ASSERT(parameter_count > 0);
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
@ -814,14 +818,28 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
specialization_constants.data() specialization_constants.data()
); );
vk::PipelineShaderStageCreateFlags pipeline_shader_stage_create_flags{};
if (device->subgroup_require_full_support && require_full_subgroups) {
pipeline_shader_stage_create_flags |= vk::PipelineShaderStageCreateFlagBits::eRequireFullSubgroupsEXT;
}
vk::PipelineShaderStageCreateInfo pipeline_shader_create_info( vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
vk::PipelineShaderStageCreateFlags(), pipeline_shader_stage_create_flags,
vk::ShaderStageFlagBits::eCompute, vk::ShaderStageFlagBits::eCompute,
pipeline->shader_module, pipeline->shader_module,
entrypoint.c_str(), entrypoint.c_str(),
&specialization_info); &specialization_info);
vk::PipelineShaderStageRequiredSubgroupSizeCreateInfoEXT pipeline_shader_stage_required_subgroup_size_create_info;
pipeline_shader_stage_required_subgroup_size_create_info.requiredSubgroupSize = required_subgroup_size;
if (device->subgroup_size_control && required_subgroup_size > 0) {
GGML_ASSERT(device->subgroup_min_size <= required_subgroup_size && required_subgroup_size <= device->subgroup_max_size);
pipeline_shader_create_info.setPNext(&pipeline_shader_stage_required_subgroup_size_create_info);
}
vk::ComputePipelineCreateInfo compute_pipeline_create_info( vk::ComputePipelineCreateInfo compute_pipeline_create_info(
vk::PipelineCreateFlags(), vk::PipelineCreateFlags{},
pipeline_shader_create_info, pipeline_shader_create_info,
pipeline->layout); pipeline->layout);
@ -1501,7 +1519,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>(); device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
std::vector<std::future<void>> compiles; std::vector<std::future<void>> compiles;
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants, uint32_t align, bool disable_robustness = false) { auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
{ {
// wait until fewer than N compiles are in progress // wait until fewer than N compiles are in progress
uint32_t N = std::max(1u, std::thread::hardware_concurrency()); uint32_t N = std::max(1u, std::thread::hardware_concurrency());
@ -1511,7 +1531,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
} }
compile_count++; compile_count++;
} }
compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness)); compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint,
parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness, require_full_subgroups, required_subgroup_size));
}; };
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
@ -1617,28 +1638,33 @@ static void ggml_vk_load_shaders(vk_device& device) {
// Create 6 variants, {s,m,l}x{unaligned,aligned} // Create 6 variants, {s,m,l}x{unaligned,aligned}
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ #define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
if (device->mul_mat ## ID ## _l) \ if (device->mul_mat ## ID ## _l) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \
if (device->mul_mat ## ID ## _m) \ if (device->mul_mat ## ID ## _m) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \
if (device->mul_mat ## ID ## _s) \ if (device->mul_mat ## ID ## _s) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \
if (device->mul_mat ## ID ## _l) \ if (device->mul_mat ## ID ## _l) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \
if (device->mul_mat ## ID ## _m) \ if (device->mul_mat ## ID ## _m) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \
if (device->mul_mat ## ID ## _s) \ if (device->mul_mat ## ID ## _s) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \
// Create 2 variants, {f16,f32} accumulator // Create 2 variants, {f16,f32} accumulator
#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ #define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
if (device->coopmat_acc_f16_support) { \
CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
} \
if (device->coopmat_acc_f32_support) { \
CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
} \
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
if (device->coopmat_acc_f16_support) {
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
@ -1651,6 +1677,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
} else {
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
}
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines. // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) { if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
@ -1658,6 +1698,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
if (device->coopmat_acc_f16_support) {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
@ -1670,7 +1711,22 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
} else {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
} }
}
#undef CREATE_MM2
#undef CREATE_MM #undef CREATE_MM
} else if (device->fp16) { } else if (device->fp16) {
// Create 6 variants, {s,m,l}x{unaligned,aligned} // Create 6 variants, {s,m,l}x{unaligned,aligned}
@ -1688,6 +1744,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
if (device->mul_mat ## ID ## _s) \ if (device->mul_mat ## ID ## _s) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \
// Create 2 variants, {f16,f32} accumulator
#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
@ -1725,6 +1786,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
} }
#undef CREATE_MM2
#undef CREATE_MM #undef CREATE_MM
} else { } else {
// Create 6 variants, {s,m,l}x{unaligned,aligned} // Create 6 variants, {s,m,l}x{unaligned,aligned}
@ -1779,53 +1841,58 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
} }
#undef CREATE_MM2
#undef CREATE_MM #undef CREATE_MM
} }
// mul mat vec // mul mat vec
// computing two rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
// AMD GCN and Intel graphics cards perform best when the number of rows per shader is doubled
uint32_t rm = 1;
if ((device->vendor_id == VK_VENDOR_ID_AMD && device->subgroup_min_size == 64 && device->subgroup_max_size == 64) || device->vendor_id == VK_VENDOR_ID_INTEL)
rm = 2;
// computing additional rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
// dequant shaders // dequant shaders
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@ -1922,17 +1989,26 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
if (device->float_controls_rte_fp16) {
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_rte_len, rope_norm_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_rte_len, rope_neox_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
} else {
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
}
ggml_vk_create_pipeline(device, device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
if (device->float_controls_rte_fp16) {
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
} else {
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
}
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
@ -1994,6 +2070,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
amd_shader_core_properties2 = true; amd_shader_core_properties2 = true;
} else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) {
pipeline_robustness = true; pipeline_robustness = true;
} else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
device->subgroup_size_control = true;
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_COOPMAT")) { !getenv("GGML_VK_DISABLE_COOPMAT")) {
device->coopmat_support = true; device->coopmat_support = true;
@ -2013,11 +2091,15 @@ static vk_device ggml_vk_get_device(size_t idx) {
vk::PhysicalDeviceDriverProperties driver_props; vk::PhysicalDeviceDriverProperties driver_props;
vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props; vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props;
vk::PhysicalDeviceVulkan12Properties vk12_props;
vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
props2.pNext = &props3; props2.pNext = &props3;
props3.pNext = &subgroup_props; props3.pNext = &subgroup_props;
subgroup_props.pNext = &driver_props; subgroup_props.pNext = &driver_props;
driver_props.pNext = &vk12_props;
VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&driver_props; VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_props;
if (maintenance4_support) { if (maintenance4_support) {
last_struct->pNext = (VkBaseOutStructure *)&props4; last_struct->pNext = (VkBaseOutStructure *)&props4;
@ -2031,6 +2113,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props; last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props; last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
} }
if (device->subgroup_size_control) {
last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_props;
last_struct = (VkBaseOutStructure *)&subgroup_size_control_props;
}
#if defined(VK_NV_cooperative_matrix2) #if defined(VK_NV_cooperative_matrix2)
vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props; vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props;
@ -2063,12 +2149,13 @@ static vk_device ggml_vk_get_device(size_t idx) {
} else { } else {
device->shader_core_count = 0; device->shader_core_count = 0;
} }
device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16;
const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr; const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr;
device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
if (device->vendor_id == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) { if (device->vendor_id == VK_VENDOR_ID_INTEL || (device->vendor_id == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
// Intel drivers don't support coopmat properly yet // Intel drivers don't support coopmat properly yet
// Only RADV supports coopmat properly on AMD // Only RADV supports coopmat properly on AMD
device->coopmat_support = false; device->coopmat_support = false;
@ -2124,6 +2211,17 @@ static vk_device ggml_vk_get_device(size_t idx) {
device_extensions.push_back("VK_EXT_pipeline_robustness"); device_extensions.push_back("VK_EXT_pipeline_robustness");
} }
VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features;
subgroup_size_control_features.pNext = nullptr;
subgroup_size_control_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT;
subgroup_size_control_features.computeFullSubgroups = false;
subgroup_size_control_features.subgroupSizeControl = false;
if (device->subgroup_size_control) {
last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_features;
last_struct = (VkBaseOutStructure *)&subgroup_size_control_features;
}
VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features; VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
coopmat_features.pNext = nullptr; coopmat_features.pNext = nullptr;
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
@ -2151,6 +2249,20 @@ static vk_device ggml_vk_get_device(size_t idx) {
device->pipeline_robustness = pl_robustness_features.pipelineRobustness; device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
if (device->subgroup_size_control) {
device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize;
device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize;
}
device->subgroup_size_control = device->subgroup_size_control &&
(subgroup_size_control_props.requiredSubgroupSizeStages & vk::ShaderStageFlagBits::eCompute) &&
subgroup_size_control_features.subgroupSizeControl;
if (device->subgroup_size_control) {
device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups;
device_extensions.push_back("VK_EXT_subgroup_size_control");
}
device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix; device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;
if (coopmat2_support) { if (coopmat2_support) {
@ -2300,7 +2412,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
} }
} }
if (device->coopmat_m == 0) { if (device->coopmat_m == 0 || !device->coopmat_acc_f32_support) {
// No suitable matmul mode found // No suitable matmul mode found
GGML_LOG_DEBUG("ggml_vulkan: WARNING: No suitable matrix core mode found. Disabling matrix cores.\n"); GGML_LOG_DEBUG("ggml_vulkan: WARNING: No suitable matrix core mode found. Disabling matrix cores.\n");
device->coopmat_support = false; device->coopmat_support = false;
@ -2425,13 +2537,15 @@ static void ggml_vk_print_gpu_info(size_t idx) {
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_COOPMAT")) { !getenv("GGML_VK_DISABLE_COOPMAT")) {
coopmat_support = true; coopmat_support = true;
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
} else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_COOPMAT2")) { !getenv("GGML_VK_DISABLE_COOPMAT2")) {
coopmat2_support = true; coopmat2_support = true;
#endif
} }
} }
if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) { if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
// Intel drivers don't support coopmat properly yet // Intel drivers don't support coopmat properly yet
// Only RADV supports coopmat properly on AMD // Only RADV supports coopmat properly on AMD
coopmat_support = false; coopmat_support = false;
@ -2718,7 +2832,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
return ctx->device->pipeline_matmul_f32_f16; return ctx->device->pipeline_matmul_f32_f16;
} }
if (prec == GGML_PREC_DEFAULT && ctx->device->fp16) { if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) {
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
return ctx->device->pipeline_matmul_f16_f32.f16acc; return ctx->device->pipeline_matmul_f16_f32.f16acc;
} }
@ -2793,7 +2907,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
return ctx->device->pipeline_matmul_id_f32; return ctx->device->pipeline_matmul_id_f32;
} }
if (prec == GGML_PREC_DEFAULT && ctx->device->fp16) { if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) {
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
return ctx->device->pipeline_matmul_id_f16_f32.f16acc; return ctx->device->pipeline_matmul_id_f16_f32.f16acc;
} }
@ -7573,7 +7687,16 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_REPEAT: case GGML_OP_REPEAT:
return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float); return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
case GGML_OP_ROPE: case GGML_OP_ROPE:
{
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
return ggml_is_contiguous(op->src[0]); return ggml_is_contiguous(op->src[0]);
}
case GGML_OP_NONE: case GGML_OP_NONE:
case GGML_OP_RESHAPE: case GGML_OP_RESHAPE:
case GGML_OP_VIEW: case GGML_OP_VIEW:

View File

@ -25,92 +25,94 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
#if defined(DATA_A_Q4_0) #if defined(DATA_A_Q4_0)
vec2 dequantize(uint ib, uint iqs, uint a_offset) { vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
const uint vui = uint(data_a[a_offset + ib].qs[iqs]); const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d; return (vec2(vui & 0xF, vui >> 4) - 8.0f);
} }
vec4 dequantize4(uint ib, uint iqs, uint a_offset) { vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a_packed16[a_offset + ib].d);
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) - 8.0f) * d; return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12) - 8.0f);
} }
#endif #endif
#if defined(DATA_A_Q4_1) #if defined(DATA_A_Q4_1)
vec2 dequantize(uint ib, uint iqs, uint a_offset) { vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
const float m = float(data_a[a_offset + ib].m);
const uint vui = uint(data_a[a_offset + ib].qs[iqs]); const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return vec2(vui & 0xF, vui >> 4) * d + m; return vec2(vui & 0xF, vui >> 4);
} }
vec4 dequantize4(uint ib, uint iqs, uint a_offset) { vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a_packed16[a_offset + ib].d);
const float m = float(data_a_packed16[a_offset + ib].m);
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) * d + m; return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12);
} }
#endif #endif
#if defined(DATA_A_Q5_0) #if defined(DATA_A_Q5_0)
vec2 dequantize(uint ib, uint iqs, uint a_offset) { vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
const uint uint_qh = uint(data_a[a_offset + ib].qh[1]) << 16 | data_a[a_offset + ib].qh[0]; const uint uint_qh = uint(data_a[a_offset + ib].qh[1]) << 16 | data_a[a_offset + ib].qh[0];
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
const uint vui = uint(data_a[a_offset + ib].qs[iqs]); const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d; return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f);
} }
vec4 dequantize4(uint ib, uint iqs, uint a_offset) { vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a_packed16[a_offset + ib].d);
const uint uint_qh = uint(data_a_packed16[a_offset + ib].qh[1]) << 16 | data_a_packed16[a_offset + ib].qh[0]; const uint uint_qh = uint(data_a_packed16[a_offset + ib].qh[1]) << 16 | data_a_packed16[a_offset + ib].qh[0];
const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10); const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
return (vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) - 16.0f) * d; return (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f);
} }
#endif #endif
#if defined(DATA_A_Q5_1) #if defined(DATA_A_Q5_1)
vec2 dequantize(uint ib, uint iqs, uint a_offset) { vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
const float m = float(data_a[a_offset + ib].m);
const uint uint_qh = data_a[a_offset + ib].qh; const uint uint_qh = data_a[a_offset + ib].qh;
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
const uint vui = uint(data_a[a_offset + ib].qs[iqs]); const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m; return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y);
} }
vec4 dequantize4(uint ib, uint iqs, uint a_offset) { vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a_packed16[a_offset + ib].d);
const float m = float(data_a_packed16[a_offset + ib].m);
const uint uint_qh = data_a_packed16[a_offset + ib].qh; const uint uint_qh = data_a_packed16[a_offset + ib].qh;
const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10); const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
return vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) * d + m; return vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y);
} }
#endif #endif
#if defined(DATA_A_Q8_0) #if defined(DATA_A_Q8_0)
vec2 dequantize(uint ib, uint iqs, uint a_offset) { vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d); return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1]));
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
} }
vec4 dequantize4(uint ib, uint iqs, uint a_offset) { vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a_packed16[a_offset + ib].d);
uint32_t v0 = data_a_packed16[a_offset + ib].qs[iqs/2]; uint32_t v0 = data_a_packed16[a_offset + ib].qs[iqs/2];
uint32_t v1 = data_a_packed16[a_offset + ib].qs[iqs/2 + 1]; uint32_t v1 = data_a_packed16[a_offset + ib].qs[iqs/2 + 1];
return vec4(int8_t(v0 & 0xFF), int8_t((v0 >> 8) & 0xFF), int8_t(v1 & 0xFF), int8_t((v1 >> 8) & 0xFF)) * d; return vec4(int8_t(v0 & 0xFF), int8_t(v0 >> 8), int8_t(v1 & 0xFF), int8_t(v1 >> 8));
} }
#endif #endif
#if defined(DATA_A_IQ4_NL) #if defined(DATA_A_IQ4_NL)
vec2 dequantize(uint ib, uint iqs, uint a_offset) { vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
const uint vui = uint(data_a[a_offset + ib].qs[iqs]); const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d; return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]);
} }
vec4 dequantize4(uint ib, uint iqs, uint a_offset) { vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a_packed16[a_offset + ib].d);
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[(vui >> 12) & 0xF]) * d; return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[vui >> 12]);
}
#endif
#if defined(DATA_A_F32) || defined(DATA_A_F16)
vec2 get_dm(uint ib, uint a_offset) {
return vec2(0, 0);
}
#endif
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ4_NL)
vec2 get_dm(uint ib, uint a_offset) {
return vec2(float(data_a[a_offset + ib].d), 0);
}
#endif
#if defined(DATA_A_Q4_1) || defined(DATA_A_Q5_1)
vec2 get_dm(uint ib, uint a_offset) {
return vec2(float(data_a[a_offset + ib].d), float(data_a[a_offset + ib].m));
} }
#endif #endif

View File

@ -9,8 +9,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
void main() { void main() {
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
const uint i = gl_WorkGroupID.x * 256 + wgy; const uint ib = gl_WorkGroupID.x * 256 + wgy;
if (i >= p.M * p.K / QUANT_K) { if (ib >= p.M * p.K / QUANT_K) {
return; return;
} }
@ -20,37 +20,49 @@ void main() {
const uint is = 2 * il; const uint is = 2 * il;
const uint n = 4; const uint n = 4;
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x); const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y); const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);
const uint y_idx = i * QUANT_K + 64 * il + n * ir; const uint y_idx = ib * QUANT_K + 64 * il + n * ir;
const uint qs_idx = 32*il + n * ir; const uint qs_idx = 32*il + n * ir;
uint8_t sc; uint scidx0 = (is < 4) ? is : (is + 4);
uint8_t m; uint scidx1 = (is < 4) ? is : (is - 4);
if (is < 4) { uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
sc = uint8_t(data_a[i].scales[is] & 63); uint scidxshift1 = (is < 4) ? 0 : 2;
m = uint8_t(data_a[i].scales[is + 4] & 63); uint mbidx0 = is + 4;
} else { uint mbidx1 = (is < 4) ? is + 4 : is;
sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4)); uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
m = uint8_t((data_a[i].scales[is + 4] >> 4) | ((data_a[i].scales[is ] >> 6) << 4)); uint mbidxshift0 = (is < 4) ? 0 : 4;
} uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
const FLOAT_TYPE d1 = dall * sc; uint mbidxshift1 = (is < 4) ? 0 : 2;
const FLOAT_TYPE m1 = dmin * m;
uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
const FLOAT_TYPE d1 = dall * sc;
const FLOAT_TYPE m1 = dmin * mbyte;
scidx0 = (is < 4) ? is + 1 : (is + 5);
scidx1 = (is < 4) ? is + 1 : (is - 3);
scidxmask1 = (is < 4) ? 0x30 : 0xC0;
scidxshift1 = (is < 4) ? 0 : 2;
mbidx0 = is + 5;
mbidx1 = (is < 4) ? is + 5 : is + 1;
mbidxmask0 = (is < 4) ? 0xF : 0xF0;
mbidxshift0 = (is < 4) ? 0 : 4;
mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
mbidxshift1 = (is < 4) ? 0 : 2;
sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
if (is < 4) {
sc = uint8_t(data_a[i].scales[is + 1] & 63);
m = uint8_t(data_a[i].scales[is + 5] & 63);
} else {
sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
m = uint8_t((data_a[i].scales[is + 5] >> 4) | ((data_a[i].scales[is + 1] >> 6) << 4));
}
const FLOAT_TYPE d2 = dall * sc; const FLOAT_TYPE d2 = dall * sc;
const FLOAT_TYPE m2 = dmin * m; const FLOAT_TYPE m2 = dmin * mbyte;
[[unroll]] for (uint l = 0; l < n; ++l) { [[unroll]] for (uint l = 0; l < n; ++l) {
data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1); data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[ib].qs[qs_idx + l] & 0xF) - m1);
data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >> 4) - m2); data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[ib].qs[qs_idx + l] >> 4) - m2);
} }
} }
} }

View File

@ -9,8 +9,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
void main() { void main() {
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
const uint i = gl_WorkGroupID.x * 256 + wgy; const uint ib = gl_WorkGroupID.x * 256 + wgy;
if (i >= p.M * p.K / QUANT_K) { if (ib >= p.M * p.K / QUANT_K) {
return; return;
} }
@ -19,40 +19,52 @@ void main() {
const uint ir = tid % 16; const uint ir = tid % 16;
const uint is = 2 * il; const uint is = 2 * il;
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x); const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y); const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);
const uint y_idx = i * QUANT_K + 64 * il + 2 * ir; const uint y_idx = ib * QUANT_K + 64 * il + 2 * ir;
const uint qs_idx = 32*il + 2 * ir; const uint qs_idx = 32*il + 2 * ir;
const uint qh_idx = 2 * ir; const uint qh_idx = 2 * ir;
uint8_t sc; uint scidx0 = (is < 4) ? is : (is + 4);
uint8_t m; uint scidx1 = (is < 4) ? is : (is - 4);
if (is < 4) { uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
sc = uint8_t(data_a[i].scales[is] & 63); uint scidxshift1 = (is < 4) ? 0 : 2;
m = uint8_t(data_a[i].scales[is + 4] & 63); uint mbidx0 = is + 4;
} else { uint mbidx1 = (is < 4) ? is + 4 : is;
sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4)); uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
m = uint8_t((data_a[i].scales[is + 4] >> 4) | ((data_a[i].scales[is ] >> 6) << 4)); uint mbidxshift0 = (is < 4) ? 0 : 4;
} uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
const FLOAT_TYPE d1 = dall * sc; uint mbidxshift1 = (is < 4) ? 0 : 2;
const FLOAT_TYPE m1 = dmin * m;
uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
const FLOAT_TYPE d1 = dall * sc;
const FLOAT_TYPE m1 = dmin * mbyte;
scidx0 = (is < 4) ? is + 1 : (is + 5);
scidx1 = (is < 4) ? is + 1 : (is - 3);
scidxmask1 = (is < 4) ? 0x30 : 0xC0;
scidxshift1 = (is < 4) ? 0 : 2;
mbidx0 = is + 5;
mbidx1 = (is < 4) ? is + 5 : is + 1;
mbidxmask0 = (is < 4) ? 0xF : 0xF0;
mbidxshift0 = (is < 4) ? 0 : 4;
mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
mbidxshift1 = (is < 4) ? 0 : 2;
sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
if (is < 4) {
sc = uint8_t(data_a[i].scales[is + 1] & 63);
m = uint8_t(data_a[i].scales[is + 5] & 63);
} else {
sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
m = uint8_t((data_a[i].scales[is + 5] >> 4) | ((data_a[i].scales[is + 1] >> 6) << 4));
}
const FLOAT_TYPE d2 = dall * sc; const FLOAT_TYPE d2 = dall * sc;
const FLOAT_TYPE m2 = dmin * m; const FLOAT_TYPE m2 = dmin * mbyte;
const uint8_t hm1 = uint8_t(1 << (2 * il )); const uint8_t hm1 = uint8_t(1 << (2 * il ));
const uint8_t hm2 = uint8_t(1 << (2 * il + 1)); const uint8_t hm2 = uint8_t(1 << (2 * il + 1));
data_b[y_idx ] = D_TYPE(d1 * FLOAT_TYPE((data_a[i].qs[qs_idx ] & 0xF) + (((data_a[i].qh[qh_idx ] & hm1) != 0) ? 16 : 0)) - m1); data_b[y_idx ] = D_TYPE(d1 * FLOAT_TYPE((data_a[ib].qs[qs_idx ] & 0xF) + (((data_a[ib].qh[qh_idx ] & hm1) != 0) ? 16 : 0)) - m1);
data_b[y_idx + 1] = D_TYPE(d1 * FLOAT_TYPE((data_a[i].qs[qs_idx + 1] & 0xF) + (((data_a[i].qh[qh_idx + 1] & hm1) != 0) ? 16 : 0)) - m1); data_b[y_idx + 1] = D_TYPE(d1 * FLOAT_TYPE((data_a[ib].qs[qs_idx + 1] & 0xF) + (((data_a[ib].qh[qh_idx + 1] & hm1) != 0) ? 16 : 0)) - m1);
data_b[y_idx + 32] = D_TYPE(d2 * FLOAT_TYPE((data_a[i].qs[qs_idx ] >> 4) + (((data_a[i].qh[qh_idx ] & hm2) != 0) ? 16 : 0)) - m2); data_b[y_idx + 32] = D_TYPE(d2 * FLOAT_TYPE((data_a[ib].qs[qs_idx ] >> 4) + (((data_a[ib].qh[qh_idx ] & hm2) != 0) ? 16 : 0)) - m2);
data_b[y_idx + 33] = D_TYPE(d2 * FLOAT_TYPE((data_a[i].qs[qs_idx + 1] >> 4) + (((data_a[i].qh[qh_idx + 1] & hm2) != 0) ? 16 : 0)) - m2); data_b[y_idx + 33] = D_TYPE(d2 * FLOAT_TYPE((data_a[ib].qs[qs_idx + 1] >> 4) + (((data_a[ib].qh[qh_idx + 1] & hm2) != 0) ? 16 : 0)) - m2);
} }
} }

View File

@ -31,6 +31,8 @@ void main() {
const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2; const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
vec2 v = dequantize(ib, iqs, 0); vec2 v = dequantize(ib, iqs, 0);
const vec2 dm = get_dm(ib, 0);
v = v * dm.x + dm.y;
data_d[d_offset + iybs + iqs ] = D_TYPE(v.x); data_d[d_offset + iybs + iqs ] = D_TYPE(v.x);
data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y); data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y);

View File

@ -1,6 +1,11 @@
#version 450 #version 450
#extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_spirv_intrinsics: enable
#if RTE16
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
#endif
layout (push_constant) uniform parameter layout (push_constant) uniform parameter
{ {

View File

@ -31,27 +31,13 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_
#if K_PER_ITER == 8 #if K_PER_ITER == 8
#if QUANT_R == 2 #if QUANT_R == 2
B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4]; const B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4];
B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4]; const B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4];
FLOAT_TYPE b0 = FLOAT_TYPE(bv02.x); const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y);
FLOAT_TYPE b1 = FLOAT_TYPE(bv13.x); const vec4 bv1 = vec4(bv02.z, bv13.z, bv02.w, bv13.w);
FLOAT_TYPE b2 = FLOAT_TYPE(bv02.y);
FLOAT_TYPE b3 = FLOAT_TYPE(bv13.y);
FLOAT_TYPE b4 = FLOAT_TYPE(bv02.z);
FLOAT_TYPE b5 = FLOAT_TYPE(bv13.z);
FLOAT_TYPE b6 = FLOAT_TYPE(bv02.w);
FLOAT_TYPE b7 = FLOAT_TYPE(bv13.w);
#else #else
B_TYPE_VEC4 bv0 = data_b_v4[(b_offset + iybs + iqs) / 4]; const vec4 bv0 = vec4(data_b_v4[(b_offset + iybs + iqs) / 4]);
B_TYPE_VEC4 bv1 = data_b_v4[(b_offset + iybs + iqs) / 4 + 1]; const vec4 bv1 = vec4(data_b_v4[(b_offset + iybs + iqs) / 4 + 1]);
FLOAT_TYPE b0 = FLOAT_TYPE(bv0.x);
FLOAT_TYPE b1 = FLOAT_TYPE(bv0.y);
FLOAT_TYPE b2 = FLOAT_TYPE(bv0.z);
FLOAT_TYPE b3 = FLOAT_TYPE(bv0.w);
FLOAT_TYPE b4 = FLOAT_TYPE(bv1.x);
FLOAT_TYPE b5 = FLOAT_TYPE(bv1.y);
FLOAT_TYPE b6 = FLOAT_TYPE(bv1.z);
FLOAT_TYPE b7 = FLOAT_TYPE(bv1.w);
#endif #endif
#else #else
// Check if the second of the pair of elements is OOB, and don't fetch B or // Check if the second of the pair of elements is OOB, and don't fetch B or
@ -67,22 +53,29 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_
b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]); b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]);
} }
#endif #endif
uint ibi = first_row*p.ncols;
[[unroll]] for (uint n = 0; n < num_rows; ++n) { [[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib = ((first_row + n)*p.ncols + col)/QUANT_K; // block index const uint ib = (ibi + col)/QUANT_K; // block index
ibi += p.ncols;
#if K_PER_ITER == 8 #if K_PER_ITER == 8
const vec4 v = dequantize4(ib, iqs, a_offset); vec4 v = dequantize4(ib, iqs, a_offset);
const vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset); vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset);
const vec2 dm = get_dm(ib, a_offset);
if (dm.y != 0) { // quant has min component
v = v * dm.x + dm.y;
v2 = v2 * dm.x + dm.y;
}
// matrix multiplication // matrix multiplication
temp[n] = fma(FLOAT_TYPE(v.x), b0, temp[n]); FLOAT_TYPE rowtmp = dot(bv0, v);
temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]); rowtmp += dot(bv1, v2);
temp[n] = fma(FLOAT_TYPE(v.z), b2, temp[n]);
temp[n] = fma(FLOAT_TYPE(v.w), b3, temp[n]); if (dm.y == 0)
temp[n] = fma(FLOAT_TYPE(v2.x), b4, temp[n]); rowtmp *= dm.x;
temp[n] = fma(FLOAT_TYPE(v2.y), b5, temp[n]);
temp[n] = fma(FLOAT_TYPE(v2.z), b6, temp[n]); temp[n] += rowtmp;
temp[n] = fma(FLOAT_TYPE(v2.w), b7, temp[n]);
#else #else
const vec2 v = dequantize(ib, iqs, a_offset); const vec2 v = dequantize(ib, iqs, a_offset);

View File

@ -2,8 +2,6 @@
#extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require #extension GL_EXT_shader_8bit_storage : require
#define K_QUANTS_PER_ITERATION 2
#ifdef MUL_MAT_ID #ifdef MUL_MAT_ID
#define EXPERT_COUNT 8 #define EXPERT_COUNT 8
#endif #endif

View File

@ -3,9 +3,11 @@
#include "mul_mat_vec_base.comp" #include "mul_mat_vec_base.comp"
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE tmp[32]; layout (constant_id = 0) const uint BLOCK_SIZE = 32;
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() { void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
@ -20,22 +22,25 @@ void main() {
const uint num_blocks_per_row = p.ncols / QUANT_K; const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 // 16 threads are used to process each block
const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...16
const uint ix = tid/16;
const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 const uint step = 8;
const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = tid - step*v_im; // 0...15 or 0...7 const uint v_in = itid - step*v_im; // 0...15 or 0...7
const uint l0 = K_QUANTS_PER_ITERATION*v_in; // 0...15 const uint l0 = 2*v_in; // 0...15
const uint q_offset = 32*v_im + l0; const uint q_offset = 32*v_im + l0;
const uint s_offset = 8*v_im; const uint s_offset = 8*v_im;
const uint y_offset = 128*v_im + l0; const uint y_offset = 128*v_im + l0;
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset; const uint y_idx = i * QUANT_K + y_offset;
f16vec2 d = data_a[ib0 + i].d; f16vec2 d = data_a[ib0 + i].d;
@ -71,7 +76,7 @@ void main() {
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { [[unroll]] for (int l = 0; l < 2; ++l) {
sum1 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l] >> 0) & 3), sum1 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l] >> 0) & 3),
fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3), fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3),
fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l] >> 2) & 3), fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l] >> 2) & 3),
@ -96,7 +101,7 @@ void main() {
// sum up partial sums and write back result // sum up partial sums and write back result
barrier(); barrier();
[[unroll]] for (uint s = 16; s > 0; s >>= 1) { [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
if (tid < s) { if (tid < s) {
tmp[tid] += tmp[tid + s]; tmp[tid] += tmp[tid + s];
} }

Some files were not shown because too many files have changed in this diff Show More