Compare commits

...

5 Commits

Author SHA1 Message Date
edwko
21781db72d
Merge fa522bc346 into 2cd43f4900 2024-12-24 09:55:27 -08:00
Djip007
2cd43f4900
ggml : more perfo with llamafile tinyblas on x86_64 (#10714)
Some checks are pending
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Waiting to run
* more perfo with llamafile tinyblas on x86_64.

- add bf16 suport
- change dispache strategie (thanks:
https://github.com/ikawrakow/ik_llama.cpp/pull/71 )
- reduce memory bandwidth

simple tinyblas dispache and more cache freindly

* tinyblas dynamic dispaching

* sgemm: add M blocs.

* - git 2.47 use short id of len 9.
- show-progress is not part of GNU Wget2

* remove not stable test
2024-12-24 18:54:49 +01:00
NeverLucky
09fe2e7613
server: allow filtering llama server response fields (#10940)
* llama_server_response_fields

* llama_server_response_fields_fix_issues

* params fixes

* fix

* clarify docs

* change to "response_fields"

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2024-12-24 17:39:49 +01:00
edwko
fa522bc346 ASCII/Romanization Support for OuteTTS 2024-12-19 12:39:17 +02:00
edwko
7a21a92ea3
Create anyascii.h 2024-12-19 12:31:58 +02:00
13 changed files with 1662 additions and 291 deletions

View File

@ -450,6 +450,8 @@ These words will not be included in the completion, so make sure to add them to
`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain.
`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error.
**Response format**
- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.

View File

@ -92,6 +92,7 @@ struct slot_params {
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<std::string> antiprompt;
std::vector<std::string> response_fields;
bool timings_per_token = false;
bool post_sampling_probs = false;
bool ignore_eos = false;
@ -209,6 +210,7 @@ struct server_task {
params.n_discard = json_value(data, "n_discard", defaults.n_discard);
//params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
@ -522,6 +524,7 @@ struct server_task_result_cmpl_final : server_task_result {
bool post_sampling_probs;
std::vector<completion_token_output> probs_output;
std::vector<std::string> response_fields;
slot_params generation_params;
@ -568,7 +571,7 @@ struct server_task_result_cmpl_final : server_task_result {
if (!stream && !probs_output.empty()) {
res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
}
return res;
return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
}
json to_json_oaicompat_chat() {
@ -2066,6 +2069,7 @@ struct server_context {
res->tokens = slot.generated_tokens;
res->timings = slot.get_timings();
res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
res->response_fields = slot.params.response_fields;
res->truncated = slot.truncated;
res->n_decoded = slot.n_decoded;

View File

@ -95,7 +95,7 @@ def test_consistent_result_same_seed(n_slots: int):
res = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"seed": 42,
"temperature": 1.0,
"temperature": 0.0,
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
})
if last_res is not None:
@ -120,9 +120,10 @@ def test_different_result_different_seed(n_slots: int):
assert res.body["content"] != last_res.body["content"]
last_res = res
# TODO figure why it don't work with temperature = 1
# @pytest.mark.parametrize("temperature", [0.0, 1.0])
@pytest.mark.parametrize("n_batch", [16, 32])
@pytest.mark.parametrize("temperature", [0.0, 1.0])
@pytest.mark.parametrize("temperature", [0.0])
def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
global server
server.n_batch = n_batch
@ -257,6 +258,40 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int):
# assert match_regex(re_content, res.body["content"])
@pytest.mark.parametrize(
"prompt,n_predict,response_fields",
[
("I believe the meaning of life is", 8, []),
("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]),
],
)
def test_completion_response_fields(
prompt: str, n_predict: int, response_fields: list[str]
):
global server
server.start()
res = server.make_request(
"POST",
"/completion",
data={
"n_predict": n_predict,
"prompt": prompt,
"response_fields": response_fields,
},
)
assert res.status_code == 200
assert "content" in res.body
assert len(res.body["content"])
if len(response_fields):
assert res.body["generation_settings/n_predict"] == n_predict
assert res.body["prompt"] == "<s> " + prompt
assert isinstance(res.body["content"], str)
assert len(res.body) == len(response_fields)
else:
assert len(res.body)
assert "generation_settings" in res.body
def test_n_probs():
global server
server.start()

View File

@ -90,6 +90,28 @@ static bool json_is_array_of_mixed_numbers_strings(const json & data) {
return false;
}
// get value by path(key1 / key2)
static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
json result = json::object();
for (const std::string & path : paths) {
json current = js;
const auto keys = string_split<std::string>(path, /*separator*/ '/');
bool valid_path = true;
for (const std::string & k : keys) {
if (valid_path && current.is_object() && current.contains(k)) {
current = current[k];
} else {
valid_path = false;
}
}
if (valid_path) {
result[path] = current;
}
}
return result;
}
/**
* this handles 2 cases:
* - only string, example: "string"

View File

@ -1,5 +1,10 @@
set(TARGET llama-tts)
add_library(anyascii
anyascii.cpp
anyascii.h
)
add_executable(${TARGET} tts.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE anyascii)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

1252
examples/tts/anyascii.cpp Normal file

File diff suppressed because one or more lines are too long

27
examples/tts/anyascii.h Normal file
View File

@ -0,0 +1,27 @@
/*
ISC License
Copyright (c) 2020-2023, Hunter WB <hunterwb.com>
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifndef ANYASCII_H
#define ANYASCII_H
#include <stddef.h>
#include <stdint.h>
size_t anyascii(uint_least32_t utf32, const char **ascii);
#endif

View File

@ -3,6 +3,7 @@
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include "anyascii.h"
#define _USE_MATH_DEFINES // For M_PI on MSVC
@ -15,6 +16,7 @@
#include <string>
#include <thread>
#include <vector>
#include <codecvt>
//
// Terminal utils
@ -370,15 +372,36 @@ static std::string replace_numbers_with_words(const std::string & input_text) {
return result;
}
static std::string convert_to_ascii(const std::string& utf8_str) {
std::string result;
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
std::basic_string<char32_t> utf32_str = converter.from_bytes(utf8_str);
for (char32_t c : utf32_str) {
const char* ascii_result;
size_t length = anyascii(c, &ascii_result);
if (length > 0) {
result.append(ascii_result, length);
}
}
return result;
}
// Based on: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/version/v1/prompt_processor.py#L39
static std::string process_text(const std::string & text) {
/*
For languages written without word boundaries (like Japanese, Chinese),
proper morphological analysis is still required but not implemented here.
While this function can romanize such text, it doesn't handle word segmentation.
To properly handle these languages, integration with morphological analyzers like MeCab
or similar tools would be needed before this preprocessing step.
*/
// For now I skipped text romanization as I am unsure how to handle
// uroman and MeCab implementations in C++
// maybe something like https://github.com/anyascii/anyascii/ could work.
// currently only English would be supported in this function
// Converts text to ASCII/romanized form (e.g., "こんにちは" -> "konnichiha")
std::string processed_text = convert_to_ascii(text);
std::string processed_text = replace_numbers_with_words(text);
processed_text = replace_numbers_with_words(processed_text);
std::transform(processed_text.begin(), processed_text.end(),
processed_text.begin(), ::tolower);
@ -394,13 +417,6 @@ static std::string process_text(const std::string & text) {
processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
/*
Replace spaces with the separator token same as in line 365
for (auto & c : prompt_user) {
if (c == ' ') {
prompt_clean += "<|text_sep|>";
*/
processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), "<|text_sep|>");
return processed_text;

View File

@ -7419,14 +7419,14 @@ static void ggml_compute_forward_mul_mat(
if (src1_cont) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
if (!llamafile_sgemm(params,
ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type),
(const char *)src1->data + i12*nb12 + i13*nb13,
nb11/ggml_type_size(src1->type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
src0->type,
src1->type,
dst->type))
@ -7471,14 +7471,14 @@ UseGgmlGemm1:;
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
if (!llamafile_sgemm(params,
ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type),
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
row_size/ggml_type_size(vec_dot_type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
src0->type,
vec_dot_type,
dst->type))

View File

@ -53,6 +53,8 @@
#include "ggml-cpu-impl.h"
#include "ggml-quants.h"
#include <atomic>
#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
#else
@ -134,6 +136,16 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) {
return _mm512_fmadd_ps(a, b, c);
}
#endif
#if defined(__AVX512BF16__)
template <>
inline __m512 madd(__m512bh a, __m512bh b, __m512 c) {
return _mm512_dpbf16_ps(c, a, b);
}
template <>
inline __m256 madd(__m256bh a, __m256bh b, __m256 c) {
return _mm256_dpbf16_ps(c, a, b);
}
#endif
#endif
#if defined(__ARM_FEATURE_FMA)
@ -226,6 +238,13 @@ template <> inline __m256 load(const float *p) {
}
#endif // __AVX__
#if defined(__AVX2__) || defined(__AVX512F__)
template <> inline __m256 load(const ggml_bf16_t *p) {
return _mm256_castsi256_ps(
_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16));
}
#endif // __AVX2__
#if defined(__F16C__)
template <> inline __m256 load(const ggml_fp16_t *p) {
return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
@ -239,8 +258,27 @@ template <> inline __m512 load(const float *p) {
template <> inline __m512 load(const ggml_fp16_t *p) {
return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
}
template <> inline __m512 load(const ggml_bf16_t *p) {
return _mm512_castsi512_ps(
_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16));
}
#endif // __AVX512F__
#if defined(__AVX512BF16__)
template <> inline __m512bh load(const ggml_bf16_t *p) {
return (__m512bh)_mm512_loadu_ps((const float *)p);
}
template <> inline __m256bh load(const ggml_bf16_t *p) {
return (__m256bh)_mm256_loadu_ps((const float *)p);
}
template <> inline __m512bh load(const float *p) {
return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
}
template <> inline __m256bh load(const float *p) {
return _mm512_cvtneps_pbh(_mm512_loadu_ps(p));
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// CONSTANTS
@ -252,199 +290,170 @@ static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION
template <int M>
static inline int64_t BLOCK_SIZE(size_t m) {
const int64_t NB_BLOC_M = (m + M - 1) / M;
return (m % NB_BLOC_M == 0) ? m / NB_BLOC_M : (m / NB_BLOC_M) + 1;
}
static constexpr inline int64_t BLOC_POS(int64_t ib, int64_t ibN, int64_t bloc_size) {
return ib < ibN ? ib * bloc_size : ibN * bloc_size + (ib - ibN) * (bloc_size - 1);
}
template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
class tinyBLAS {
public:
tinyBLAS(int64_t k,
tinyBLAS(const ggml_compute_params * params, int64_t k,
const TA *A, int64_t lda,
const TB *B, int64_t ldb,
TC *C, int64_t ldc,
int ith, int nth)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
TC *C, int64_t ldc)
: params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
}
void matmul(int64_t m, int64_t n) {
mnpack(0, m, 0, n);
bool matmul(int64_t m, int64_t n) {
if (k % KN != 0)
return false;
// compute RM for only need tile with size RM&RM-1
#if VECTOR_REGISTERS == 32
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 4>(m, n, SIZE_N, 12);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 2>(m, n, SIZE_N, 12);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 1>(m, n, SIZE_N, 12);
return true;
}
#else // VECTOR_REGISTERS == 16
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 4>(m, n, SIZE_N, 24);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 2>(m, n, SIZE_N, 24);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 1>(m, n, SIZE_N, 24);
return true;
}
#endif
return false;
}
private:
NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t mc, nc, mp, np;
switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
#if VECTOR_REGISTERS == 32
case 0x55:
mc = 5;
nc = 5;
gemm<5, 5>(m0, m, n0, n);
break;
case 0x45:
mc = 4;
nc = 5;
gemm<4, 5>(m0, m, n0, n);
break;
case 0x54:
mc = 5;
nc = 4;
gemm<5, 4>(m0, m, n0, n);
break;
case 0x44:
mc = 4;
nc = 4;
gemm<4, 4>(m0, m, n0, n);
break;
case 0x53:
mc = 5;
nc = 3;
gemm<5, 3>(m0, m, n0, n);
break;
case 0x35:
mc = 3;
nc = 5;
gemm<3, 5>(m0, m, n0, n);
break;
case 0x43:
mc = 4;
nc = 3;
gemm<4, 3>(m0, m, n0, n);
break;
#else
case 0x55:
case 0x54:
case 0x53:
case 0x45:
case 0x44:
case 0x43:
mc = 4;
nc = 3;
gemm<4, 3>(m0, m, n0, n);
break;
case 0x35:
#endif
case 0x34:
mc = 3;
nc = 4;
gemm<3, 4>(m0, m, n0, n);
break;
case 0x52:
mc = 5;
nc = 2;
gemm<5, 2>(m0, m, n0, n);
break;
case 0x33:
mc = 3;
nc = 3;
gemm<3, 3>(m0, m, n0, n);
break;
case 0x25:
mc = 2;
nc = 5;
gemm<2, 5>(m0, m, n0, n);
break;
case 0x42:
mc = 4;
nc = 2;
gemm<4, 2>(m0, m, n0, n);
break;
case 0x24:
mc = 2;
nc = 4;
gemm<2, 4>(m0, m, n0, n);
break;
case 0x32:
mc = 3;
nc = 2;
gemm<3, 2>(m0, m, n0, n);
break;
case 0x23:
mc = 2;
nc = 3;
gemm<2, 3>(m0, m, n0, n);
break;
case 0x51:
mc = 5;
nc = 1;
gemm<5, 1>(m0, m, n0, n);
break;
case 0x41:
mc = 4;
nc = 1;
gemm<4, 1>(m0, m, n0, n);
break;
case 0x22:
mc = 2;
nc = 2;
gemm<2, 2>(m0, m, n0, n);
break;
case 0x15:
mc = 1;
nc = 5;
gemm<1, 5>(m0, m, n0, n);
break;
case 0x14:
mc = 1;
nc = 4;
gemm<1, 4>(m0, m, n0, n);
break;
case 0x31:
mc = 3;
nc = 1;
gemm<3, 1>(m0, m, n0, n);
break;
case 0x13:
mc = 1;
nc = 3;
gemm<1, 3>(m0, m, n0, n);
break;
case 0x21:
mc = 2;
nc = 1;
gemm<2, 1>(m0, m, n0, n);
break;
case 0x12:
mc = 1;
nc = 2;
gemm<1, 2>(m0, m, n0, n);
break;
case 0x11:
mc = 1;
nc = 1;
gemm<1, 1>(m0, m, n0, n);
break;
default:
return;
template <int RM, int RN, int BM>
inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
if (SIZE_N == RN) {
return gemm<RM, RN, BM>(m, n, BN);
}
if constexpr (RN > 1) {
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
} else {
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
GGML_ASSERT(false); // we have miss something.
}
mp = m0 + (m - m0) / mc * mc;
np = n0 + (n - n0) / nc * nc;
mnpack(mp, m, n0, np);
mnpack(m0, m, np, n);
}
template <int RM, int RN>
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * RN;
inline void gemm_bloc(int64_t ii, int64_t jj) {
D Cv[RN][RM] = {};
for (int64_t l = 0; l < k; l += KN)
for (int64_t j = 0; j < RN; ++j)
for (int64_t i = 0; i < RM; ++i)
Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
load<V>(B + ldb * (jj + j) + l),
Cv[j][i]);
for (int64_t l = 0; l < k; l += KN) {
// help compiler for op order.
if constexpr (RM <= RN) {
V Av[RM];
for (int64_t i = 0; i < RM; ++i) {
Av[i] = load<V>(A + lda * (ii + i) + l);
}
for (int64_t j = 0; j < RN; ++j) {
V Bv = load<V>(B + ldb * (jj + j) + l);
for (int64_t i = 0; i < RM; ++i) {
Cv[j][i] = madd(Av[i], Bv, Cv[j][i]);
}
}
} else {
V Bv[RN];
for (int64_t j = 0; j < RN; ++j) {
Bv[j] = load<V>(B + ldb * (jj + j) + l);
}
for (int64_t i = 0; i < RM; ++i) {
V Av = load<V>(A + lda * (ii + i) + l);
for (int64_t j = 0; j < RN; ++j) {
Cv[j][i] = madd(Av, Bv[j], Cv[j][i]);
}
}
}
}
for (int64_t j = 0; j < RN; ++j)
for (int64_t i = 0; i < RM; ++i)
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
}
template <int RM, int RN, int BM>
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
static std::atomic<int64_t> current_chunk;
GGML_ASSERT(m % (RM * BM) == 0);
const int64_t ytiles = m / (RM * BM);
const int64_t xtiles = (n + RN -1) / RN;
const int64_t jj_RN = (xtiles - (xtiles * RN - n));
// "round" bloc_size to "nearest" BN
const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
const int64_t nb_job = ytiles * NB_BN;
if (params->ith == 0) {
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
}
ggml_barrier(params->threadpool);
int64_t job = params->ith;
while (job < nb_job) {
const int64_t ii = (job % ytiles) * RM * BM;
const int64_t jb = job / ytiles;
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
for (int64_t bi = 0; bi < BM * RM; bi += RM) {
int64_t jj = jj0;
for (; jj < jj1; jj += RN) {
gemm_bloc<RM, RN>(ii + bi, jj);
}
if constexpr (RN > 1) {
for (; jj < jj2; jj += RN - 1) {
gemm_bloc<RM, RN-1>(ii + bi, jj);
}
}
GGML_ASSERT(jj == jj2);
}
// next step.
job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
}
ggml_barrier(params->threadpool);
return;
}
const ggml_compute_params * params;
const TA *const A;
const TB *const B;
TC *const C;
@ -452,8 +461,6 @@ class tinyBLAS {
const int64_t lda;
const int64_t ldb;
const int64_t ldc;
const int ith;
const int nth;
};
//////////////////////////////////////////////////////////////////////////////////////////
@ -1657,8 +1664,9 @@ class tinyBLAS_PPC {
* @param Ctype is GGML data type of `C`
* @return true if this function was able to service the matmul request
*/
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
int64_t ldc, int Atype, int Btype, int Ctype) {
assert(m >= 0);
assert(n >= 0);
@ -1666,8 +1674,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
assert(lda >= k);
assert(ldb >= k);
assert(ldc >= m);
assert(nth > 0);
assert(ith < nth);
assert(params->nth > 0);
assert(params->ith < params->nth);
// only enable sgemm for prompt processing
if (n < 2)
@ -1682,37 +1690,25 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
if (Btype != GGML_TYPE_F32)
return false;
#if defined(__AVX512F__)
if (k % 16)
return false;
tinyBLAS<16, __m512, __m512, float, float, float> tb{
tinyBLAS<16, __m512, __m512, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
(float *)C, ldc};
return tb.matmul(m, n);
#elif defined(__AVX__) || defined(__AVX2__)
if (k % 8)
return false;
tinyBLAS<8, __m256, __m256, float, float, float> tb{
tinyBLAS<8, __m256, __m256, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
(float *)C, ldc};
return tb.matmul(m, n);
#elif defined(__ARM_NEON)
if (n < 4)
return false;
if (k % 4)
return false;
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
(float *)C, ldc};
return tb.matmul(m, n);
#elif defined(__MMA__)
if (k % 8)
return false;
@ -1720,7 +1716,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1728,60 +1724,71 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
#endif
}
case GGML_TYPE_BF16: {
#if defined(__AVX512BF16__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__AVX512F__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__AVX2__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#endif
return false;
}
case GGML_TYPE_F16: {
#if defined(__AVX512F__)
if (k % 16)
return false;
if (Btype != GGML_TYPE_F32)
return false;
tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
k, (const ggml_fp16_t *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
if (Btype == GGML_TYPE_F16) {
tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
(const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
if (k % 8)
return false;
if (Btype != GGML_TYPE_F32)
return false;
tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{
k, (const ggml_fp16_t *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
if (Btype == GGML_TYPE_F16) {
tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
(const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
if (n < 8)
return false;
if (k % 8)
return false;
if (Btype != GGML_TYPE_F16)
return false;
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
if (Btype == GGML_TYPE_F16) {
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
if (k % 4)
return false;
if (Btype != GGML_TYPE_F32)
return false;
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
if (Btype == GGML_TYPE_F32) {
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
#else
return false;
(float *)C, ldc};
return tb.matmul(m, n);
}
#endif
return false;
}
case GGML_TYPE_Q8_0: {
@ -1792,7 +1799,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q8_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#elif defined(__ARM_FEATURE_DOTPROD)
@ -1800,7 +1807,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q8_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1816,7 +1823,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q4_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#elif defined(__ARM_FEATURE_DOTPROD)
@ -1824,7 +1831,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q4_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1840,7 +1847,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q5_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1856,7 +1863,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_iq4_nl *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1868,6 +1875,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
return false;
}
(void)params;
(void)m;
(void)n;
(void)k;
@ -1877,8 +1885,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(void)ldb;
(void)C;
(void)ldc;
(void)ith;
(void)nth;
(void)Atype;
(void)Btype;
(void)Ctype;

View File

@ -5,8 +5,8 @@
extern "C" {
#endif
bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
const void *, int64_t, void *, int64_t, int, int,
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
const void *, int64_t, const void *, int64_t, void *, int64_t,
int, int, int);
#ifdef __cplusplus

View File

@ -126,6 +126,8 @@ connection = sqlite3.connect(input_file)
cursor = connection.cursor()
builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
commit_short_len = len(builds[0][0])
try:
repo = git.Repo(".", search_parent_directories=True)
except git.InvalidGitRepositoryError:
@ -138,11 +140,11 @@ def find_parent_in_data(commit: git.Commit):
seen_hexsha8 = set()
while heap:
depth, current_commit = heapq.heappop(heap)
current_hexsha8 = commit.hexsha[:8]
current_hexsha8 = commit.hexsha[:commit_short_len]
if (current_hexsha8,) in builds:
return current_hexsha8
for parent in commit.parents:
parent_hexsha8 = parent.hexsha[:8]
parent_hexsha8 = parent.hexsha[:commit_short_len]
if parent_hexsha8 not in seen_hexsha8:
seen_hexsha8.add(parent_hexsha8)
heapq.heappush(heap, (depth + 1, parent))
@ -156,9 +158,9 @@ def get_all_parent_hexsha8s(commit: git.Commit):
while unvisited:
current_commit = unvisited.pop(0)
visited.append(current_commit.hexsha[:8])
visited.append(current_commit.hexsha[:commit_short_len])
for parent in current_commit.parents:
if parent.hexsha[:8] not in visited:
if parent.hexsha[:commit_short_len] not in visited:
unvisited.append(parent)
return visited
@ -169,10 +171,10 @@ def get_commit_name(hexsha8):
if repo is None:
return hexsha8
for h in repo.heads:
if h.commit.hexsha[:8] == hexsha8:
if h.commit.hexsha[:commit_short_len] == hexsha8:
return h.name
for t in repo.tags:
if t.commit.hexsha[:8] == hexsha8:
if t.commit.hexsha[:commit_short_len] == hexsha8:
return t.name
return hexsha8
@ -183,13 +185,13 @@ def get_commit_hexsha8(name):
return None
for h in repo.heads:
if h.name == name:
return h.commit.hexsha[:8]
return h.commit.hexsha[:commit_short_len]
for t in repo.tags:
if t.name == name:
return t.commit.hexsha[:8]
return t.commit.hexsha[:commit_short_len]
for c in repo.iter_commits("--all"):
if c.hexsha[:8] == name[:8]:
return c.hexsha[:8]
if c.hexsha[:commit_short_len] == name[:commit_short_len]:
return c.hexsha[:commit_short_len]
return None

View File

@ -26,7 +26,7 @@ function has_cmd {
}
if has_cmd wget; then
cmd="wget -q --show-progress -c -O %s/%s %s"
cmd="wget -q -c -O %s/%s %s"
elif has_cmd curl; then
cmd="curl -C - -f --output-dir %s -o %s -L %s"
else