Compare commits

...

6 Commits

Author SHA1 Message Date
Jeff Bolz
02efc84343
Merge e52a0f28e7 into 2cd43f4900 2024-12-24 19:23:43 +01:00
Djip007
2cd43f4900
ggml : more perfo with llamafile tinyblas on x86_64 (#10714)
Some checks are pending
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Waiting to run
* more perfo with llamafile tinyblas on x86_64.

- add bf16 suport
- change dispache strategie (thanks:
https://github.com/ikawrakow/ik_llama.cpp/pull/71 )
- reduce memory bandwidth

simple tinyblas dispache and more cache freindly

* tinyblas dynamic dispaching

* sgemm: add M blocs.

* - git 2.47 use short id of len 9.
- show-progress is not part of GNU Wget2

* remove not stable test
2024-12-24 18:54:49 +01:00
NeverLucky
09fe2e7613
server: allow filtering llama server response fields (#10940)
* llama_server_response_fields

* llama_server_response_fields_fix_issues

* params fixes

* fix

* clarify docs

* change to "response_fields"

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2024-12-24 17:39:49 +01:00
Jeff Bolz
e52a0f28e7 vulkan: increase small tile size for NV_coopmat2 2024-12-21 22:36:56 -06:00
Jeff Bolz
26252831ac vulkan: optimize im2col, more elements per thread 2024-12-21 22:36:44 -06:00
Jeff Bolz
207449810e tests: Add im2col perf tests 2024-12-21 16:11:22 -06:00
12 changed files with 416 additions and 308 deletions

View File

@ -450,6 +450,8 @@ These words will not be included in the completion, so make sure to add them to
`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain.
`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error.
**Response format**
- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.

View File

@ -92,6 +92,7 @@ struct slot_params {
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<std::string> antiprompt;
std::vector<std::string> response_fields;
bool timings_per_token = false;
bool post_sampling_probs = false;
bool ignore_eos = false;
@ -209,6 +210,7 @@ struct server_task {
params.n_discard = json_value(data, "n_discard", defaults.n_discard);
//params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
@ -522,6 +524,7 @@ struct server_task_result_cmpl_final : server_task_result {
bool post_sampling_probs;
std::vector<completion_token_output> probs_output;
std::vector<std::string> response_fields;
slot_params generation_params;
@ -568,7 +571,7 @@ struct server_task_result_cmpl_final : server_task_result {
if (!stream && !probs_output.empty()) {
res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
}
return res;
return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
}
json to_json_oaicompat_chat() {
@ -2066,6 +2069,7 @@ struct server_context {
res->tokens = slot.generated_tokens;
res->timings = slot.get_timings();
res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
res->response_fields = slot.params.response_fields;
res->truncated = slot.truncated;
res->n_decoded = slot.n_decoded;

View File

@ -95,7 +95,7 @@ def test_consistent_result_same_seed(n_slots: int):
res = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"seed": 42,
"temperature": 1.0,
"temperature": 0.0,
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
})
if last_res is not None:
@ -120,9 +120,10 @@ def test_different_result_different_seed(n_slots: int):
assert res.body["content"] != last_res.body["content"]
last_res = res
# TODO figure why it don't work with temperature = 1
# @pytest.mark.parametrize("temperature", [0.0, 1.0])
@pytest.mark.parametrize("n_batch", [16, 32])
@pytest.mark.parametrize("temperature", [0.0, 1.0])
@pytest.mark.parametrize("temperature", [0.0])
def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
global server
server.n_batch = n_batch
@ -257,6 +258,40 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int):
# assert match_regex(re_content, res.body["content"])
@pytest.mark.parametrize(
"prompt,n_predict,response_fields",
[
("I believe the meaning of life is", 8, []),
("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]),
],
)
def test_completion_response_fields(
prompt: str, n_predict: int, response_fields: list[str]
):
global server
server.start()
res = server.make_request(
"POST",
"/completion",
data={
"n_predict": n_predict,
"prompt": prompt,
"response_fields": response_fields,
},
)
assert res.status_code == 200
assert "content" in res.body
assert len(res.body["content"])
if len(response_fields):
assert res.body["generation_settings/n_predict"] == n_predict
assert res.body["prompt"] == "<s> " + prompt
assert isinstance(res.body["content"], str)
assert len(res.body) == len(response_fields)
else:
assert len(res.body)
assert "generation_settings" in res.body
def test_n_probs():
global server
server.start()

View File

@ -90,6 +90,28 @@ static bool json_is_array_of_mixed_numbers_strings(const json & data) {
return false;
}
// get value by path(key1 / key2)
static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
json result = json::object();
for (const std::string & path : paths) {
json current = js;
const auto keys = string_split<std::string>(path, /*separator*/ '/');
bool valid_path = true;
for (const std::string & k : keys) {
if (valid_path && current.is_object() && current.contains(k)) {
current = current[k];
} else {
valid_path = false;
}
}
if (valid_path) {
result[path] = current;
}
}
return result;
}
/**
* this handles 2 cases:
* - only string, example: "string"

View File

@ -7419,14 +7419,14 @@ static void ggml_compute_forward_mul_mat(
if (src1_cont) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
if (!llamafile_sgemm(params,
ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type),
(const char *)src1->data + i12*nb12 + i13*nb13,
nb11/ggml_type_size(src1->type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
src0->type,
src1->type,
dst->type))
@ -7471,14 +7471,14 @@ UseGgmlGemm1:;
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
if (!llamafile_sgemm(params,
ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type),
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
row_size/ggml_type_size(vec_dot_type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
src0->type,
vec_dot_type,
dst->type))

View File

@ -53,6 +53,8 @@
#include "ggml-cpu-impl.h"
#include "ggml-quants.h"
#include <atomic>
#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
#else
@ -134,6 +136,16 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) {
return _mm512_fmadd_ps(a, b, c);
}
#endif
#if defined(__AVX512BF16__)
template <>
inline __m512 madd(__m512bh a, __m512bh b, __m512 c) {
return _mm512_dpbf16_ps(c, a, b);
}
template <>
inline __m256 madd(__m256bh a, __m256bh b, __m256 c) {
return _mm256_dpbf16_ps(c, a, b);
}
#endif
#endif
#if defined(__ARM_FEATURE_FMA)
@ -226,6 +238,13 @@ template <> inline __m256 load(const float *p) {
}
#endif // __AVX__
#if defined(__AVX2__) || defined(__AVX512F__)
template <> inline __m256 load(const ggml_bf16_t *p) {
return _mm256_castsi256_ps(
_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16));
}
#endif // __AVX2__
#if defined(__F16C__)
template <> inline __m256 load(const ggml_fp16_t *p) {
return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
@ -239,8 +258,27 @@ template <> inline __m512 load(const float *p) {
template <> inline __m512 load(const ggml_fp16_t *p) {
return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
}
template <> inline __m512 load(const ggml_bf16_t *p) {
return _mm512_castsi512_ps(
_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16));
}
#endif // __AVX512F__
#if defined(__AVX512BF16__)
template <> inline __m512bh load(const ggml_bf16_t *p) {
return (__m512bh)_mm512_loadu_ps((const float *)p);
}
template <> inline __m256bh load(const ggml_bf16_t *p) {
return (__m256bh)_mm256_loadu_ps((const float *)p);
}
template <> inline __m512bh load(const float *p) {
return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
}
template <> inline __m256bh load(const float *p) {
return _mm512_cvtneps_pbh(_mm512_loadu_ps(p));
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// CONSTANTS
@ -252,199 +290,170 @@ static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION
template <int M>
static inline int64_t BLOCK_SIZE(size_t m) {
const int64_t NB_BLOC_M = (m + M - 1) / M;
return (m % NB_BLOC_M == 0) ? m / NB_BLOC_M : (m / NB_BLOC_M) + 1;
}
static constexpr inline int64_t BLOC_POS(int64_t ib, int64_t ibN, int64_t bloc_size) {
return ib < ibN ? ib * bloc_size : ibN * bloc_size + (ib - ibN) * (bloc_size - 1);
}
template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
class tinyBLAS {
public:
tinyBLAS(int64_t k,
tinyBLAS(const ggml_compute_params * params, int64_t k,
const TA *A, int64_t lda,
const TB *B, int64_t ldb,
TC *C, int64_t ldc,
int ith, int nth)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
TC *C, int64_t ldc)
: params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
}
void matmul(int64_t m, int64_t n) {
mnpack(0, m, 0, n);
bool matmul(int64_t m, int64_t n) {
if (k % KN != 0)
return false;
// compute RM for only need tile with size RM&RM-1
#if VECTOR_REGISTERS == 32
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 4>(m, n, SIZE_N, 12);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 2>(m, n, SIZE_N, 12);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 1>(m, n, SIZE_N, 12);
return true;
}
#else // VECTOR_REGISTERS == 16
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 4>(m, n, SIZE_N, 24);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 2>(m, n, SIZE_N, 24);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 1>(m, n, SIZE_N, 24);
return true;
}
#endif
return false;
}
private:
NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t mc, nc, mp, np;
switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
#if VECTOR_REGISTERS == 32
case 0x55:
mc = 5;
nc = 5;
gemm<5, 5>(m0, m, n0, n);
break;
case 0x45:
mc = 4;
nc = 5;
gemm<4, 5>(m0, m, n0, n);
break;
case 0x54:
mc = 5;
nc = 4;
gemm<5, 4>(m0, m, n0, n);
break;
case 0x44:
mc = 4;
nc = 4;
gemm<4, 4>(m0, m, n0, n);
break;
case 0x53:
mc = 5;
nc = 3;
gemm<5, 3>(m0, m, n0, n);
break;
case 0x35:
mc = 3;
nc = 5;
gemm<3, 5>(m0, m, n0, n);
break;
case 0x43:
mc = 4;
nc = 3;
gemm<4, 3>(m0, m, n0, n);
break;
#else
case 0x55:
case 0x54:
case 0x53:
case 0x45:
case 0x44:
case 0x43:
mc = 4;
nc = 3;
gemm<4, 3>(m0, m, n0, n);
break;
case 0x35:
#endif
case 0x34:
mc = 3;
nc = 4;
gemm<3, 4>(m0, m, n0, n);
break;
case 0x52:
mc = 5;
nc = 2;
gemm<5, 2>(m0, m, n0, n);
break;
case 0x33:
mc = 3;
nc = 3;
gemm<3, 3>(m0, m, n0, n);
break;
case 0x25:
mc = 2;
nc = 5;
gemm<2, 5>(m0, m, n0, n);
break;
case 0x42:
mc = 4;
nc = 2;
gemm<4, 2>(m0, m, n0, n);
break;
case 0x24:
mc = 2;
nc = 4;
gemm<2, 4>(m0, m, n0, n);
break;
case 0x32:
mc = 3;
nc = 2;
gemm<3, 2>(m0, m, n0, n);
break;
case 0x23:
mc = 2;
nc = 3;
gemm<2, 3>(m0, m, n0, n);
break;
case 0x51:
mc = 5;
nc = 1;
gemm<5, 1>(m0, m, n0, n);
break;
case 0x41:
mc = 4;
nc = 1;
gemm<4, 1>(m0, m, n0, n);
break;
case 0x22:
mc = 2;
nc = 2;
gemm<2, 2>(m0, m, n0, n);
break;
case 0x15:
mc = 1;
nc = 5;
gemm<1, 5>(m0, m, n0, n);
break;
case 0x14:
mc = 1;
nc = 4;
gemm<1, 4>(m0, m, n0, n);
break;
case 0x31:
mc = 3;
nc = 1;
gemm<3, 1>(m0, m, n0, n);
break;
case 0x13:
mc = 1;
nc = 3;
gemm<1, 3>(m0, m, n0, n);
break;
case 0x21:
mc = 2;
nc = 1;
gemm<2, 1>(m0, m, n0, n);
break;
case 0x12:
mc = 1;
nc = 2;
gemm<1, 2>(m0, m, n0, n);
break;
case 0x11:
mc = 1;
nc = 1;
gemm<1, 1>(m0, m, n0, n);
break;
default:
return;
template <int RM, int RN, int BM>
inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
if (SIZE_N == RN) {
return gemm<RM, RN, BM>(m, n, BN);
}
if constexpr (RN > 1) {
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
} else {
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
GGML_ASSERT(false); // we have miss something.
}
mp = m0 + (m - m0) / mc * mc;
np = n0 + (n - n0) / nc * nc;
mnpack(mp, m, n0, np);
mnpack(m0, m, np, n);
}
template <int RM, int RN>
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * RN;
D Cv[RN][RM] = {};
for (int64_t l = 0; l < k; l += KN)
for (int64_t j = 0; j < RN; ++j)
for (int64_t i = 0; i < RM; ++i)
Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
load<V>(B + ldb * (jj + j) + l),
Cv[j][i]);
for (int64_t j = 0; j < RN; ++j)
for (int64_t i = 0; i < RM; ++i)
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
inline void gemm_bloc(int64_t ii, int64_t jj) {
D Cv[RN][RM] = {};
for (int64_t l = 0; l < k; l += KN) {
// help compiler for op order.
if constexpr (RM <= RN) {
V Av[RM];
for (int64_t i = 0; i < RM; ++i) {
Av[i] = load<V>(A + lda * (ii + i) + l);
}
for (int64_t j = 0; j < RN; ++j) {
V Bv = load<V>(B + ldb * (jj + j) + l);
for (int64_t i = 0; i < RM; ++i) {
Cv[j][i] = madd(Av[i], Bv, Cv[j][i]);
}
}
} else {
V Bv[RN];
for (int64_t j = 0; j < RN; ++j) {
Bv[j] = load<V>(B + ldb * (jj + j) + l);
}
for (int64_t i = 0; i < RM; ++i) {
V Av = load<V>(A + lda * (ii + i) + l);
for (int64_t j = 0; j < RN; ++j) {
Cv[j][i] = madd(Av, Bv[j], Cv[j][i]);
}
}
}
}
for (int64_t j = 0; j < RN; ++j)
for (int64_t i = 0; i < RM; ++i)
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
}
template <int RM, int RN, int BM>
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
static std::atomic<int64_t> current_chunk;
GGML_ASSERT(m % (RM * BM) == 0);
const int64_t ytiles = m / (RM * BM);
const int64_t xtiles = (n + RN -1) / RN;
const int64_t jj_RN = (xtiles - (xtiles * RN - n));
// "round" bloc_size to "nearest" BN
const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
const int64_t nb_job = ytiles * NB_BN;
if (params->ith == 0) {
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
}
ggml_barrier(params->threadpool);
int64_t job = params->ith;
while (job < nb_job) {
const int64_t ii = (job % ytiles) * RM * BM;
const int64_t jb = job / ytiles;
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
for (int64_t bi = 0; bi < BM * RM; bi += RM) {
int64_t jj = jj0;
for (; jj < jj1; jj += RN) {
gemm_bloc<RM, RN>(ii + bi, jj);
}
if constexpr (RN > 1) {
for (; jj < jj2; jj += RN - 1) {
gemm_bloc<RM, RN-1>(ii + bi, jj);
}
}
GGML_ASSERT(jj == jj2);
}
// next step.
job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
}
ggml_barrier(params->threadpool);
return;
}
const ggml_compute_params * params;
const TA *const A;
const TB *const B;
TC *const C;
@ -452,8 +461,6 @@ class tinyBLAS {
const int64_t lda;
const int64_t ldb;
const int64_t ldc;
const int ith;
const int nth;
};
//////////////////////////////////////////////////////////////////////////////////////////
@ -1657,8 +1664,9 @@ class tinyBLAS_PPC {
* @param Ctype is GGML data type of `C`
* @return true if this function was able to service the matmul request
*/
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
int64_t ldc, int Atype, int Btype, int Ctype) {
assert(m >= 0);
assert(n >= 0);
@ -1666,8 +1674,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
assert(lda >= k);
assert(ldb >= k);
assert(ldc >= m);
assert(nth > 0);
assert(ith < nth);
assert(params->nth > 0);
assert(params->ith < params->nth);
// only enable sgemm for prompt processing
if (n < 2)
@ -1682,37 +1690,25 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
if (Btype != GGML_TYPE_F32)
return false;
#if defined(__AVX512F__)
if (k % 16)
return false;
tinyBLAS<16, __m512, __m512, float, float, float> tb{
tinyBLAS<16, __m512, __m512, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
(float *)C, ldc};
return tb.matmul(m, n);
#elif defined(__AVX__) || defined(__AVX2__)
if (k % 8)
return false;
tinyBLAS<8, __m256, __m256, float, float, float> tb{
tinyBLAS<8, __m256, __m256, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
(float *)C, ldc};
return tb.matmul(m, n);
#elif defined(__ARM_NEON)
if (n < 4)
return false;
if (k % 4)
return false;
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
(float *)C, ldc};
return tb.matmul(m, n);
#elif defined(__MMA__)
if (k % 8)
return false;
@ -1720,7 +1716,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1728,60 +1724,71 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
#endif
}
case GGML_TYPE_BF16: {
#if defined(__AVX512BF16__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__AVX512F__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__AVX2__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#endif
return false;
}
case GGML_TYPE_F16: {
#if defined(__AVX512F__)
if (k % 16)
return false;
if (Btype != GGML_TYPE_F32)
return false;
tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
k, (const ggml_fp16_t *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
if (Btype == GGML_TYPE_F16) {
tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
(const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
if (k % 8)
return false;
if (Btype != GGML_TYPE_F32)
return false;
tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{
k, (const ggml_fp16_t *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
if (Btype == GGML_TYPE_F16) {
tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
(const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
if (n < 8)
return false;
if (k % 8)
return false;
if (Btype != GGML_TYPE_F16)
return false;
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
if (Btype == GGML_TYPE_F16) {
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
if (k % 4)
return false;
if (Btype != GGML_TYPE_F32)
return false;
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
k, (const ggml_fp16_t *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
#else
return false;
if (Btype == GGML_TYPE_F32) {
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const float *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#endif
return false;
}
case GGML_TYPE_Q8_0: {
@ -1792,7 +1799,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q8_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#elif defined(__ARM_FEATURE_DOTPROD)
@ -1800,7 +1807,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q8_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1816,7 +1823,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q4_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#elif defined(__ARM_FEATURE_DOTPROD)
@ -1824,7 +1831,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q4_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1840,7 +1847,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q5_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1856,7 +1863,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_iq4_nl *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1868,6 +1875,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
return false;
}
(void)params;
(void)m;
(void)n;
(void)k;
@ -1877,8 +1885,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(void)ldb;
(void)C;
(void)ldc;
(void)ith;
(void)nth;
(void)Atype;
(void)Btype;
(void)Ctype;

View File

@ -5,8 +5,8 @@
extern "C" {
#endif
bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
const void *, int64_t, void *, int64_t, int, int,
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
const void *, int64_t, const void *, int64_t, void *, int64_t,
int, int, int);
#ifdef __cplusplus

View File

@ -1404,10 +1404,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
// spec constants and tile sizes for non-quant matmul/matmul_id
l_warptile = { 256, 128, 256, 64 };
m_warptile = { 256, 128, 128, 64 };
s_warptile = { 128, 32, 16, 64 };
s_warptile = { 128, 64, 64, 64 };
l_wg_denoms = {128, 256, 1 };
m_wg_denoms = {128, 128, 1 };
s_wg_denoms = { 32, 16, 1 };
s_wg_denoms = { 64, 64, 1 };
// spec constants and tile sizes for quant matmul (non-Qi_K)
l_warptile_mmq = { 256, 128, 256, 64 };
@ -2012,11 +2012,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, { device->subgroup_size }, 1, true);
if (device->float_controls_rte_fp16) {
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, { device->subgroup_size }, 1, true);
} else {
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, { device->subgroup_size }, 1, true);
}
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);

View File

@ -2,6 +2,7 @@
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_spirv_intrinsics: enable
#extension GL_EXT_control_flow_attributes : require
#if RTE16
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
@ -23,40 +24,64 @@ layout (push_constant) uniform parameter
#include "types.comp"
#define BLOCK_SIZE 256
layout(constant_id = 0) const uint BLOCK_SIZE = 32;
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
const uint NUM_ITER = 256 / BLOCK_SIZE;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_GlobalInvocationID.x;
if (i >= p.pelements) {
return;
}
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
const uint kx = i / ksize;
const uint kd = kx * ksize;
const uint ky = (i - kd) / p.OW;
const uint ix = i % p.OW;
const uint gidx = gl_GlobalInvocationID.x;
const uint oh = gl_GlobalInvocationID.y;
const uint batch = gl_GlobalInvocationID.z / p.IC;
const uint ic = gl_GlobalInvocationID.z % p.IC;
const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
const uint offset_dst =
((batch * p.OH + oh) * p.OW + ix) * p.CHW +
(ic * (p.KW * p.KH) + ky * p.KW + kx);
if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) {
data_d[offset_dst] = D_TYPE(0.0f);
} else {
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]);
A_TYPE values[NUM_ITER];
uint offset_dst[NUM_ITER];
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
values[idx] = A_TYPE(0);
}
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
const uint i = gidx * NUM_ITER + idx;
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
const uint kx = i / ksize;
const uint kd = kx * ksize;
const uint ky = (i - kd) / p.OW;
const uint ix = i % p.OW;
const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
offset_dst[idx] =
((batch * p.OH + oh) * p.OW + ix) * p.CHW +
(ic * (p.KW * p.KH) + ky * p.KW + kx);
if (i >= p.pelements) {
continue;
}
if (iih < p.IH && iiw < p.IW) {
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
values[idx] = data_a[offset_src + iih * p.IW + iiw];
}
}
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
const uint i = gidx * NUM_ITER + idx;
if (i >= p.pelements) {
continue;
}
data_d[offset_dst[idx]] = D_TYPE(values[idx]);
}
}

View File

@ -126,6 +126,8 @@ connection = sqlite3.connect(input_file)
cursor = connection.cursor()
builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
commit_short_len = len(builds[0][0])
try:
repo = git.Repo(".", search_parent_directories=True)
except git.InvalidGitRepositoryError:
@ -138,11 +140,11 @@ def find_parent_in_data(commit: git.Commit):
seen_hexsha8 = set()
while heap:
depth, current_commit = heapq.heappop(heap)
current_hexsha8 = commit.hexsha[:8]
current_hexsha8 = commit.hexsha[:commit_short_len]
if (current_hexsha8,) in builds:
return current_hexsha8
for parent in commit.parents:
parent_hexsha8 = parent.hexsha[:8]
parent_hexsha8 = parent.hexsha[:commit_short_len]
if parent_hexsha8 not in seen_hexsha8:
seen_hexsha8.add(parent_hexsha8)
heapq.heappush(heap, (depth + 1, parent))
@ -156,9 +158,9 @@ def get_all_parent_hexsha8s(commit: git.Commit):
while unvisited:
current_commit = unvisited.pop(0)
visited.append(current_commit.hexsha[:8])
visited.append(current_commit.hexsha[:commit_short_len])
for parent in current_commit.parents:
if parent.hexsha[:8] not in visited:
if parent.hexsha[:commit_short_len] not in visited:
unvisited.append(parent)
return visited
@ -169,10 +171,10 @@ def get_commit_name(hexsha8):
if repo is None:
return hexsha8
for h in repo.heads:
if h.commit.hexsha[:8] == hexsha8:
if h.commit.hexsha[:commit_short_len] == hexsha8:
return h.name
for t in repo.tags:
if t.commit.hexsha[:8] == hexsha8:
if t.commit.hexsha[:commit_short_len] == hexsha8:
return t.name
return hexsha8
@ -183,13 +185,13 @@ def get_commit_hexsha8(name):
return None
for h in repo.heads:
if h.name == name:
return h.commit.hexsha[:8]
return h.commit.hexsha[:commit_short_len]
for t in repo.tags:
if t.name == name:
return t.commit.hexsha[:8]
return t.commit.hexsha[:commit_short_len]
for c in repo.iter_commits("--all"):
if c.hexsha[:8] == name[:8]:
return c.hexsha[:8]
if c.hexsha[:commit_short_len] == name[:commit_short_len]:
return c.hexsha[:commit_short_len]
return None

View File

@ -26,7 +26,7 @@ function has_cmd {
}
if has_cmd wget; then
cmd="wget -q --show-progress -c -O %s/%s %s"
cmd="wget -q -c -O %s/%s %s"
elif has_cmd curl; then
cmd="curl -C - -f --output-dir %s -o %s -L %s"
else

View File

@ -3945,6 +3945,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
}
}
for (int K : {3, 5}) {
for (int IC : {256, 2560}) {
for (int IW_IH : {32, 64, 256}) {
if (IC == 2560 && IW_IH == 256) {
// too big
continue;
}
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {IW_IH, IW_IH, IC, 1}, {K, K, IC, 1}, 1, 1, 1, 1, 1, 1, true));
}
}
}
return test_cases;
}