mirror of https://github.com/ggerganov/llama.cpp.git
synced 2024-11-11 13:30:35 +00:00

Merge branch 'ggerganov:master' into master

This commit is contained in:
commit e5ca7e9507

Makefile | 6
@@ -325,9 +325,9 @@ ifdef LLAMA_DEBUG
 endif
 else
 MK_CPPFLAGS  += -DNDEBUG
-MK_CFLAGS    += -O3
-MK_CXXFLAGS  += -O3
-MK_NVCCFLAGS += -O3
+MK_CFLAGS    += -O3 -g
+MK_CXXFLAGS  += -O3 -g
+MK_NVCCFLAGS += -O3 -g
 endif

 ifdef LLAMA_SANITIZE_THREAD
@@ -1324,6 +1324,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "--no-warmup") {
+        params.warmup = false;
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {

@@ -1446,6 +1450,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
     options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
     options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+    options.push_back({ "main",        " --no-warmup", "skip warming up the model with an empty run" });
     options.push_back({ "server infill",
                         " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
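The new flag follows the existing dispatch pattern in `gpt_params_find_arg`: match the argument string, mutate the params struct, and return true so the caller stops scanning. A minimal self-contained sketch of that pattern (the struct and function names here are stand-ins, not the real `gpt_params` API):

```cpp
#include <string>

// stand-in for gpt_params; only the field used by the sketch
struct demo_params {
    bool warmup = true;
};

// returns true when the flag was recognized and consumed
static bool demo_find_arg(const std::string & arg, demo_params & params) {
    if (arg == "--no-warmup") {
        params.warmup = false; // boolean flag: no value argument follows
        return true;
    }
    return false; // unknown flag, caller reports the error
}
```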
@@ -1570,6 +1570,34 @@ class LlamaModel(Model):
         return [(self.map_tensor_name(name), data_torch)]

     def prepare_tensors(self):
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+
         super().prepare_tensors()

         if self._experts is not None:
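For readers following along on the C side, the per-frequency factor computation added above translates to plain C++ as below. This is only a sketch mirroring the math of the Python code; the function name and parameterization are illustrative, not part of the converter. The returned values are the divisors stored in the ROPE_FREQS tensor.

```cpp
#include <cmath>
#include <vector>

std::vector<float> llama3_rope_factors(float base, int dim, float factor,
                                       float low_freq_factor, float high_freq_factor,
                                       float old_context_len) {
    const float PI = 3.14159265358979323846f;
    const float low_freq_wavelen  = old_context_len / low_freq_factor;
    const float high_freq_wavelen = old_context_len / high_freq_factor;

    std::vector<float> rope_factors;
    for (int i = 0; i < dim; i += 2) {
        const float freq    = 1.0f / std::pow(base, (float) i / dim); // base^(-i/dim)
        const float wavelen = 2.0f * PI / freq;
        if (wavelen < high_freq_wavelen) {
            rope_factors.push_back(1.0f);   // high-frequency dims: left unscaled
        } else if (wavelen > low_freq_wavelen) {
            rope_factors.push_back(factor); // low-frequency dims: fully scaled
        } else {
            // smooth interpolation between the two regimes
            const float smooth = (old_context_len / wavelen - low_freq_factor)
                               / (high_freq_factor - low_freq_factor);
            rope_factors.push_back(1.0f / ((1.0f - smooth) / factor + smooth));
        }
    }
    return rope_factors;
}
```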
@@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options.
     make
     ```

-  - On Windows:
+  - On Windows (x86/x64 only, arm64 requires cmake):

     1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
     2. Extract `w64devkit` on your pc.

@@ -60,6 +60,17 @@ In order to build llama.cpp you have four different options.
     cmake -B build -G "Xcode"
     cmake --build build --config Debug
     ```
+- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
+  - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
+    - Tab Workload: Desktop-development with C++
+    - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
+  - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
+  - For Windows on ARM (arm64, WoA) build with:
+    ```bash
+    cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
+    cmake --build build-arm64-windows-llvm-release
+    ```
+  Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.

 - Using `gmake` (FreeBSD):
@@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
     } else if (type == GGML_TYPE_I8) {
         v = (float) *(int8_t *) &data[i];
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
     printf("%12.4f", v);
     sum += v;
@@ -211,8 +211,9 @@ struct lora_merge_ctx {
             }
         }

-        // if true, this tensor can be lora-merged. if false, we skip merging and just copy data to outfile
-        std::vector<std::pair<struct ggml_tensor *, bool>> base_tensors;
+        // mapping base tensor to out tensor (same shape with base, but different type)
+        // if out_tensor == nullptr, we only copy it
+        std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
         for (auto & it : base_model.tensors) {
             bool t_a = true;
             bool t_b = true;

@@ -221,22 +222,22 @@ struct lora_merge_ctx {
                 t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
             }
             auto base_tensor = it.second;
-            struct ggml_tensor * out_tensor;
             if (!t_a && !t_b) {
                 // only copy
-                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
-                ggml_set_name(out_tensor, base_tensor->name);
-                base_tensors.push_back(std::make_pair(out_tensor, false));
+                struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
+                ggml_set_name(cpy_tensor, base_tensor->name);
+                base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+                gguf_add_tensor(ctx_out, cpy_tensor);
             } else if (t_a && t_b) {
                 // need merging
-                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
-                out_tensor->type = get_out_tensor_type(base_tensor);
+                struct ggml_tensor * out_tensor = ggml_new_tensor(
+                    ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
                 ggml_set_name(out_tensor, base_tensor->name);
-                base_tensors.push_back(std::make_pair(out_tensor, true));
+                base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+                gguf_add_tensor(ctx_out, out_tensor);
             } else {
                 throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
             }
-            gguf_add_tensor(ctx_out, out_tensor);
         }

         // placeholder for the meta data

@@ -247,9 +248,9 @@ struct lora_merge_ctx {

         // process base model tensors
         size_t n_merged = 0;
-        for (auto & it : base_tensors) {
-            if (it.second) {
-                merge_tensor(it.first);
+        for (auto & it : base_to_out_tensors) {
+            if (it.second != nullptr) {
+                merge_tensor(it.first, it.second);
                 n_merged++;
             } else {
                 copy_tensor(it.first);

@@ -265,7 +266,7 @@ struct lora_merge_ctx {
         }

         printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
-        printf("%s : wrote %ld tensors to output file\n", __func__, base_tensors.size());
+        printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
     }

     void copy_tensor(struct ggml_tensor * base) {

@@ -276,7 +277,7 @@ struct lora_merge_ctx {
         zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
     }

-    void merge_tensor(struct ggml_tensor * base) {
+    void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
         std::string name_base(base->name);
         std::string name_lora_a = name_base + ".lora_a";
         std::string name_lora_b = name_base + ".lora_b";

@@ -287,14 +288,14 @@ struct lora_merge_ctx {
         std::vector<struct ggml_tensor *> inp_a(adapters.size());
         std::vector<struct ggml_tensor *> inp_b(adapters.size());
         struct ggml_init_params params {
-            /*.mem_size   =*/ ggml_tensor_overhead()*(1+adapters.size()*2),
+            /*.mem_size   =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
         };
         struct ggml_context * ctx = ggml_init(params);

         // alloc tensors
-        struct ggml_tensor * inp = ggml_dup_tensor(ctx, base);
+        struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
         for (size_t i = 0; i < adapters.size(); ++i) {
             auto t_a = adapters[i]->get_tensor(name_lora_a);
             auto t_b = adapters[i]->get_tensor(name_lora_b);

@@ -303,9 +304,21 @@ struct lora_merge_ctx {
         }
         ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);

-        // load data to backend buffer
+        // load base tensor to backend buffer
         base_model.read_tensor_data(name_base, read_buf);
-        ggml_backend_tensor_set(inp, read_buf.data(), 0, ggml_nbytes(inp));
+        if (base->type != GGML_TYPE_F32) {
+            // optionally dequantize it
+            printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
+            auto nels = ggml_nelements(inp_base);
+            ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+            std::vector<uint8_t> dequant_buf(nels * sizeof(float));
+            qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+            ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
+        } else {
+            ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
+        }
+
+        // load lora tensors to backend buffer
         for (size_t i = 0; i < adapters.size(); ++i) {
             adapters[i]->read_tensor_data(name_lora_a, read_buf);
             ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));

@@ -325,20 +338,21 @@ struct lora_merge_ctx {
         };
         struct ggml_context * ctx0 = ggml_init(params0);
         gf = ggml_new_graph(ctx0);
-        struct ggml_tensor * cur = inp;
+        struct ggml_tensor * cur = inp_base;
         for (size_t i = 0; i < adapters.size(); ++i) {
-            struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, inp_a[i]));
-            struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, inp_b[i]);
+            struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
+            struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
             // scale
             const float alpha = adapters[i]->alpha;
             const float rank  = (float) inp_b[i]->ne[0];
             const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
             delta = ggml_scale(ctx0, delta, scale);
-            cur = ggml_add(ctx0, cur, delta);
-            printf("%s : + merging from adapter[%ld]\n", __func__, i);
+            cur = ggml_add(ctx0, delta, cur);
+            printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
+            printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
         }
-        cur = ggml_cast(ctx0, cur, get_out_tensor_type(base));
+        cur = ggml_cast(ctx0, cur, out->type);
+        printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type));
         ggml_build_forward_expand(gf, cur);
         ggml_free(ctx0);
     }
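Stripped of the ggml tensor plumbing, the graph above computes a scaled low-rank update per adapter: W += scale * (B x A), with scale = alpha / rank when alpha is set. A naive row-major sketch of that math, ignoring quantization and ggml's actual memory layout:

```cpp
#include <vector>

// W (n x m) += scale * B (n x r) * A (r x m); all matrices row-major.
void lora_merge_naive(std::vector<float> & W, const std::vector<float> & A,
                      const std::vector<float> & B, int n, int m, int r, float scale) {
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < m; ++j) {
            float delta = 0.0f;
            for (int k = 0; k < r; ++k) {
                delta += B[i * r + k] * A[k * m + j]; // low-rank product
            }
            W[i * m + j] += scale * delta;
        }
    }
}
```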
@@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     }
     else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
         fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
-        exit(1); //GGML_ASSERT(false);
+        exit(1); //GGML_ABORT("fatal error");
     }
     if (m_params.verbosity > 1) {
         printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);

@@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     }
     else if (e.values.size() != (size_t)src1->ne[0]) {
         fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
-        exit(1); //GGML_ASSERT(false);
+        exit(1); //GGML_ABORT("fatal error");
     }
     ++e.ncall;
     if (m_params.verbosity > 1) {
@@ -150,7 +150,7 @@ static const char * output_format_str(output_formats format) {
         case JSON:     return "json";
         case MARKDOWN: return "md";
         case SQL:      return "sql";
-        default: GGML_ASSERT(!"invalid output format");
+        default: GGML_ABORT("invalid output format");
     }
 }

@@ -176,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) {
         case LLAMA_SPLIT_MODE_NONE:  return "none";
         case LLAMA_SPLIT_MODE_LAYER: return "layer";
         case LLAMA_SPLIT_MODE_ROW:   return "row";
-        default: GGML_ASSERT(!"invalid split mode");
+        default: GGML_ABORT("invalid split mode");
     }
 }

@@ -1326,7 +1326,7 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
         case SQL:
             return std::unique_ptr<printer>(new sql_printer());
     }
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
 }

 int main(int argc, char ** argv) {
@@ -869,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = peg_0;
         }
         else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
@@ -1,5 +1,4 @@
 <html>
-
 <head>
   <meta charset="UTF-8">
   <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />

@@ -132,12 +131,20 @@
   align-items: stretch;
 }

-.right {
+.message-controls {
   display: flex;
   flex-direction: row;
   gap: 0.5em;
   justify-content: flex-end;
 }
+.message-controls > div:nth-child(2) {
+  display: flex;
+  flex-direction: column;
+  gap: 0.5em;
+}
+.message-controls > div:nth-child(2) > div {
+  display: flex;
+  margin-left: auto;
+  gap: 0.5em;
+}

 fieldset {
   border: none;

@@ -276,6 +283,7 @@

 import { llama } from './completion.js';
 import { SchemaConverter } from './json-schema-to-grammar.mjs';

 let selected_image = false;
 var slot_id = -1;

@@ -447,6 +455,9 @@

 /* END: Support for storing prompt templates and parameters in browsers LocalStorage */

+const tts = window.speechSynthesis;
+const ttsVoice = signal(null)
+
 const llamaStats = signal(null)
 const controller = signal(null)

@@ -596,8 +607,51 @@
   });
 }

+const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
+const talkRecognition = SpeechRecognition ? new SpeechRecognition() : null;
 function MessageInput() {
-  const message = useSignal("")
+  const message = useSignal("");
+
+  const talkActive = useSignal(false);
+  const sendOnTalk = useSignal(false);
+  const talkStop = (e) => {
+    if (e) e.preventDefault();
+
+    talkActive.value = false;
+    talkRecognition?.stop();
+  }
+  const talk = (e) => {
+    e.preventDefault();
+
+    if (talkRecognition)
+      talkRecognition.start();
+    else
+      alert("Speech recognition is not supported by this browser.");
+  }
+  if(talkRecognition) {
+    talkRecognition.onstart = () => {
+      talkActive.value = true;
+    }
+    talkRecognition.onresult = (e) => {
+      if (event.results.length > 0) {
+        message.value = event.results[0][0].transcript;
+        if (sendOnTalk.value) {
+          submit(e);
+        }
+      }
+    }
+    talkRecognition.onspeechend = () => {
+      talkStop();
+    }
+  }
+
+  const ttsVoices = useSignal(tts?.getVoices() || []);
+  const ttsVoiceDefault = computed(() => ttsVoices.value.find(v => v.default));
+  if (tts) {
+    tts.onvoiceschanged = () => {
+      ttsVoices.value = tts.getVoices();
+    }
+  }

   const submit = (e) => {
     stop(e);

@@ -624,11 +678,45 @@
           value="${message}"
         />
       </div>
-      <div class="right">
-        <button type="submit" disabled=${generating.value}>Send</button>
-        <button onclick=${uploadImage}>Upload Image</button>
-        <button onclick=${stop} disabled=${!generating.value}>Stop</button>
-        <button onclick=${reset}>Reset</button>
+      <div class="message-controls">
+        <div> </div>
+        <div>
+          <div>
+            <button type="submit" disabled=${generating.value || talkActive.value}>Send</button>
+            <button disabled=${generating.value || talkActive.value} onclick=${uploadImage}>Upload Image</button>
+            <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+            <button onclick=${reset}>Reset</button>
+          </div>
+          <div>
+            <a href="#" style="cursor: help;" title="Help" onclick=${e => {
+              e.preventDefault();
+              alert(`STT supported by your browser: ${SpeechRecognition ? 'Yes' : 'No'}\n` +
+                `(TTS and speech recognition are not provided by llama.cpp)\n` +
+                `Note: STT requires HTTPS to work.`);
+            }}>[?]</a>
+            <button disabled=${generating.value} onclick=${talkActive.value ? talkStop : talk}>${talkActive.value ? "Stop Talking" : "Talk"}</button>
+            <div>
+              <input type="checkbox" id="send-on-talk" name="send-on-talk" checked="${sendOnTalk}" onchange=${(e) => sendOnTalk.value = e.target.checked} />
+              <label for="send-on-talk" style="line-height: initial;">Send after talking</label>
+            </div>
+          </div>
+          <div>
+            <a href="#" style="cursor: help;" title="Help" onclick=${e => {
+              e.preventDefault();
+              alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`);
+            }}>[?]</a>
+            <label for="tts-voices" style="line-height: initial;">Bot Voice:</label>
+            <select id="tts-voices" name="tts-voices" onchange=${(e) => ttsVoice.value = e.target.value} style="max-width: 100px;">
+              <option value="" selected="${!ttsVoice.value}">None</option>
+              ${[
+                ...(ttsVoiceDefault.value ? [ttsVoiceDefault.value] : []),
+                ...ttsVoices.value.filter(v => !v.default),
+              ].map(
+                v => html`<option value="${v.name}" selected="${ttsVoice.value === v.name}">${v.name} (${v.lang}) ${v.default ? '(default)' : ''}</option>`
+              )}
+            </select>
+          </div>
+        </div>
       </div>
     </form>
 `

@@ -659,26 +747,86 @@
     }
   }, [messages])

+  const ttsChatLineActiveIx = useSignal(undefined);
+  const ttsChatLine = (e, ix, msg) => {
+    if (e) e.preventDefault();
+
+    if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return;
+
+    const ttsVoices = tts.getVoices();
+    const voice = ttsVoices.find(v => v.name === ttsVoice.value);
+    if (!voice) return;
+
+    if (ttsChatLineActiveIx.value !== undefined) {
+      tts.cancel();
+      if (ttsChatLineActiveIx.value === ix) {
+        ttsChatLineActiveIx.value = undefined;
+        return;
+      }
+    }
+
+    ttsChatLineActiveIx.value = ix;
+    let ttsUtter = new SpeechSynthesisUtterance(msg);
+    ttsUtter.voice = voice;
+    ttsUtter.onend = e => {
+      ttsChatLineActiveIx.value = undefined;
+    };
+    tts.speak(ttsUtter);
+  }

   const isCompletionMode = session.value.type === 'completion'

+  // Try play the last bot message
+  const lastCharChatLinesIxs = useSignal([]);
+  const lastCharChatLinesIxsOld = useSignal([]);
+  useEffect(() => {
+    if (
+      !isCompletionMode
+      && lastCharChatLinesIxs.value.length !== lastCharChatLinesIxsOld.value.length
+      && !generating.value
+    ) {
+      const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1];
+      if (ix !== undefined) {
+        const msg = messages[ix];
+        ttsChatLine(null, ix, Array.isArray(msg) ? msg[1].map(m => m.content).join('') : msg);
+      }
+
+      lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value);
+    }
+  }, [generating.value]);

   const chatLine = ([user, data], index) => {
     let message
-    const isArrayMessage = Array.isArray(data)
+    const isArrayMessage = Array.isArray(data);
+    const text = isArrayMessage ?
+      data.map(msg => msg.content).join('') :
+      data;
     if (params.value.n_probs > 0 && isArrayMessage) {
       message = html`<${Probabilities} data=${data} />`
     } else {
-      const text = isArrayMessage ?
-        data.map(msg => msg.content).join('') :
-        data;
       message = isCompletionMode ?
         text :
         html`<${Markdownish} text=${template(text)} />`
     }
+
+    const fromBot = user && user === '{{char}}';
+    if (fromBot && !lastCharChatLinesIxs.value.includes(index))
+      lastCharChatLinesIxs.value.push(index);
+
     if (user) {
-      return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
+      return html`
+        <div>
+          <p key=${index}><strong>${template(user)}:</strong> ${message}</p>
+          ${
+            fromBot && ttsVoice.value
+            && html`<button disabled=${generating.value} onclick=${e => ttsChatLine(e, index, text)} aria-label=${ttsChatLineActiveIx.value === index ? 'Pause' : 'Play'}>${ ttsChatLineActiveIx.value === index ? '⏸️' : '▶️' }</div>`
+          }
+        </div>
+      `;
     } else {
       return isCompletionMode ?
         html`<span key=${index}>${message}</span>` :
-        html`<p key=${index}>${message}</p>`
+        html`<div><p key=${index}>${message}</p></div>`
     }
   };
@@ -163,7 +163,7 @@ static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
         printf(">");
         return;
     }
-    GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
+    GGML_ABORT("MultiByteToWideChar() failed in an unexpected way.");
 }

 LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));
@@ -50,9 +50,15 @@ else()
     set(GGML_BLAS_VENDOR_DEFAULT "Generic")
 endif()

+if (CMAKE_CROSSCOMPILING)
+    set(GGML_NATIVE_DEFAULT OFF)
+else()
+    set(GGML_NATIVE_DEFAULT ON)
+endif()
+
 # general
 option(GGML_STATIC "ggml: static link libraries" OFF)
-option(GGML_NATIVE "ggml: enable -march=native flag" ON)
+option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
 option(GGML_LTO    "ggml: enable link time optimization" OFF)
 option(GGML_CCACHE "ggml: use ccache if available" ON)

@@ -70,7 +76,7 @@ option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
 option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)

 # instruction set specific
-if (GGML_NATIVE)
+if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
     set(INS_ENB OFF)
 else()
     set(INS_ENB ON)
@@ -254,18 +254,8 @@

 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

-#define GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fflush(stdout); \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            ggml_print_backtrace(); \
-            abort(); \
-        } \
-    } while (0)
-
 #ifndef NDEBUG
-#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
 #elif defined(__GNUC__)
 #define GGML_UNREACHABLE() __builtin_unreachable()
 #elif defined(_MSC_VER)

@@ -274,6 +264,17 @@
 #define GGML_UNREACHABLE() ((void) 0)
 #endif

+#ifdef __cplusplus
+#define GGML_NORETURN [[noreturn]]
+#elif defined(_MSC_VER)
+#define GGML_NORETURN __declspec(noreturn)
+#else
+#define GGML_NORETURN _Noreturn
+#endif
+
+#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
+
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
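Unlike the old assert macro, `GGML_ABORT` accepts printf-style arguments, which is what lets the call sites in this commit report values such as tensor names and type ids. A small usage sketch (the enum values and functions below are illustrative stand-ins; only the two macros come from the header above):

```cpp
#include <cstddef>

static const char * demo_split_mode_name(int mode) {
    switch (mode) {
        case 0: return "none";
        case 1: return "layer";
        case 2: return "row";
        default: GGML_ABORT("invalid split mode %d", mode); // formatted message, never returns
    }
}

static void demo_check_read(size_t n_read, size_t n_expected) {
    // expands to: if (!(cond)) GGML_ABORT("GGML_ASSERT(%s) failed", "cond")
    GGML_ASSERT(n_read == n_expected);
}
```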
@@ -322,6 +323,9 @@
 extern "C" {
 #endif

+    GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
+    GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
+
     enum ggml_status {
         GGML_STATUS_ALLOC_FAILED = -2,
         GGML_STATUS_FAILED = -1,

@@ -636,8 +640,11 @@ extern "C" {
         GGML_CGRAPH_EVAL_ORDER_COUNT
     };

+    typedef uint32_t ggml_bitset_t;
+
     struct ggml_hash_set {
         size_t size;
+        ggml_bitset_t * used;
         struct ggml_tensor ** keys;
     };
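The `used` bitset is what makes resetting the hash set cheap: one bit per slot is cleared instead of zeroing every key pointer. A minimal sketch of the idea, with stand-in names rather than ggml's actual implementation:

```cpp
#include <cstdint>
#include <cstring>

// One bit per slot marks occupancy, so a reset is a memset over size/32
// words instead of rewriting `size` pointers.
typedef uint32_t demo_bitset_t;

struct demo_hash_set {
    size_t          size;
    demo_bitset_t * used; // ceil(size/32) words, one bit per slot
    const void **   keys; // only meaningful where the bit is set
};

static inline bool demo_bitset_get(const demo_bitset_t * b, size_t i) {
    return (b[i >> 5] >> (i & 31)) & 1;
}

static inline void demo_bitset_set(demo_bitset_t * b, size_t i) {
    b[i >> 5] |= 1u << (i & 31);
}

static void demo_hash_set_reset(demo_hash_set * hs) {
    // O(size/32) instead of O(size)
    memset(hs->used, 0, ((hs->size + 31) / 32) * sizeof(demo_bitset_t));
}
```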
@@ -651,7 +658,7 @@ extern "C" {
         struct ggml_tensor ** grads;
         struct ggml_tensor ** leafs;

-        struct ggml_hash_set visited_hash_table;
+        struct ggml_hash_set visited_hash_set;

         enum ggml_cgraph_eval_order order;
     };

@@ -698,8 +705,6 @@ extern "C" {
     GGML_API int64_t ggml_cycles(void);
     GGML_API int64_t ggml_cycles_per_ms(void);

-    GGML_API void ggml_print_backtrace(void);
-
     // accepts a UTF-8 path, even on Windows
     GGML_API FILE * ggml_fopen(const char * fname, const char * mode);

@@ -2005,8 +2010,8 @@ extern "C" {

     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API enum ggml_status  ggml_graph_compute (      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API enum ggml_status  ggml_graph_compute(       struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
     GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
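The comment above fixes the calling convention: plan first, allocate `work_data` if the plan asks for it, then compute. A sketch of a caller under those rules (assumes an already-built `ggml_cgraph`; error handling kept minimal):

```cpp
#include <cstdint>
#include <cstdlib>

enum ggml_status run_graph(struct ggml_cgraph * gf, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);

    uint8_t * work = NULL;
    if (plan.work_size > 0) {
        // the caller owns plan.work_data, per the header comment
        work = (uint8_t *) malloc(plan.work_size);
        plan.work_data = work;
    }

    enum ggml_status st = ggml_graph_compute(gf, &plan);
    free(work);
    return st;
}
```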
@@ -392,7 +392,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
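The added `! ((defined(_MSC_VER)) && ! defined(__clang__))` term keeps these paths away from plain MSVC, which cannot compile the inline ARM assembly they use (as the build docs above note); clang-cl defines both `_MSC_VER` and `__clang__` and still qualifies. The pattern in isolation (illustrative sketch, not the actual kernel):

```cpp
// Returns 1 when the inline-asm fast path would be compiled in.
int dot_kernel_available(void) {
#if defined(__aarch64__) && !(defined(_MSC_VER) && !defined(__clang__))
    return 1; // GCC/Clang (including clang-cl): inline ARM assembly is usable
#else
    return 0; // plain MSVC: fall back to the generic C implementation
#endif
}
```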
@@ -501,7 +501,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;

@@ -613,7 +613,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);

-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;

@@ -753,7 +753,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;

@@ -1271,7 +1271,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;

@@ -1727,7 +1727,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);

-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
@@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
         fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
-        GGML_ASSERT(!"not enough space in the buffer");
-        return;
+        GGML_ABORT("not enough space in the buffer");
     }

     void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;

@@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
             return;
         }
     }
-    GGML_ASSERT(!"out of allocated_tensors");
+    GGML_ABORT("out of allocated_tensors");
 }
 static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {

@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
             return;
         }
     }
-    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
-    GGML_ASSERT(!"tensor not found");
+    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
 }
 #endif

@@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
         // this should never happen
         fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                 __func__, size, max_avail);
-        GGML_ASSERT(!"not enough space in the buffer");
-        GGML_UNREACHABLE();
+        GGML_ABORT("not enough space in the buffer");
     }
 }

@@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
         }
     }

-    free(galloc->hash_set.keys);
+    ggml_hash_set_free(&galloc->hash_set);
     free(galloc->hash_values);
     free(galloc->bufts);
     free(galloc->buffers);

@@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 typedef struct ggml_gallocr * ggml_gallocr_t;

 static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
     return &galloc->hash_values[i];
 }

@@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {

 static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
-    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
-    memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
+    ggml_hash_set_reset(&galloc->hash_set);
+    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);

     // allocate leafs
     // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes

@@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 }

 bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
-    size_t hash_size = graph->visited_hash_table.size;
+    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
+    // add 25% margin to avoid hash collisions
+    min_hash_size += min_hash_size / 4;

     // initialize hash table
-    if (galloc->hash_set.size < hash_size) {
-        free(galloc->hash_set.keys);
-        free(galloc->hash_values);
-        galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
-        galloc->hash_values   = calloc(hash_size, sizeof(struct hash_node));
+    if (galloc->hash_set.size < min_hash_size) {
+        ggml_hash_set_free(&galloc->hash_set);
+        galloc->hash_set = ggml_hash_set_new(min_hash_size);
         GGML_ASSERT(galloc->hash_set.keys != NULL);
+
+        free(galloc->hash_values);
+        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
         GGML_ASSERT(galloc->hash_values != NULL);
-    } else {
-        // reset hash table
-        memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
-        memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
     }

     // reset allocators

@@ -817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 }

 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
     return talloc->size_max >= node_size;
 }
@@ -1055,11 +1055,10 @@ struct ggml_backend_sched {
     ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;

-    // hash keys of the nodes in the graph
-    struct ggml_hash_set hash_set;
-    // hash values
-    int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    // hash map of the nodes in the graph
+    struct ggml_hash_set  hash_set;
+    int                 * hv_tensor_backend_ids; // [hash_set.size]
+    struct ggml_tensor ** hv_tensor_copies;      // [hash_set.size][n_backends][n_copies]

     int * node_backend_ids; // [graph_size]
     int * leaf_backend_ids; // [graph_size]

@@ -1068,7 +1067,7 @@ struct ggml_backend_sched {
     int * prev_leaf_backend_ids; // [graph_size]

     // copy of the graph with modified inputs
-    struct ggml_cgraph * graph;
+    struct ggml_cgraph graph;

     // graph splits
     struct ggml_backend_sched_split * splits;

@@ -1087,19 +1086,16 @@ struct ggml_backend_sched {
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;

-    bool debug;
+    char * context_buffer;
+    size_t context_buffer_size;

-    // align context_buffer to GGML_MEM_ALIGN
-#ifdef _MSC_VER
-    __declspec(align(GGML_MEM_ALIGN))
-#else
-    __attribute__((aligned(GGML_MEM_ALIGN)))
-#endif
-    char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+    bool debug;
 };

-#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
-#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
+#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
+#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
+#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)

 // returns the priority of the backend, lower id is higher priority
 static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
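`tensor_id_copy` collapses the old two-dimensional array-of-arrays into one flat allocation indexed as `[hash_set.size][n_backends][n_copies]`. The index arithmetic in isolation (stand-in names, same row-major layout as the macro):

```cpp
#include <cstddef>

static inline size_t flat_copy_index(size_t hash_id, int backend_id, int copy_id,
                                     int n_backends, int n_copies) {
    // equivalent to: hash_id*n_backends*n_copies + backend_id*n_copies + copy_id
    return (hash_id * n_backends + backend_id) * (size_t) n_copies + copy_id;
}
```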
@@ -1169,7 +1165,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         return cur_backend_id;
     }

     // assign nodes that use weights to the backend of the weights
-    // operations with weights are preferably run on the same backend as the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
@@ -1275,7 +1270,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     sched->is_reset = false;

     struct ggml_init_params params = {
-        /* .mem_size =   */ sizeof(sched->context_buffer),
+        /* .mem_size =   */ sched->context_buffer_size,
         /* .mem_buffer = */ sched->context_buffer,
         /* .no_alloc =   */ true
     };
@@ -1284,39 +1279,43 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     sched->ctx = ggml_init(params);
     if (sched->ctx == NULL) {
-        fprintf(stderr, "%s: failed to initialize context\n", __func__);
-        GGML_ASSERT(false);
+        GGML_ABORT("%s: failed to initialize context\n", __func__);
     }

     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         int * leaf_backend_id = &tensor_backend_id(leaf);
-        if (*leaf_backend_id != -1) {
-            // do not overwrite user assignments
-            continue;
+        // do not overwrite user assignments
+        if (*leaf_backend_id == -1) {
+            *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
         }
-        *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }

     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * node_backend_id = &tensor_backend_id(node);
-        if (*node_backend_id != -1) {
-            // do not overwrite user assignments
-            continue;
-        }
-        *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
-        // src
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
+        // do not overwrite user assignments
+        if (*node_backend_id == -1) {
+            *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
+
+#if 0
+            // src
+            if (node->op == GGML_OP_NONE) {
                 continue;
             }
-            int * src_backend_id = &tensor_backend_id(src);
-            if (*src_backend_id == -1) {
-                *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
+
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    continue;
+                }
+                int * src_backend_id = &tensor_backend_id(src);
+                if (*src_backend_id == -1) {
+                    *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
+                }
             }
+#endif
         }
     }
@@ -1488,12 +1487,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }

-    // pass 4: split graph, find tensors that need to be copied
+    // pass 5: split graph, find tensors that need to be copied
     {
         int i_split = 0;
         struct ggml_backend_sched_split * split = &sched->splits[0];
         // find the backend of the first split, skipping view ops
-        for (int i = 0; i < graph->n_nodes; i++) {
+        int i = 0;
+        for (; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (!ggml_is_view_op(node->op)) {
                 split->backend_id = tensor_backend_id(node);

@@ -1502,9 +1502,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
         split->i_start = 0;
         split->n_inputs = 0;
-        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
         int cur_backend_id = split->backend_id;
-        for (int i = 0; i < graph->n_nodes; i++) {
+        for (; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];

             if (ggml_is_view_op(node->op)) {
@@ -1513,7 +1512,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

             const int node_backend_id = tensor_backend_id(node);

-            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
+            assert(node_backend_id != -1); // all nodes should be assigned by now

             // check if we should start a new split based on the sources of the current node
             bool need_new_split = false;
@@ -1527,7 +1526,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 // by starting a new split, the memory of the previously offloaded weights can be reused
                 if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                     int src_backend_id = tensor_backend_id(src);
-                    if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
+                    if (src_backend_id != cur_backend_id) {
                         need_new_split = true;
                         break;
                     }
@@ -1536,9 +1535,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 // FIXME: count the number of inputs instead of only checking when full
                 if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                     const size_t id = hash_id(src);
-                    int src_backend_id = sched->tensor_backend_id[id];
+                    int src_backend_id = sched->hv_tensor_backend_ids[id];
                     bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
-                    if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
+                    if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
                         //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                         need_new_split = true;
                         break;
@@ -1570,12 +1569,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     continue;
                 }

-                const int src_backend_id = tensor_backend_id(src);
+                size_t src_id = hash_id(src);
+                const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
                 assert(src_backend_id != -1); // all inputs should be assigned by now

                 if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
-                    size_t id = hash_id(src);
-                    if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
+                    if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
                         ggml_backend_t backend = sched->backends[src_backend_id];
                         for (int c = 0; c < sched->n_copies; c++) {
                             struct ggml_tensor * tensor_copy;
@@ -1589,7 +1588,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                             ggml_set_input(tensor_copy);
                             ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                         }
-                        sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
+                        tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
                         SET_CAUSE(tensor_copy, "4.cpy");
                     }
                     int n_graph_inputs = sched->n_graph_inputs++;
@@ -1598,11 +1597,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     }
                 }

-                bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
-                if (src_backend_id != cur_backend_id && !supported) {
+                if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                     // create a copy of the input in the split's backend
-                    const size_t id = hash_id(src);
-                    if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
+                    if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
                         ggml_backend_t backend = sched->backends[cur_backend_id];
                         for (int c = 0; c < sched->n_copies; c++) {
                             struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);

@@ -1611,14 +1608,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                             ggml_set_input(tensor_copy);
                             ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                         }
-                        sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
+                        tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
                         SET_CAUSE(tensor_copy, "4.cpy");
                     }
                     int n_inputs = split->n_inputs++;
                     GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                     split->inputs[n_inputs] = src;
                 }
-                node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
+                node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
             }
         }
     }
@@ -1630,7 +1627,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         ggml_backend_sched_print_assignments(sched, graph);
     }

-    // swap node_backend_ids and leaf_backend_ids and prevs
+    // swap node_backend_ids and leaf _backend_ids with prevs
     {
         int * tmp = sched->node_backend_ids;
         sched->node_backend_ids = sched->prev_node_backend_ids;
@@ -1641,9 +1638,19 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->prev_leaf_backend_ids = tmp;
     }

     // create copies of the graph for each split
-    // TODO: avoid this copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
+    int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    if (sched->graph.size < graph_size) {
+        sched->graph.size = graph_size;
+        sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
+        sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
+        GGML_ASSERT(sched->graph.nodes != NULL);
+        GGML_ASSERT(sched->graph.leafs != NULL);
+    }
+    sched->graph.n_nodes = 0;
+    sched->graph.n_leafs = 0;
+
+    struct ggml_cgraph * graph_copy = &sched->graph;
+
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
@@ -1654,12 +1661,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

             struct ggml_tensor * input = split->inputs[j];
             const size_t input_id = hash_id(input);
-            struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
+            struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);

             // add a dependency to the input source so that it is not freed before the copy is done
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
             input_dep->src[0] = input;
-            sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
+            sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;

             // add a dependency to the input copy so that it is allocated at the start of the split
@@ -1681,7 +1688,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 size_t id = hash_id(input);
                 int backend_id = tensor_backend_id(input);
                 for (int c = 0; c < sched->n_copies; c++) {
-                    struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                    struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                     sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
                     graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
                 }
@@ -1694,7 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 struct ggml_tensor * input = split->inputs[j];
                 size_t id = hash_id(input);
                 for (int c = 0; c < sched->n_copies; c++) {
-                    struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                    struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                     sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
                     graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
                 }
@@ -1708,13 +1715,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
         graph_copy->leafs[graph_copy->n_leafs++] = leaf;
     }
-
-    sched->graph = graph_copy;
 }

 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     bool backend_ids_changed = false;
-    for (int i = 0; i < sched->graph->n_nodes; i++) {
+    for (int i = 0; i < sched->graph.n_nodes; i++) {
         if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
             sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
             backend_ids_changed = true;
@@ -1722,7 +1727,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
         }
     }
     if (!backend_ids_changed) {
-        for (int i = 0; i < sched->graph->n_leafs; i++) {
+        for (int i = 0; i < sched->graph.n_leafs; i++) {
             if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
                 sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
                 backend_ids_changed = true;
@@ -1732,14 +1737,14 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     }

     // allocate graph
-    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
+        fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
-        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
-        if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+        ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
+        if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
             fprintf(stderr, "%s: failed to allocate graph\n", __func__);
             return false;
         }
@@ -1760,7 +1765,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         for (int j = 0; j < split->n_inputs; j++) {
             ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
+            struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);

             if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                 // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
@@ -1846,21 +1851,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));

+    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+    sched->n_backends = n_backends;
+    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+
     // initialize hash table
-    sched->hash_set          = ggml_hash_set_new(graph_size);
-    sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
-    sched->tensor_copies     = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
+    // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
+    sched->hash_set = ggml_hash_set_new(graph_size);
+    sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+    sched->hv_tensor_copies      = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));

     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
-    sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
-    sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->node_backend_ids      = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids      = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
     sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
     sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

-    sched->n_backends = n_backends;
-
-    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+    sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
+    sched->context_buffer = malloc(sched->context_buffer_size);

     const int initial_splits_capacity = 16;
     sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
@ -1895,37 +1902,37 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
}
|
||||
ggml_gallocr_free(sched->galloc);
|
||||
ggml_free(sched->ctx);
|
||||
ggml_hash_set_free(&sched->hash_set);
|
||||
free(sched->splits);
|
||||
free(sched->hash_set.keys);
|
||||
free(sched->tensor_backend_id);
|
||||
free(sched->tensor_copies);
|
||||
free(sched->hv_tensor_backend_ids);
|
||||
free(sched->hv_tensor_copies);
|
||||
free(sched->node_backend_ids);
|
||||
free(sched->leaf_backend_ids);
|
||||
free(sched->prev_node_backend_ids);
|
||||
free(sched->prev_leaf_backend_ids);
|
||||
free(sched->context_buffer);
|
||||
free(sched->graph.nodes);
|
||||
free(sched->graph.leafs);
|
||||
free(sched);
|
||||
}
|
||||
|
||||
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
||||
// reset state for the next run
|
||||
if (!sched->is_reset) {
|
||||
size_t hash_size = sched->hash_set.size;
|
||||
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
|
||||
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
|
||||
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
|
||||
|
||||
ggml_hash_set_reset(&sched->hash_set);
|
||||
memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
|
||||
memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
|
||||
sched->is_reset = true;
|
||||
}
|
||||
sched->is_alloc = false;
|
||||
}
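For context, the functions changed above form the scheduler's lifecycle. A rough usage sketch follows; the exact signatures of ggml_backend_sched_new and ggml_backend_sched_graph_compute are assumptions based on the surrounding code, not quoted from this commit:

    // hedged sketch of typical ggml_backend_sched usage
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, bufts, n_backends, graph_size, /*parallel =*/ false);
    ggml_backend_sched_reserve(sched, measure_graph);   // size buffers once with a worst-case graph
    ggml_backend_sched_reset(sched);                    // clear per-run state (hash set, backend ids)
    ggml_backend_sched_alloc_graph(sched, graph);       // split + allocate the real graph
    ggml_backend_sched_graph_compute(sched, graph);     // run the splits on their backends
    ggml_backend_sched_free(sched);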

bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

ggml_backend_sched_split_graph(sched, measure_graph);

// TODO: extract this to a separate function
if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
return false;
}

@ -1936,10 +1943,11 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
}

bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);

ggml_backend_sched_split_graph(sched, graph);

if (!ggml_backend_sched_alloc_splits(sched)) {
return false;
}
@ -2009,6 +2017,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
tensor_backend_id(node) = backend_index;
SET_CAUSE(node, "usr");
sched->is_reset = false;
}

ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
@ -2051,9 +2060,9 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
GGML_ASSERT(src != NULL);
GGML_ASSERT(src->data && "graph must be allocated");

size_t id = ggml_hash_insert(hash_set, src);
if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
return node_copies[ggml_hash_find(hash_set, src)];
size_t id = ggml_hash_insert(&hash_set, src);
if (id == GGML_HASHSET_ALREADY_EXISTS) {
return node_copies[ggml_hash_find(&hash_set, src)];
}

struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
@ -2078,7 +2087,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
return dst;
}

static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
size_t id = ggml_hash_find(hash_set, src);
if (node_init[id]) {
return;
@ -2105,10 +2114,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
}

struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
struct ggml_hash_set hash_set = {
/* .size = */ graph->visited_hash_table.size,
/* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
};
struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));

@ -2123,7 +2129,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s

if (ctx_allocated == NULL || ctx_unallocated == NULL) {
fprintf(stderr, "failed to allocate context for graph copy\n");
free(hash_set.keys);
ggml_hash_set_free(&hash_set);
free(node_copies);
free(node_init);
ggml_free(ctx_allocated);
@ -2146,7 +2152,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
if (buffer == NULL) {
fprintf(stderr, "failed to allocate buffer for graph copy\n");
free(hash_set.keys);
ggml_hash_set_free(&hash_set);
free(node_copies);
free(node_init);
ggml_free(ctx_allocated);
@ -2164,19 +2170,19 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
// copy data and init views
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
graph_copy_init_tensor(hash_set, node_copies, node_init, node);
graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
}

// build graph copy
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
graph_copy->nodes[i] = node_copy;
}
graph_copy->n_nodes = graph->n_nodes;

free(hash_set.keys);
ggml_hash_set_free(&hash_set);
free(node_copies);
free(node_init);

@ -275,8 +275,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t
break;

default:
fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
GGML_ASSERT(false);
GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
}
}
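A pattern repeated across the backends in this commit is replacing GGML_ASSERT(false) and GGML_ASSERT(!"msg") with GGML_ABORT(...), which accepts a printf-style message. The macro's definition is not part of this diff; a minimal stand-in with the same shape (names here are illustrative, not the real ggml implementation):

    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>

    // illustrative only: prints file/line plus a formatted message, then aborts
    static void example_abort(const char * file, int line, const char * fmt, ...) {
        va_list args;
        va_start(args, fmt);
        fprintf(stderr, "%s:%d: ", file, line);
        vfprintf(stderr, fmt, args);
        fputc('\n', stderr);
        va_end(args);
        abort();
    }

    #define EXAMPLE_ABORT(...) example_abort(__FILE__, __LINE__, __VA_ARGS__)

One visible benefit in the diff is that call sites can attach a formatted message, such as the unsupported op name above, instead of encoding it in a negated string literal.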

@ -120,7 +120,7 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
file, line);
GGML_CANN_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ASSERT to get a stack trace
GGML_ASSERT(!"CANN error");
GGML_ABORT("CANN error");
}

/**
@ -342,7 +342,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
// memory should always buffered. these memory may still needed by
// tasks in stream.
// TODO, fix me.
GGML_ASSERT(!"Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
}
};

@ -1559,23 +1559,18 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
return false;
}

// need open both directions for memcpyasync between devices.
ggml_cann_set_device(cann_ctx_dst->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
ggml_cann_set_device(cann_ctx_src->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));

ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
ACL_MEMCPY_DEVICE_TO_DEVICE,
cann_ctx_dst->stream()));
cann_ctx_src->stream()));

// record event on src stream
if (!cann_ctx_src->copy_event) {
ACL_CHECK(aclrtCreateEvent(&cann_ctx_src->copy_event));
}

ACL_CHECK(
aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));

// wait on dst stream for the copy to complete
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(),
cann_ctx_src->copy_event));
//TODO: workaround for Event didn`t work here.
aclrtSynchronizeStream(cann_ctx_src->stream());
} else {
// src and dst are on the same backend
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
@ -1763,8 +1758,8 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
*
* This function determines whether the CANN backend supports the given backend
* buffer type by comparing the device context of the backend and buffer type.
* It returns true if the device associated with the buffer type matches the
* device associated with the backend.
* It returns true if the devices are same between the backend context and
* buffer type context.
*
* @param backend Pointer to the CANN backend.
* @param buft Pointer to the backend buffer type to check.
@ -1773,9 +1768,14 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
*/
GGML_CALL static bool ggml_backend_cann_supports_buft(
ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cann_buffer_type_name;

GGML_UNUSED(backend);
if (ggml_backend_buft_is_cann(buft)) {
ggml_backend_cann_context * cann_ctx =
(ggml_backend_cann_context *)backend->context;
ggml_backend_cann_buffer_type_context * buft_ctx =
(ggml_backend_cann_buffer_type_context *)buft->context;
return buft_ctx->device == cann_ctx->device;
}
return false;
}

/**
@ -1874,7 +1874,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
(aclrtEvent)event->context));
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -844,7 +844,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_cann_max_pool2d(ctx, dst);
break;
case GGML_OP_POOL_COUNT:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}
@ -931,9 +931,9 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
return;
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
if (dst->type == GGML_TYPE_F32) {
if (ggml_are_same_shape(src, dst)) {
@ -955,12 +955,12 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
return;
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
// TODO
GGML_ASSERT(false);
GGML_ABORT("fatal error");
} else if (src->type == GGML_TYPE_F32) {
// TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
// && nb0 == type_size)
@ -991,10 +991,10 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
return;
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
} else {
// TODO: dst not contiguous
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
if (dst->type == GGML_TYPE_F16) {
@ -1017,11 +1017,11 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
return;
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
// TODO
GGML_ASSERT(false);
GGML_ABORT("fatal error");
} else {
if (ggml_are_same_shape(src, dst)) {
cann_copy(ctx, acl_src, acl_dst);
@ -1029,7 +1029,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ACL_CHECK(aclDestroyTensor(acl_dst));
return;
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -2219,7 +2219,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}
@ -2492,7 +2492,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_cann_mul_mat_q8_0(ctx, dst);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}

@ -98,7 +98,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
GGML_CUDA_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ASSERT to get a stack trace
GGML_ASSERT(!"CUDA error");
GGML_ABORT("CUDA error");
}

// this is faster on Windows
@ -1596,7 +1596,7 @@ static void ggml_cuda_op_mul_mat(
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

if (quantize_src1 && !src1_is_contiguous) {
@ -2945,7 +2945,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev

CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
#endif
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -81,7 +81,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
} else if (order == GGML_SORT_ORDER_DESC) {
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -259,7 +259,7 @@ static void ggml_cuda_op_bin_bcast(
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -348,7 +348,7 @@ static __device__ void no_device_code(
#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__

static __device__ __forceinline__ float warp_reduce_sum(float x) {

@ -451,7 +451,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
} else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -484,6 +484,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
} else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -662,7 +662,7 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}

@ -564,7 +564,7 @@ static void on_no_fattn_vec_case(const int D) {
fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
fprintf(stderr, "By default only f16 KV cache is supported.\n");
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
GGML_ASSERT(false);
GGML_ABORT("fatal error");
} else if (D == 128) {
fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
fprintf(stderr, "Supported combinations:\n");
@ -572,11 +572,11 @@ static void on_no_fattn_vec_case(const int D) {
fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n");
fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n");
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
GGML_ASSERT(false);
GGML_ABORT("fatal error");
} else {
fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
fprintf(stderr, "Only f16 is supported.\n");
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -287,7 +287,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break;
default: {
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break;
}
}

@ -284,7 +284,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break;
default: {
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break;
}
}

@ -38,7 +38,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
} else {
@ -63,7 +63,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
// ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
// break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}
@ -86,7 +86,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
return;
@ -114,7 +114,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
return;
@ -141,7 +141,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}

@ -171,8 +171,7 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
break;
default:
// TODO: k-quants
fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
GGML_ASSERT(false);
GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
break;
}
}

@ -84,7 +84,7 @@ void ggml_cuda_op_mul_mat_q(
mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}

@ -75,7 +75,7 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
case GGML_TYPE_IQ4_NL:
return MMQ_Q8_1_DS_LAYOUT_D4;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}
@ -2898,7 +2898,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
break;
default:
fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best);
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}

@ -162,7 +162,7 @@ static void mul_mat_vec_q_cuda(
rows_per_cuda_block = 2;
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}
@ -196,7 +196,7 @@ static void mul_mat_vec_q_cuda(
mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}
@ -413,7 +413,7 @@ void ggml_cuda_op_mul_mat_vec_q(
mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}

@ -163,7 +163,7 @@ void quantize_mmq_q8_1_cuda(
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}

@ -251,7 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
attn_factor, corr_dims, freq_factors, stream
);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
} else {
if (src0->type == GGML_TYPE_F32) {
@ -265,7 +265,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
attn_factor, corr_dims, freq_factors, stream
);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
}

@ -634,21 +634,121 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif

#define GGML_HASHTABLE_FULL ((size_t)-1)
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
// bitset

static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)

static size_t ggml_bitset_size(size_t n) {
return (n + BITSET_MASK) >> BITSET_SHR;
}

static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
}

static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
}

static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
}
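The new bitset helpers pack one flag per bit into 32-bit words: with BITSET_SHR = 5 and BITSET_MASK = 31, index i lands in word i >> 5 at bit i & 31. A small self-contained check of that arithmetic (the operations are duplicated here so the snippet compiles on its own):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        uint32_t bits[4];                             // 4 words * 32 bits = 128 flags, mirrors ggml_bitset_t
        memset(bits, 0, sizeof(bits));

        size_t i = 70;                                // word 70 >> 5 == 2, bit 70 & 31 == 6
        bits[i >> 5] |= 1u << (i & 31);               // ggml_bitset_set
        assert(bits[2] == (1u << 6));
        assert(!!(bits[i >> 5] & (1u << (i & 31))));  // ggml_bitset_get
        bits[i >> 5] &= ~(1u << (i & 31));            // ggml_bitset_clear
        assert(bits[2] == 0);
        return 0;
    }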

// hash set

#define GGML_HASHSET_FULL ((size_t)-1)
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)

struct ggml_hash_set ggml_hash_set_new(size_t size);
void ggml_hash_set_free(struct ggml_hash_set * hash_set);

bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
// returns the minimum size for a hash set that can hold min_sz elements
size_t ggml_hash_size(size_t min_sz);

// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
// remove all elements from the hash set
void ggml_hash_set_reset(struct ggml_hash_set * hash_set);

// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
// returns true if key is in the hash set
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);

// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);

// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);

// return index, asserts if table is full
size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);

// hash function for ggml_tensor
static inline size_t ggml_hash(const struct ggml_tensor * p) {
// the last 4 bits are always zero due to alignment
return (size_t)(uintptr_t)p >> 4;
}

static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t h = ggml_hash(key) % hash_set->size;

// linear probing
size_t i = h;
while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
i = (i + 1) % hash_set->size;
if (i == h) {
// visited all hash table entries -> not found
return GGML_HASHSET_FULL;
}
}
return i;
}

static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t i = ggml_hash_find(hash_set, key);
return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
}

static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t h = ggml_hash(key) % hash_set->size;

// linear probing
size_t i = h;
do {
if (!ggml_bitset_get(hash_set->used, i)) {
ggml_bitset_set(hash_set->used, i);
hash_set->keys[i] = key;
return i;
}
if (hash_set->keys[i] == key) {
return GGML_HASHSET_ALREADY_EXISTS;
}
i = (i + 1) % hash_set->size;
} while (i != h);

// visited all hash table entries -> not found
GGML_ABORT("fatal error");
}

static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t h = ggml_hash(key) % hash_set->size;

// linear probing
size_t i = h;
do {
if (!ggml_bitset_get(hash_set->used, i)) {
ggml_bitset_set(hash_set->used, i);
hash_set->keys[i] = key;
return i;
}
if (hash_set->keys[i] == key) {
return i;
}
i = (i + 1) % hash_set->size;
} while (i != h);

// visited all hash table entries -> not found
GGML_ABORT("fatal error");
}
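Taken together, the hash set now receives the set by pointer and tracks occupancy with the used bitset rather than NULL keys, which is presumably what lets ggml_hash_set_reset clear it with a single bitset wipe. A hedged usage sketch in terms of the functions declared above (tensor stands for any struct ggml_tensor *; error handling omitted):

    struct ggml_hash_set set = ggml_hash_set_new(256);   // size is rounded up via ggml_hash_size()

    size_t id = ggml_hash_insert(&set, tensor);          // slot index on first insert
    if (ggml_hash_insert(&set, tensor) == GGML_HASHSET_ALREADY_EXISTS) {
        // duplicates are reported rather than inserted twice
    }
    if (ggml_hash_contains(&set, tensor)) {
        // id is still the tensor's slot
    }

    ggml_hash_set_reset(&set);                           // clears the used bitset; keys need not be zeroed
    ggml_hash_set_free(&set);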

#ifdef __cplusplus
}

@ -566,7 +566,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
}
if ((a % b) != 0) {
fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
GGML_ASSERT(!"safe_divide result would've had remainder");
GGML_ABORT("safe_divide result would've had remainder");
}
return a / b;
}
@ -1460,7 +1460,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml

if (!ggml_vk_supports_op(dst)) {
fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
GGML_ASSERT(!"unsupported op");
GGML_ABORT("unsupported op");
}

const int32_t ne00 = src0 ? src0->ne[0] : 0;
@ -1562,7 +1562,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
default:
{
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
} break;
@ -1745,7 +1745,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
continue;
not_implemented: {}
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
//GGML_ASSERT(false);
//GGML_ABORT("fatal error");
}

// Evaluate sequence

@ -869,7 +869,7 @@ static enum ggml_status ggml_metal_graph_compute(
NSError * error = nil;
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
GGML_ASSERT(!"capture failed");
GGML_ABORT("capture failed");
}
}

@ -931,7 +931,7 @@ static enum ggml_status ggml_metal_graph_compute(

if (!ggml_metal_supports_op(ctx, dst)) {
GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
GGML_ASSERT(!"unsupported op");
GGML_ABORT("unsupported op");
}

if (should_capture) {
@ -1068,7 +1068,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
}

bcast_row = true;
@ -1077,7 +1077,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
}
}

@ -1131,7 +1131,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
}

[encoder setComputePipelineState:pipeline];
@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
default:
{
GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
} break;
case GGML_OP_SQR:
@ -1609,7 +1609,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
default: GGML_ABORT("MUL MAT-MAT not implemented");
}

[encoder setComputePipelineState:pipeline];
@ -1782,7 +1782,7 @@ static enum ggml_status ggml_metal_graph_compute(
default:
{
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
GGML_ASSERT(false && "not implemented");
GGML_ABORT("not implemented");
}
};

@ -1911,7 +1911,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
default: GGML_ABORT("MUL_MAT_ID not implemented");
}

[encoder setComputePipelineState:pipeline];
@ -2078,7 +2078,7 @@ static enum ggml_status ggml_metal_graph_compute(
default:
{
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
GGML_ASSERT(false && "not implemented");
GGML_ABORT("not implemented");
}
};

@ -2178,7 +2178,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break;
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
default: GGML_ASSERT(false && "not implemented");
default: GGML_ABORT("not implemented");
}

[encoder setComputePipelineState:pipeline];
@ -2316,13 +2316,13 @@ static enum ggml_status ggml_metal_graph_compute(
switch (src0->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
};
} else {
switch (src0->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
};
}

@ -2399,7 +2399,7 @@ static enum ggml_status ggml_metal_graph_compute(
switch (dst->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
};

[encoder setComputePipelineState:pipeline];
@ -2556,7 +2556,7 @@ static enum ggml_status ggml_metal_graph_compute(
switch (order) {
case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
};

[encoder setComputePipelineState:pipeline];
@ -2645,7 +2645,7 @@ static enum ggml_status ggml_metal_graph_compute(
{
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
GGML_METAL_LOG_ERROR("add template specialization for this size\n");
GGML_ASSERT(false && "add template specialization for this size");
GGML_ABORT("add template specialization for this size");
}
}
} else {
@ -2658,7 +2658,7 @@ static enum ggml_status ggml_metal_graph_compute(
{
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
GGML_METAL_LOG_ERROR("add template specialization for this size\n");
GGML_ASSERT(false && "add template specialization for this size");
GGML_ABORT("add template specialization for this size");
}
}
}
@ -2779,7 +2779,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break;
default: GGML_ASSERT(false && "not implemented");
default: GGML_ABORT("not implemented");
};
} break;
case GGML_TYPE_F16:
@ -2787,10 +2787,10 @@ static enum ggml_status ggml_metal_graph_compute(
switch (dstt) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
default: GGML_ASSERT(false && "not implemented");
default: GGML_ABORT("not implemented");
};
} break;
default: GGML_ASSERT(false && "not implemented");
default: GGML_ABORT("not implemented");
}

[encoder setComputePipelineState:pipeline];
@ -2818,7 +2818,7 @@ static enum ggml_status ggml_metal_graph_compute(
default:
{
GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -4190,15 +4190,18 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#endif
for (; ib < nb; ++ib) {
int sumi = 0;
int sumi0 = 0;
int sumi1 = 0;

for (int j = 0; j < qk/2; ++j) {
const int v0 = (x[ib].qs[j] & 0x0F) - 8;
const int v1 = (x[ib].qs[j] >> 4) - 8;

sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
sumi0 += (v0 * y[ib].qs[j]);
sumi1 += (v1 * y[ib].qs[j + qk/2]);
}

int sumi = sumi0 + sumi1;
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
}
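The scalar tail above (and the q4_1/q5_0/q5_1 tails below) now accumulates the low-nibble and high-nibble products in separate sumi0/sumi1 counters and combines them once per block. An isolated sketch of the pattern on plain arrays (illustrative types rather than the ggml block structs):

    #include <stdint.h>

    // one 32-element block of 4-bit weights packed two per byte, dotted
    // against int8 activations with split accumulators, as in the diff
    static int dot_q4_block(const uint8_t qs[16], const int8_t ys[32]) {
        int sumi0 = 0;
        int sumi1 = 0;
        for (int j = 0; j < 16; ++j) {
            const int v0 = (qs[j] & 0x0F) - 8;  // low nibble, offset to [-8, 7]
            const int v1 = (qs[j] >> 4)   - 8;  // high nibble
            sumi0 += v0 * ys[j];
            sumi1 += v1 * ys[j + 16];
        }
        return sumi0 + sumi1;  // combined once, mirroring int sumi = sumi0 + sumi1;
    }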
|
||||
|
||||
@ -4474,15 +4477,18 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
sumf = hsum_float_8(acc) + summs;
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
int sumi = 0;
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int v0 = (x[ib].qs[j] & 0x0F);
|
||||
const int v1 = (x[ib].qs[j] >> 4);
|
||||
|
||||
sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
|
||||
sumi0 += (v0 * y[ib].qs[j]);
|
||||
sumi1 += (v1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
@ -4823,18 +4829,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
int sumi = 0;
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
||||
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
||||
|
||||
const int32_t x0 = ((x[ib].qs[j] & 0x0F) | xh_0) - 16;
|
||||
const int32_t x1 = ((x[ib].qs[j] >> 4) | xh_1) - 16;
|
||||
const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
|
||||
const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
|
||||
|
||||
sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
|
||||
sumi0 += (x0 * y[ib].qs[j]);
|
||||
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
|
||||
}
|
||||
|
||||
@ -5194,7 +5203,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
int sumi = 0;
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
@ -5203,9 +5213,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
|
||||
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
|
||||
|
||||
sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
|
||||
sumi0 += (x0 * y[ib].qs[j]);
|
||||
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
@ -12692,7 +12704,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
||||
printf("Oops: found point %u not on grid:", u);
|
||||
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
||||
printf("\n");
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
q2[2*ib+0] |= ((uint32_t) grid_index << 8*k);
|
||||
q2[2*ib+1] |= (block_signs[k] << 7*k);
|
||||
@ -12871,7 +12883,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
||||
printf("Oops: found point %u not on grid:", u);
|
||||
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
||||
printf("\n");
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
q2[2*ib+k] = grid_index | (block_signs[k] << 9);
|
||||
}
|
||||
@ -13314,7 +13326,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
||||
printf("Oops: found point %u not on grid:", u);
|
||||
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
|
||||
printf("\n");
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
if (grid_size == 256) {
|
||||
q3[8*ib+k] = grid_index;
|
||||
@ -13527,7 +13539,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
||||
printf("Oops: found point %u not on grid:", u);
|
||||
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
|
||||
printf("\n");
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
qs[k] = grid_index & 255;
|
||||
qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
|
||||
@ -14503,7 +14515,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
||||
printf("Oops: found point %u not on grid:", u);
|
||||
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
||||
printf("\n");
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
const int i8 = 2*ib + k;
|
||||
y[ibl].qs[i8] = grid_index & 255;
|
||||
@ -14623,7 +14635,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
|
||||
}
|
||||
|
||||
if (nbytes % ggml_type_size(type) != 0) {
|
||||
fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type);
|
||||
fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -1723,7 +1723,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
||||
});
|
||||
});
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
@ -2075,8 +2075,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
|
||||
// GGML_SYCL_DEBUG("current device index %d\n", id);
|
||||
src_ptr = (char *) extra->data_device[id];
|
||||
} else {
|
||||
// GGML_SYCL_DEBUG("GGML_ASSERT(false)\n");
|
||||
GGML_ASSERT(false);
|
||||
// GGML_SYCL_DEBUG("GGML_ABORT("fatal error")\n");
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
char * dst_ptr = (char *) dst;
|
||||
|
||||
@ -2163,7 +2163,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te
|
||||
default:
|
||||
// TODO: k-quants
|
||||
fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -2192,7 +2192,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
||||
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
@ -2476,7 +2476,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
|
||||
case GGML_TYPE_Q6_K:
|
||||
return 64;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
}
|
||||
@ -3101,7 +3101,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
||||
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
|
||||
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
|
||||
@ -3896,7 +3896,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
(void) dst;
|
||||
|
@ -100,7 +100,7 @@ static void crash() {
|
||||
const char* msg) {
|
||||
fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
|
||||
fprintf(stderr, " in function %s at %s:%d\n", func, file, line);
|
||||
GGML_ASSERT(!"SYCL error");
|
||||
GGML_ABORT("SYCL error");
|
||||
}
|
||||
|
||||
#define SYCL_CHECK(err) \
|
||||
|
@ -1011,7 +1011,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||
break;
|
||||
default:
|
||||
printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -975,7 +975,7 @@ namespace dpct
|
||||
if (backend == "opencl:cpu") return 4;
|
||||
if (backend == "opencl:acc") return 5;
|
||||
printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
static bool compare_backend(std::string &backend1, std::string &backend2) {
|
||||
return convert_backend_index(backend1) < convert_backend_index(backend2);
|
||||
|
@ -1799,7 +1799,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
mmq_y = MMQ_Y_Q4_0_PASCAL;
|
||||
nwarps = NWARPS_Q4_0_PASCAL;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -1914,7 +1914,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
mmq_y = MMQ_Y_Q4_1_PASCAL;
|
||||
nwarps = NWARPS_Q4_1_PASCAL;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -2029,7 +2029,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
mmq_y = MMQ_Y_Q5_0_PASCAL;
|
||||
nwarps = NWARPS_Q5_0_PASCAL;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -2144,7 +2144,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
mmq_y = MMQ_Y_Q5_1_PASCAL;
|
||||
nwarps = NWARPS_Q5_1_PASCAL;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -2259,7 +2259,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
mmq_y = MMQ_Y_Q8_0_PASCAL;
|
||||
nwarps = NWARPS_Q8_0_PASCAL;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -2374,7 +2374,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mmq_y = MMQ_Y_Q2_K_PASCAL;
|
||||
nwarps = NWARPS_Q2_K_PASCAL;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -2497,7 +2497,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mmq_y = MMQ_Y_Q3_K_PASCAL;
|
||||
nwarps = NWARPS_Q3_K_PASCAL;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -2625,7 +2625,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mmq_y = MMQ_Y_Q4_K_PASCAL;
|
||||
nwarps = NWARPS_Q4_K_PASCAL;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -2746,7 +2746,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mmq_y = MMQ_Y_Q5_K_PASCAL;
|
||||
nwarps = NWARPS_Q5_K_PASCAL;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -2867,7 +2867,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
mmq_y = MMQ_Y_Q6_K_PASCAL;
|
||||
nwarps = NWARPS_Q6_K_PASCAL;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
@ -3016,7 +3016,7 @@ void ggml_sycl_op_mul_mat_q(
|
||||
ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1017,7 +1017,7 @@ void ggml_sycl_op_mul_mat_vec_q(
|
||||
mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -251,7 +251,7 @@ void ggml_sycl_op_rope(
|
||||
attn_factor, corr_dims, freq_factors, main_stream
|
||||
);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
} else {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
@ -265,7 +265,7 @@ void ggml_sycl_op_rope(
|
||||
attn_factor, corr_dims, freq_factors, main_stream
|
||||
);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -236,8 +236,8 @@ struct vk_device_struct {
|
||||
};
|
||||
|
||||
struct vk_buffer_struct {
|
||||
vk::Buffer buffer;
|
||||
vk::DeviceMemory device_memory;
|
||||
vk::Buffer buffer = VK_NULL_HANDLE;
|
||||
vk::DeviceMemory device_memory = VK_NULL_HANDLE;
|
||||
vk::MemoryPropertyFlags memory_property_flags;
|
||||
void * ptr;
|
||||
size_t size = 0;
|
||||
@ -1961,7 +1961,7 @@ void ggml_vk_instance_init() {
|
||||
// Make sure at least one device exists
|
||||
if (devices.empty()) {
|
||||
std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
// Default to using all dedicated GPUs
|
||||
@ -2459,7 +2459,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
// Buffer is already mapped
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
// Check if src is pinned memory
vk_buffer buf;
@ -2527,7 +2527,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
staging = ctx->device->sync_staging;
staging_offset = 0;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -2563,7 +2563,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
// Buffer is already mapped
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
// Check if src is pinned memory
vk_buffer buf = nullptr;
@ -2602,7 +2602,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
staging_buffer = dst->device->sync_staging;
staging_offset = 0;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -2704,7 +2704,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si

staging_buffer = src->device->sync_staging;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -2913,7 +2913,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
}

std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
@ -3499,7 +3499,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;

if (mmp == nullptr) {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

// Not implemented
@ -4078,7 +4078,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
std::cerr << " and " << ggml_type_name(src1->type);
}
std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

op_func(ctx, subctx, src0, src1, dst);
@ -4521,7 +4521,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
} else if (type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
fprintf(stderr, "% 7.2f ", val);
} else {
@ -4555,7 +4555,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_s;
shname = "F16_ALIGNED_S";
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
} else if (shader_size == 1) {
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@ -4571,7 +4571,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_m;
shname = "F16_ALIGNED_M";
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
} else if (shader_size == 2) {
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@ -4587,7 +4587,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_l;
shname = "F16_ALIGNED_L";
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
} else {
GGML_ASSERT(0);
@ -4668,7 +4668,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
} else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
for (size_t i = 0; i < y_ne; i++) {
@ -4679,7 +4679,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
// y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -4727,14 +4727,14 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
} else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
src0_type = GGML_TYPE_F16;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
if (std::is_same<float, Y_TYPE>()) {
src1_type = GGML_TYPE_F32;
} else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
src1_type = GGML_TYPE_F16;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch);
@ -4841,7 +4841,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
} else if (tensor->type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
fprintf(stderr, "% 7.2f ", val);
} else {
@ -5391,7 +5391,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
std::cerr << std::endl;
}

GGML_ASSERT(false);
GGML_ABORT("fatal error");
#endif

if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
@ -5486,7 +5486,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
break;
default:
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
return;
}

@ -6498,7 +6498,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
} else if (tensor->type == GGML_TYPE_I32) {
val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
fprintf(stderr, "% 7.2f ", val);
} else {
@ -6620,7 +6620,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
}
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@ -6662,7 +6662,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
}
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@ -6720,7 +6720,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
}
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@ -6797,7 +6797,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
break;
default:
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
} else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
if (src1 == nullptr) {
@ -6825,7 +6825,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
} else {
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
@ -6912,7 +6912,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
}
} else {
std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
@ -6935,7 +6935,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
std::cerr << std::endl;
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) {
first_error[0] = i0;
@ -7006,7 +7006,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
std::cerr << std::endl;
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
GGML_ASSERT(false);
GGML_ABORT("fatal error");
} else {
std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
}
865
ggml/src/ggml.c
File diff suppressed because it is too large
@ -102,6 +102,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake
#
# src/ggml.c -> ggml/src/ggml.c
# src/ggml-aarch64.c -> ggml/src/ggml-aarch64.c
# src/ggml-aarch64.h -> ggml/src/ggml-aarch64.h
# src/ggml-alloc.c -> ggml/src/ggml-alloc.c
# src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h
# src/ggml-backend.c -> ggml/src/ggml-backend.c
@ -117,6 +119,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# src/ggml-sycl/* -> ggml/src/ggml-sycl/
# src/ggml-sycl.cpp -> ggml/src/ggml-sycl.cpp
# src/ggml-vulkan.cpp -> ggml/src/ggml-vulkan.cpp
# src/vulkan-shaders/* -> ggml/src/vulkan-shaders/
#
# include/ggml.h -> ggml/include/ggml.h
# include/ggml-alloc.h -> ggml/include/ggml-alloc.h
@ -143,6 +146,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
-e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.c/\1ggml\/src\/ggml-aarch64.c/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \
@ -158,6 +163,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \
-e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \

@ -1 +1 @@
e3b3846976c94163f2b3dd128cc959782653edbb
31d544f87835a55602883fe09156bb85a4c163d8
@ -5,6 +5,8 @@ cp -rpv ../ggml/src/CMakeLists.txt ./ggml/src/CMakeLists.txt
cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake

cp -rpv ../ggml/src/ggml.c ./ggml/src/ggml.c
cp -rpv ../ggml/src/ggml-aarch64.c ./ggml/src/ggml-aarch64.c
cp -rpv ../ggml/src/ggml-aarch64.h ./ggml/src/ggml-aarch64.h
cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c
cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h
cp -rpv ../ggml/src/ggml-backend.c ./ggml/src/ggml-backend.c
@ -21,6 +23,7 @@ cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml/src/ggml-rpc.cpp
cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/
cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml/src/ggml-sycl.cpp
cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml/src/ggml-vulkan.cpp
cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/

cp -rpv ../ggml/include/ggml.h ./ggml/include/ggml.h
cp -rpv ../ggml/include/ggml-alloc.h ./ggml/include/ggml-alloc.h
@ -221,7 +221,7 @@ static void llama_grammar_advance_stack(
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
// those
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -517,7 +517,7 @@ void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struc
return;
}
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

const std::string & piece = vocab->cache_token_to_piece.at(token);

@ -13,8 +13,6 @@ struct llama_grammar {
llama_partial_utf8 partial_utf8;
};

struct llama_grammar * llama_get_grammar(struct llama_context * ctx);

//
// internal API
//

@ -152,14 +152,14 @@ static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
return strtol(buf.c_str(), NULL, 16);
}
case LLAMA_VOCAB_TYPE_BPE: {
GGML_ASSERT(false);
return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
GGML_ABORT("fatal error");
//return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
}
case LLAMA_VOCAB_TYPE_WPM: {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -1396,7 +1396,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
}
} break;
case LLAMA_VOCAB_TYPE_NONE:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

return output;
@ -1422,7 +1422,7 @@ llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch) {
return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
}
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -1606,7 +1606,7 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
break;
}
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
158
src/llama.cpp
@ -101,7 +101,6 @@
#endif

// bump if necessary
#define LLAMA_MAX_NODES 8192
#define LLAMA_MAX_LAYERS 512
#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2

@ -2259,8 +2258,7 @@ struct llama_hparams {
return n_head_arr[il];
}

GGML_ASSERT(false);
return 0;
GGML_ABORT("fatal error");
}

uint32_t n_head_kv(uint32_t il = 0) const {
@ -2268,8 +2266,7 @@ struct llama_hparams {
return n_head_kv_arr[il];
}

GGML_ASSERT(false);
return 0;
GGML_ABORT("fatal error");
}

uint32_t n_ff(uint32_t il = 0) const {
@ -2277,8 +2274,7 @@ struct llama_hparams {
return n_ff_arr[il];
}

GGML_ASSERT(false);
return 0;
GGML_ABORT("fatal error");
}

uint32_t n_gqa(uint32_t il = 0) const {
@ -2455,6 +2451,7 @@ struct llama_layer {
// long rope factors
struct ggml_tensor * rope_long = nullptr;
struct ggml_tensor * rope_short = nullptr;
struct ggml_tensor * rope_freqs = nullptr;

// bitnet scale
struct ggml_tensor * wq_scale;
@ -2657,7 +2654,6 @@ struct llama_context {
llama_context(const llama_model & model)
: model(model)
, sampling(llama_n_vocab(&model))
, grammar()
, t_start_us(model.t_start_us)
, t_load_us(model.t_load_us) {}

@ -2675,7 +2671,6 @@ struct llama_context {

struct llama_cparams cparams;
struct llama_sampling sampling;
struct llama_grammar grammar;
struct llama_kv_cache kv_self;
struct llama_control_vector cvec;

@ -2907,7 +2902,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
#elif defined(GGML_USE_CANN)
size_t total;
size_t free;
ggml_backend_cann_get_device_memory(device, &total, &free);
ggml_backend_cann_get_device_memory(device, &free, &total);
return free;
#else
return 1;
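The hunk above fixes a swapped pair of out-parameters. Assuming the declaration in ggml-cann.h reads

// GGML_API void ggml_backend_cann_get_device_memory(int32_t device, size_t * free, size_t * total);

the first pointer receives the free bytes, so the old call was storing the device total into `free` and reporting the wrong amount of available memory.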
@ -3572,6 +3567,15 @@ namespace GGUFMeta {

using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;

// TODO: update when needed or think of some clever automatic way to do this
static size_t llama_model_max_nodes(const llama_model & /*model*/) {
//if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
//    return 32768;
//}

return 8192;
}

struct llama_model_loader {
int n_kv = 0;
int n_tensors = 0;
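llama_model_max_nodes() replaces the fixed LLAMA_MAX_NODES constant everywhere a graph is sized; the hunks below apply it mechanically. A hedged sketch of the invariant the change relies on -- the graph, its meta buffer, and the scheduler must all be sized from the same budget, or the preallocated sizes disagree:

// Sketch only; names as used in this diff.
const size_t max_nodes = llama_model_max_nodes(model);            // currently always 8192
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, max_nodes, false); // graph capacity
buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); // meta buffer
// the scheduler is created with the same max_nodes (see llama_new_context_with_model below)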
@ -6056,6 +6060,8 @@ static bool llm_load_tensors(

layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));

if (n_expert == 0) {
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@ -8074,7 +8080,7 @@ static struct ggml_tensor * llm_build_moe_ffn(
cb(gate, "ffn_moe_gelu", il);
} break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
@ -8401,7 +8407,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_k_shift() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

GGML_ASSERT(kv_self.size == n_ctx);

@ -8432,7 +8438,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_s_copy() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

GGML_ASSERT(kv_self.recurrent);

@ -8455,7 +8461,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

for (uint32_t i = 0; i < ids.size(); ++i) {
const uint32_t id = ids[i];
@ -8533,6 +8539,10 @@ struct llm_build_context {
// choose long/short freq factors based on the context size
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;

if (model.layers[il].rope_freqs != nullptr) {
return model.layers[il].rope_freqs;
}

if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
return model.layers[il].rope_long;
}
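The four added lines give a per-layer rope_freqs tensor priority over the long/short YaRN factors. Reconstructed shape of build_rope_factors after this hunk (a sketch -- the final fallback to rope_short is inferred from context, not shown in the hunk):

struct ggml_tensor * build_rope_factors(int il) {
    // choose long/short freq factors based on the context size
    const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
    if (model.layers[il].rope_freqs != nullptr) {
        return model.layers[il].rope_freqs;    // llama3-style per-layer factors win
    }
    if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
        return model.layers[il].rope_long;     // long factors beyond the trained context
    }
    return model.layers[il].rope_short;        // inferred fallback
}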
@ -8637,8 +8647,8 @@ struct llm_build_context {
} break;
default:
{
GGML_ASSERT(false && "unknown pooling type");
} break;
GGML_ABORT("unknown pooling type");
}
}

cb(cur, "result_embd_pooled", -1);
@ -8696,7 +8706,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_llama() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -8727,6 +8737,9 @@ struct llm_build_context {

// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
struct ggml_tensor * rope_factors = build_rope_factors(il);

// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
@ -8750,14 +8763,14 @@ struct llm_build_context {
}

Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);

Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
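Both RoPE calls now thread rope_factors through as the frequency-factors argument instead of nullptr. The assumption this relies on (consistent with the comment above): ggml_rope_ext treats a null factors tensor as "no per-dimension scaling", so models without a rope_freqs tensor keep the exact pre-patch behavior.

// rope_factors == nullptr -> plain RoPE, identical to before this patch
// rope_factors != nullptr -> per-dimension frequency scaling (llama3-style)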
@ -8839,7 +8852,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -8893,7 +8906,7 @@ struct llm_build_context {
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
@ -8954,7 +8967,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_xverse() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -9057,7 +9070,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_falcon() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -9177,7 +9190,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_grok() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -9334,7 +9347,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_dbrx() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -9460,7 +9473,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_starcoder() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -9564,7 +9577,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_refact() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -9658,7 +9671,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_bert() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -9852,7 +9865,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_bloom() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -9953,7 +9966,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_mpt() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -10243,7 +10256,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_qwen() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -10355,7 +10368,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_qwen2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -10467,7 +10480,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_qwen2moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -10613,7 +10626,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_phi2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -10734,7 +10747,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_phi3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -10966,7 +10979,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_gpt2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -11071,7 +11084,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_codeshell() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -11182,7 +11195,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_orion() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -11300,7 +11313,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_internlm2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -11421,7 +11434,7 @@ struct llm_build_context {
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
// based on the original build_llama() function
struct ggml_cgraph * build_minicpm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -11565,7 +11578,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_gemma() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head_k = hparams.n_embd_head_k;

@ -11673,7 +11686,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_gemma2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head_k = hparams.n_embd_head_k;

@ -11725,7 +11738,7 @@ struct llm_build_context {
switch (model.type) {
case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
};
cb(Qcur, "Qcur_scaled", il);

@ -11808,7 +11821,7 @@ struct llm_build_context {


struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -11927,7 +11940,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_mamba() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t d_model = n_embd;
const int64_t d_conv = hparams.ssm_d_conv;
@ -12076,7 +12089,7 @@ struct llm_build_context {

struct ggml_cgraph * build_command_r() {

struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -12230,7 +12243,7 @@ struct llm_build_context {
// * removed bias
// * removed MoE
struct ggml_cgraph * build_olmo() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -12354,7 +12367,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_openelm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -12479,7 +12492,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_gptneox() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -12621,7 +12634,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_arctic() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -12753,7 +12766,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_deepseek2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -12981,7 +12994,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_bitnet() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -13121,7 +13134,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_t5() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -13438,7 +13451,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_jais() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -13530,7 +13543,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_chatglm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -13890,7 +13903,7 @@ static struct ggml_cgraph * llama_build_graph(
result = llm.build_jais();
} break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}

// add on pooling layer
@ -14048,7 +14061,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
f = -INFINITY;
} else {
if (hparams.use_alibi) {
f = -fabs(lctx.kv_self.cells[i].pos - pos);
f = -std::abs(lctx.kv_self.cells[i].pos - pos);
} else {
f = 0.0f;
}
@ -14102,7 +14115,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
if (batch.seq_id[i][s] == seq_id) {
if (hparams.use_alibi) {
f = -fabs(batch.pos[i] - batch.pos[j]);
f = -std::abs(batch.pos[i] - batch.pos[j]);
} else {
f = 0.0f;
}
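Both hunks swap C's fabs for std::abs on position differences. The point is overload selection: llama_pos is an integer type and fabs takes a double, so the old code bounced the delta through double before truncating back to float, while std::abs picks the integer overload. A minimal illustration under that assumption (variable names hypothetical):

llama_pos d = p0 - p1;      // integer position delta
float a = -fabs(d);         // int -> double -> float round trip
float b = -std::abs(d);     // integer overload, single int -> float conversion
// a == b for the ranges used here; the new form just avoids the detour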
@ -14689,8 +14702,8 @@ static int llama_decode_internal(
} break;
case LLAMA_POOLING_TYPE_UNSPECIFIED:
{
GGML_ASSERT(false && "unknown pooling type");
} break;
GGML_ABORT("unknown pooling type");
}
}
}
n_outputs_prev += lctx.n_outputs;
@ -14875,9 +14888,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
// - x2 for keys and values
//const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
//const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer);
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer);

// determine which KV cells to move where
//
|
||||
// apply K-shift if needed
|
||||
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
|
||||
if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
|
||||
GGML_ASSERT(false && "Deepseek2 does not support K-shift");
|
||||
GGML_ABORT("Deepseek2 does not support K-shift");
|
||||
}
|
||||
|
||||
{
|
||||
@ -15220,7 +15233,7 @@ static void llama_tensor_dequantize_internal(
|
||||
} else if (ggml_is_quantized(tensor->type)) {
|
||||
qtype.to_float(tensor->data, f32_output, nelements);
|
||||
} else {
|
||||
GGML_ASSERT(false); // unreachable
|
||||
GGML_ABORT("fatal error"); // unreachable
|
||||
}
|
||||
return;
|
||||
}
|
||||
@ -16767,8 +16780,10 @@ struct llama_context * llama_new_context_with_model(
|
||||
}
|
||||
}
|
||||
|
||||
const size_t max_nodes = llama_model_max_nodes(*model);
|
||||
|
||||
// buffer used to store the computation graph and the tensor meta data
|
||||
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
|
||||
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
|
||||
|
||||
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
||||
bool pipeline_parallel =
|
||||
@ -16781,7 +16796,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
// currently this is only implemented in the CUDA backend
|
||||
pipeline_parallel = false;
|
||||
#endif
|
||||
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
|
||||
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
|
||||
|
||||
if (pipeline_parallel) {
|
||||
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
|
||||
@ -16833,10 +16848,6 @@ const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
|
||||
return &ctx->model.vocab;
|
||||
}
|
||||
|
||||
struct llama_grammar * llama_get_grammar(struct llama_context * ctx) {
|
||||
return &ctx->grammar;
|
||||
}
|
||||
|
||||
uint32_t llama_n_ctx(const struct llama_context * ctx) {
|
||||
return ctx->cparams.n_ctx;
|
||||
}
|
||||
@ -16910,8 +16921,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
|
||||
// all model arches should be listed explicitly here
|
||||
case LLM_ARCH_UNKNOWN:
|
||||
GGML_ASSERT(false && "unknown architecture");
|
||||
break;
|
||||
GGML_ABORT("unknown architecture");
|
||||
}
|
||||
|
||||
return LLAMA_ROPE_TYPE_NONE;
|
||||
@ -18475,7 +18485,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
|
||||
#ifndef NDEBUG
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
#endif
|
||||
return nullptr;
|
||||
}
|
||||
@ -18520,7 +18530,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
|
||||
#ifndef NDEBUG
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
#endif
|
||||
return nullptr;
|
||||
}
|
||||
|
@ -94,7 +94,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
// This is going to create some weird integers though.
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

@ -132,7 +132,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
tt.to_float(&buf[i], vq.data(), bs);
tv.insert(tv.end(), vq.begin(), vq.end());
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
}
@ -1435,7 +1435,7 @@ struct test_argsort : public test_case {
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
}
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
}
@ -2462,7 +2462,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
return true;
}

GGML_ASSERT(false);
GGML_ABORT("fatal error");
return false;
}
@ -146,7 +146,7 @@ int main(void) {
auto fmt_sys = [&](std::string tmpl) {
auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
printf("-------------------------\n", output.c_str());
printf("-------------------------\n");
return output;
};
assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n");
@ -165,7 +165,7 @@ int main(void) {
auto fmt_single = [&](std::string tmpl) {
auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
printf("-------------------------\n", output.c_str());
printf("-------------------------\n");
return output;
};
assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
@ -166,12 +166,12 @@ static void test_sampler_queue(
for (auto s : samplers_sequence) {
switch (s){
case 'k': llama_sample_top_k    (nullptr, &candidates_p, top_k, 1); break;
case 'f': GGML_ASSERT(false && "tail_free test not implemented"); break;
case 'y': GGML_ASSERT(false && "typical test not implemented"); break;
case 'f': GGML_ABORT("tail_free test not implemented"); break;
case 'y': GGML_ABORT("typical test not implemented"); break;
case 'p': llama_sample_top_p    (nullptr, &candidates_p, top_p, 1); break;
case 'm': llama_sample_min_p    (nullptr, &candidates_p, min_p, 1); break;
case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
default : GGML_ASSERT(false && "Unknown sampler"); break;
case 't': GGML_ABORT("temperature test not implemented"); break;
default : GGML_ABORT("Unknown sampler"); break;
}

llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
@ -222,7 +222,7 @@ static void test_sampler_queue(
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}