mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-13 20:14:29 +00:00
metal : bug-fix when enabling ggml-alloc (#2757)
* metal: better memory alloc w/ concurrency dispatch
  When a parse sequence is set, ggml-alloc should only free tensors at memory barriers.
* ggml-alloc: avoid returning silently
  In certain cases, the allocate_node() function could silently return without performing any memory allocation.
This commit is contained in:
parent
8f8c28e89c
commit
38b16dfca6
141
ggml-alloc.c
141
ggml-alloc.c
@ -68,7 +68,7 @@ struct ggml_allocr {
|
|||||||
size_t max_size;
|
size_t max_size;
|
||||||
bool measure;
|
bool measure;
|
||||||
int parse_seq[GGML_MAX_NODES];
|
int parse_seq[GGML_MAX_NODES];
|
||||||
bool has_parse_seq;
|
int parse_seq_len;
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
struct ggml_tensor * allocated_tensors[1024];
|
struct ggml_tensor * allocated_tensors[1024];
|
||||||
@ -239,14 +239,10 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
|
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
|
||||||
int pos = 0;
|
|
||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; i++) {
|
||||||
if (list[i] != -1) {
|
alloc->parse_seq[i] = list[i];
|
||||||
alloc->parse_seq[pos] = list[i];
|
|
||||||
pos++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
alloc->has_parse_seq = true;
|
alloc->parse_seq_len = n;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_allocr_reset(struct ggml_allocr * alloc) {
|
void ggml_allocr_reset(struct ggml_allocr * alloc) {
|
||||||
@ -269,7 +265,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
|||||||
/*.max_size = */ 0,
|
/*.max_size = */ 0,
|
||||||
/*.measure = */ false,
|
/*.measure = */ false,
|
||||||
/*.parse_seq = */ {0},
|
/*.parse_seq = */ {0},
|
||||||
/*.has_parse_seq = */ false,
|
/*.parse_seq_len = */ 0,
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
/*.allocated_tensors = */ = {0},
|
/*.allocated_tensors = */ = {0},
|
||||||
#endif
|
#endif
|
||||||
@ -298,7 +294,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
|||||||
/*.max_size = */ 0,
|
/*.max_size = */ 0,
|
||||||
/*.measure = */ true,
|
/*.measure = */ true,
|
||||||
/*.parse_seq = */ {0},
|
/*.parse_seq = */ {0},
|
||||||
/*.has_parse_seq = */ false,
|
/*.parse_seq_len = */ 0,
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
/*.allocated_tensors = */ = {0},
|
/*.allocated_tensors = */ = {0},
|
||||||
#endif
|
#endif
|
||||||
@ -445,8 +441,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
|||||||
else {
|
else {
|
||||||
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
||||||
node->data = parent->data;
|
node->data = parent->data;
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -497,69 +493,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
|||||||
allocate_node(alloc, input);
|
allocate_node(alloc, input);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (int ind = 0; ind < gf->n_nodes; ind++) {
|
// if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
|
||||||
int i;
|
int last_barrier_pos = 0;
|
||||||
if (alloc->has_parse_seq) {
|
int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
|
||||||
i = alloc->parse_seq[ind];
|
|
||||||
} else {
|
|
||||||
i = ind;
|
|
||||||
}
|
|
||||||
struct ggml_tensor * node = gf->nodes[i];
|
|
||||||
|
|
||||||
// allocate parents (leafs)
|
for (int ind = 0; ind < n_nodes; ind++) {
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
// allocate a node if there is no parse_seq or this is not a barrier
|
||||||
struct ggml_tensor * parent = node->src[j];
|
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
|
||||||
if (parent == NULL) {
|
int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
|
||||||
break;
|
struct ggml_tensor * node = gf->nodes[i];
|
||||||
|
|
||||||
|
// allocate parents (leafs)
|
||||||
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
|
struct ggml_tensor * parent = node->src[j];
|
||||||
|
if (parent == NULL) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
allocate_node(alloc, parent);
|
||||||
}
|
}
|
||||||
allocate_node(alloc, parent);
|
|
||||||
|
// allocate node
|
||||||
|
allocate_node(alloc, node);
|
||||||
|
|
||||||
|
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
|
||||||
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
|
struct ggml_tensor * parent = node->src[j];
|
||||||
|
if (parent == NULL) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
AT_PRINTF("%s", parent->name);
|
||||||
|
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
|
||||||
|
AT_PRINTF(", ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
AT_PRINTF("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// allocate node
|
|
||||||
allocate_node(alloc, node);
|
|
||||||
|
|
||||||
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
|
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
||||||
struct ggml_tensor * parent = node->src[j];
|
|
||||||
if (parent == NULL) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
AT_PRINTF("%s", parent->name);
|
|
||||||
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
|
|
||||||
AT_PRINTF(", ");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
AT_PRINTF("\n");
|
|
||||||
|
|
||||||
// update parents
|
// update parents
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
// update immediately if there is no parse_seq
|
||||||
struct ggml_tensor * parent = node->src[j];
|
// update only at barriers if there is parse_seq
|
||||||
if (parent == NULL) {
|
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
|
||||||
break;
|
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
|
||||||
|
int update_end = alloc->parse_seq_len ? ind : ind + 1;
|
||||||
|
for (int i = update_start; i < update_end; i++) {
|
||||||
|
int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
|
||||||
|
struct ggml_tensor * node = gf->nodes[node_i];
|
||||||
|
|
||||||
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
|
struct ggml_tensor * parent = node->src[j];
|
||||||
|
if (parent == NULL) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
struct hash_node * p_hn = hash_get(ht, parent);
|
||||||
|
p_hn->n_children -= 1;
|
||||||
|
|
||||||
|
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
|
||||||
|
|
||||||
|
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
||||||
|
if (ggml_is_view(parent)) {
|
||||||
|
struct ggml_tensor * view_src = get_view_source(parent);
|
||||||
|
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
||||||
|
view_src_hn->n_views -= 1;
|
||||||
|
AT_PRINTF("view_src %s\n", view_src->name);
|
||||||
|
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
|
||||||
|
ggml_allocator_free_tensor(alloc, view_src);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (parent->data != node->data) {
|
||||||
|
ggml_allocator_free_tensor(alloc, parent);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
struct hash_node * p_hn = hash_get(ht, parent);
|
AT_PRINTF("\n");
|
||||||
p_hn->n_children -= 1;
|
if (alloc->parse_seq_len) {
|
||||||
|
last_barrier_pos = ind + 1;
|
||||||
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
|
|
||||||
|
|
||||||
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
|
||||||
if (ggml_is_view(parent)) {
|
|
||||||
struct ggml_tensor * view_src = get_view_source(parent);
|
|
||||||
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
|
||||||
view_src_hn->n_views -= 1;
|
|
||||||
AT_PRINTF("view_src %s\n", view_src->name);
|
|
||||||
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
|
|
||||||
ggml_allocator_free_tensor(alloc, view_src);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if (parent->data != node->data) {
|
|
||||||
ggml_allocator_free_tensor(alloc, parent);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
AT_PRINTF("\n");
|
|
||||||
}
|
}
|
||||||
// free graph outputs here that wouldn't be freed otherwise because they have no children
|
// free graph outputs here that wouldn't be freed otherwise because they have no children
|
||||||
if (outputs != NULL && outputs[g] != NULL) {
|
if (outputs != NULL && outputs[g] != NULL) {
|
||||||
|
@ -2707,11 +2707,6 @@ static struct ggml_cgraph * llm_build_falcon(
|
|||||||
struct ggml_tensor * inpFF = attn_norm;
|
struct ggml_tensor * inpFF = attn_norm;
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
|
cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
|
||||||
|
|
||||||
// TODO: this is temporary needed to introduce artificial dependency between FF and ATTN
|
|
||||||
// adding this, because there seems to be a bug in the Metal concurrency optimization
|
|
||||||
// without this line, the results are non-deterministic and wrong
|
|
||||||
cur->src[2] = attn_out;
|
|
||||||
offload_func(cur);
|
offload_func(cur);
|
||||||
|
|
||||||
cur = ggml_gelu(ctx0, cur);
|
cur = ggml_gelu(ctx0, cur);
|
||||||
|
Loading…
Reference in New Issue
Block a user