mirror of https://github.com/ggerganov/llama.cpp.git

commit d6a04f872d (parent: c9c8575a1a)

ggml : hide ggml_object, ggml_cgraph, ggml_hash_set (#9408)

* ggml : hide ggml_object, ggml_cgraph, ggml_hash_set
* ggml : add ggml-impl.h to backends
* ggml : fix compiler warnings
* ggml : add assert upon adding nodes
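In short: struct ggml_object, struct ggml_cgraph and struct ggml_hash_set move out of the public ggml.h into the internal ggml-impl.h, and callers switch from poking at graph fields to the accessor functions added in this diff (ggml_graph_size, ggml_graph_node, ggml_graph_nodes, ggml_graph_n_nodes, ggml_graph_add_node). A minimal sketch of the migration (print_graph is an illustrative helper name, not part of the API):

#include <stdio.h>
#include "ggml.h"

// Before this commit callers reached into the struct directly:
//     struct ggml_tensor * last = gf->nodes[gf->n_nodes - 1];
// After it, struct ggml_cgraph is opaque and the same data goes through
// accessors; negative indices count from the end of the node list.
static void print_graph(struct ggml_cgraph * gf) {
    printf("last node: %s\n", ggml_graph_node(gf, -1)->name);
    for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
        printf("node %3d: %s\n", i, ggml_graph_node(gf, i)->name);
    }
}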
@@ -183,7 +183,7 @@ int main(int argc, char ** argv) {

     ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);

-    TENSOR_DUMP(gf->nodes[0]);
+    TENSOR_DUMP(ggml_graph_node(gf, 0));

     printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));

@@ -224,7 +224,7 @@ int main(int argc, char ** argv) {

     // Let's use the F32 result from above as a reference for the quantized multiplication
-    float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
+    float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));

     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
     printf("=====================================================================================\n");

@@ -252,7 +252,7 @@ int main(int argc, char ** argv) {

     // Check that the matrix multiplication result is in the right ballpark
     // We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
-    float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
+    float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
     float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
     float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
@@ -226,8 +226,8 @@ static ggml_status compute_piter(
     result.eigenvectors.resize(params.n_batch);
     result.distances.resize(params.n_batch);
     // get output nodes
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        auto node = gf->nodes[i];
+    for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+        auto node = ggml_graph_node(gf, i);
         int iter = -1;
         // find b_tensor (without copying data from device)
         if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
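Scanning every node and matching on tensor names, as compute_piter does above, is a recurring idiom in this diff. A standalone sketch of it with the new accessors (find_node_by_prefix is an illustrative name, not part of the API):

#include <string.h>
#include "ggml.h"

// Illustrative helper: return the first graph node whose name starts with
// the given prefix, or NULL if none does.
static struct ggml_tensor * find_node_by_prefix(struct ggml_cgraph * gf, const char * prefix) {
    const size_t n = strlen(prefix);
    for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
        struct ggml_tensor * node = ggml_graph_node(gf, i);
        if (strncmp(node->name, prefix, n) == 0) {
            return node;
        }
    }
    return NULL;
}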
@@ -370,7 +370,7 @@ struct lora_merge_ctx {

     // write data to output file
     {
-        auto result = gf->nodes[gf->n_nodes - 1];
+        auto * result = ggml_graph_node(gf, -1);
         size_t len = ggml_nbytes(result);
         if (read_buf.size() < len) {
             read_buf.resize(len);
@@ -2449,7 +2449,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     ggml_backend_graph_compute(ctx->backend, gf);

     // the last node is the embedding tensor
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);

     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

@@ -184,7 +184,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
     ggml_build_forward_expand(gf, flatten);
     ggml_graph_compute_with_ctx(model.ctx, gf, 1);
-    struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor* result = ggml_graph_node(gf, -1);

     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
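Both hunks read the final graph node as the model output after computing the graph. The same pattern in isolation, assuming the graph has already been computed on some backend (copy_output and the malloc'd buffer are illustrative, not from clip.cpp):

#include <stdlib.h>
#include "ggml.h"
#include "ggml-backend.h"

// Illustrative helper: copy the data of the last graph node (commonly the
// model output) into a freshly allocated host buffer; the caller frees it.
static void * copy_output(struct ggml_cgraph * gf) {
    struct ggml_tensor * out = ggml_graph_node(gf, -1); // last node
    void * buf = malloc(ggml_nbytes(out));
    ggml_backend_tensor_get(out, buf, 0, ggml_nbytes(out));
    return buf;
}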
@@ -358,6 +358,7 @@ extern "C" {

     struct ggml_object;
     struct ggml_context;
+    struct ggml_cgraph;

     // NOTE: always add types at the end of the enum to keep backward compatibility
     enum ggml_type {
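The added forward declaration is what makes the hiding work: ggml.h now only promises that struct ggml_cgraph exists, so client code can hold a ggml_cgraph pointer but can no longer touch its fields. A generic sketch of this opaque-pointer pattern, with illustrative names rather than real ggml ones:

/* public.h: the header clients see */
struct widget;                    /* incomplete type: layout is hidden */
struct widget * widget_new(void);
int             widget_count(const struct widget * w);

/* widget.c: the only translation unit that knows the layout */
#include <stdlib.h>
struct widget {
    int count;
};
struct widget * widget_new(void) {
    return calloc(1, sizeof(struct widget));
}
int widget_count(const struct widget * w) {
    return w->count; /* field access compiles here, not in clients */
}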
@@ -575,23 +576,9 @@ extern "C" {
         GGML_TENSOR_FLAG_PARAM = 4,
     };

-    // ggml object
-    struct ggml_object {
-        size_t offs;
-        size_t size;
-
-        struct ggml_object * next;
-
-        enum ggml_object_type type;
-
-        char padding[4];
-    };
-
-    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
     // n-dimensional tensor
     struct ggml_tensor {
         enum ggml_type type;

         GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");

@@ -655,7 +642,7 @@ extern "C" {

     struct ggml_threadpool; // forward declaration, see ggml.c

     typedef struct ggml_threadpool * ggml_threadpool_t;

     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287

@@ -671,35 +658,6 @@ extern "C" {
         void * abort_callback_data;
     };

-    enum ggml_cgraph_eval_order {
-        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
-        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
-        GGML_CGRAPH_EVAL_ORDER_COUNT
-    };
-
-    typedef uint32_t ggml_bitset_t;
-
-    struct ggml_hash_set {
-        size_t size;
-        ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
-        struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
-    };
-
-    // computation graph
-    struct ggml_cgraph {
-        int size;
-        int n_nodes;
-        int n_leafs;
-
-        struct ggml_tensor ** nodes;
-        struct ggml_tensor ** grads;
-        struct ggml_tensor ** leafs;
-
-        struct ggml_hash_set visited_hash_set;
-
-        enum ggml_cgraph_eval_order order;
-    };
-
     // scratch buffer
     struct ggml_scratch {
         size_t offs;

@@ -2017,8 +1975,6 @@ extern "C" {
     typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
     typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);

-    #define GGML_N_TASKS_MAX -1
-
     GGML_API struct ggml_tensor * ggml_map_custom1(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -2088,30 +2044,35 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * tensor);

     GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
     GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);

     // graph allocation in a context
     GGML_API struct ggml_cgraph * ggml_new_graph        (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
-    GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
+    GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
     GGML_API struct ggml_cgraph * ggml_graph_dup        (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph   ggml_graph_view       (struct ggml_cgraph * cgraph, int i0, int i1);
     GGML_API void                 ggml_graph_cpy        (struct ggml_cgraph * src, struct ggml_cgraph * dst);
     GGML_API void                 ggml_graph_reset      (struct ggml_cgraph * cgraph); // zero grads
     GGML_API void                 ggml_graph_clear      (struct ggml_cgraph * cgraph);
+
+    GGML_API int                   ggml_graph_size   (struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_tensor *  ggml_graph_node   (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+    GGML_API struct ggml_tensor ** ggml_graph_nodes  (struct ggml_cgraph * cgraph);
+    GGML_API int                   ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
+
+    GGML_API void   ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

     GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params *p, int n_threads);
-    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
-    GGML_API struct ggml_threadpool*       ggml_threadpool_new           (struct ggml_threadpool_params * params);
+    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+    GGML_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params * params);
     GGML_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
     GGML_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
     GGML_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
     GGML_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);

     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
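Taken together, these declarations are the entire public replacement for direct struct access. A hedged end-to-end sketch of the new surface, assuming the default CPU path and eliding error handling:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // graph state is now read only through accessors
    printf("capacity = %d, nodes = %d\n", ggml_graph_size(gf), ggml_graph_n_nodes(gf));
    printf("output   = %s\n", ggml_graph_node(gf, -1)->name);

    ggml_free(ctx);
    return 0;
}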
@@ -1,3 +1,4 @@
+#include "ggml-impl.h"
 #include "ggml-blas.h"
 #include "ggml-backend-impl.h"

@@ -30,6 +30,7 @@
 #include <cstring>
 #include <mutex>

+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 #include "ggml-cann/aclnn_ops.h"
 #include "ggml-cann/common.h"

@@ -1,5 +1,5 @@
 #include "ggml-cuda.h"
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"

 #include "ggml-cuda/common.cuh"
@@ -629,8 +629,16 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif

+enum ggml_cgraph_eval_order {
+    GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+    GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+    GGML_CGRAPH_EVAL_ORDER_COUNT
+};
+
 // bitset

+typedef uint32_t ggml_bitset_t;
+
 static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
 #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
 #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)

@@ -656,6 +664,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
 #define GGML_HASHSET_FULL ((size_t)-1)
 #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)

+struct ggml_hash_set {
+    size_t size;
+    ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
+    struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
+};
+
 struct ggml_hash_set ggml_hash_set_new(size_t size);
 void ggml_hash_set_free(struct ggml_hash_set * hash_set);
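BITSET_SHR and BITSET_MASK encode the standard word/bit split for a bitset built from 32-bit words: index i lives in word i >> 5, at bit i & 31. A self-contained illustration of the same arithmetic (this mirrors the ggml macros rather than reusing them):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef uint32_t bitset_t;
enum { SHR = 5, MASK = 31 }; // 32 = 1 << 5 bits per word

static void bitset_set(bitset_t * b, size_t i) { b[i >> SHR] |= (bitset_t)1 << (i & MASK); }
static int  bitset_get(bitset_t * b, size_t i) { return (b[i >> SHR] >> (i & MASK)) & 1; }

int main(void) {
    bitset_t b[2] = {0, 0};
    bitset_set(b, 37);         // word 1, bit 5
    assert( bitset_get(b, 37));
    assert(!bitset_get(b, 36));
    return 0;
}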
@@ -745,6 +759,24 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
     GGML_ABORT("fatal error");
 }

+// computation graph
+
+struct ggml_cgraph {
+    int size;
+    int n_nodes;
+    int n_leafs;
+
+    struct ggml_tensor ** nodes;
+    struct ggml_tensor ** grads;
+    struct ggml_tensor ** leafs;
+
+    struct ggml_hash_set visited_hash_set;
+
+    enum ggml_cgraph_eval_order order;
+};
+
+struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
+
 #ifdef __cplusplus
 }
 #endif
@@ -1,4 +1,4 @@
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-kompute.h"

@@ -1,7 +1,7 @@
 #import "ggml-metal.h"

+#import "ggml-impl.h"
 #import "ggml-backend-impl.h"
-#import "ggml.h"

 #import <Foundation/Foundation.h>

@@ -882,7 +882,7 @@ static enum ggml_status ggml_metal_graph_compute(
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel

     const int n_nodes  = gf->n_nodes;
     const int n_cb = ctx->n_cb;
     const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
@@ -1,5 +1,5 @@
 #include "ggml-rpc.h"
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"

 #include <cinttypes>

@@ -33,7 +33,7 @@
 #include <sycl/half_type.hpp>

 #include "ggml-sycl.h"
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"

 #include "ggml-sycl/backend.hpp"

@@ -21,7 +21,7 @@
 #include <memory>
 #include <mutex>

-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"

 #include "ggml-vulkan-shaders.hpp"
ggml/src/ggml.c
@@ -287,6 +287,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
+#define GGML_N_TASKS_MAX (-1)

 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL 2
@@ -1120,21 +1121,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 #define GGML_F32x4_ADD          vaddq_f32
 #define GGML_F32x4_MUL          vmulq_f32
 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
 #define GGML_F32x4_REDUCE(res, x)                      \
 {                                                      \
     int offset = GGML_F32_ARR >> 1;                    \
     for (int i = 0; i < offset; ++i) {                 \
-        x[i] = vaddq_f32(x[i], x[offset+i]);           \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);     \
     }                                                  \
     offset >>= 1;                                      \
     for (int i = 0; i < offset; ++i) {                 \
-        x[i] = vaddq_f32(x[i], x[offset+i]);           \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);     \
     }                                                  \
     offset >>= 1;                                      \
     for (int i = 0; i < offset; ++i) {                 \
-        x[i] = vaddq_f32(x[i], x[offset+i]);           \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);     \
     }                                                  \
-    res = GGML_F32x4_REDUCE_ONE(x[0]);                 \
+    (res) = GGML_F32x4_REDUCE_ONE((x)[0]);             \
 }

 #define GGML_F32_VEC        GGML_F32x4
|
|||||||
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
|
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
|
||||||
#define GGML_F16x8_ADD vaddq_f16
|
#define GGML_F16x8_ADD vaddq_f16
|
||||||
#define GGML_F16x8_MUL vmulq_f16
|
#define GGML_F16x8_MUL vmulq_f16
|
||||||
#define GGML_F16x8_REDUCE(res, x) \
|
#define GGML_F16x8_REDUCE(res, x) \
|
||||||
do { \
|
do { \
|
||||||
int offset = GGML_F16_ARR >> 1; \
|
int offset = GGML_F16_ARR >> 1; \
|
||||||
for (int i = 0; i < offset; ++i) { \
|
for (int i = 0; i < offset; ++i) { \
|
||||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||||
} \
|
} \
|
||||||
offset >>= 1; \
|
offset >>= 1; \
|
||||||
for (int i = 0; i < offset; ++i) { \
|
for (int i = 0; i < offset; ++i) { \
|
||||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||||
} \
|
} \
|
||||||
offset >>= 1; \
|
offset >>= 1; \
|
||||||
for (int i = 0; i < offset; ++i) { \
|
for (int i = 0; i < offset; ++i) { \
|
||||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||||
} \
|
} \
|
||||||
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
|
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
|
||||||
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
|
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
|
||||||
res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
|
(res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define GGML_F16_VEC GGML_F16x8
|
#define GGML_F16_VEC GGML_F16x8
|
||||||
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
|
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
|
||||||
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
|
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
|
||||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
|
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
|
||||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
|
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
|
||||||
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
|
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
|
||||||
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
|
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
|
||||||
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
|
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
|
||||||
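These two hunks are the "ggml : fix compiler warnings" part of the commit message: every macro parameter is parenthesized before being indexed or assigned, so the macros stay correct when the argument is an expression rather than a plain identifier. A deliberately tiny illustration of the failure mode the parentheses prevent (not ggml code):

#include <assert.h>

#define FIRST_BAD(x)  x[0]   // wrong when x is an expression
#define FIRST_GOOD(x) (x)[0] // always indexes the whole argument

int main(void) {
    int a[2] = {1, 2};
    int b[2] = {3, 4};
    int flag = 0;
    // FIRST_BAD(flag ? a : b) would expand to: flag ? a : b[0]
    // which does not even type-check; FIRST_GOOD expands to (flag ? a : b)[0].
    assert(FIRST_GOOD(flag ? a : b) == 3);
    return 0;
}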
@@ -1893,6 +1894,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
 #endif

+//
+// ggml object
+//
+
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    enum ggml_object_type type;
+
+    char padding[4];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 //
 // ggml context
 //
@@ -19161,6 +19179,34 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
     ggml_hash_set_reset(&cgraph->visited_hash_set);
 }

+int ggml_graph_size(struct ggml_cgraph * cgraph) {
+    return cgraph->size;
+}
+
+struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
+    if (i < 0) {
+        GGML_ASSERT(cgraph->n_nodes + i >= 0);
+        return cgraph->nodes[cgraph->n_nodes + i];
+    }
+
+    GGML_ASSERT(i < cgraph->n_nodes);
+    return cgraph->nodes[i];
+}
+
+struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
+    return cgraph->nodes;
+}
+
+int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
+    return cgraph->n_nodes;
+}
+
+void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
+    GGML_ASSERT(cgraph->size > cgraph->n_nodes);
+    cgraph->nodes[cgraph->n_nodes] = tensor;
+    cgraph->n_nodes++;
+}
+
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
 static void set_numa_thread_affinity(int thread_n) {
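ggml_graph_node implements the negative-index convention documented in the header, and ggml_graph_add_node carries the "assert upon adding nodes" from the commit message: appending past the graph's preallocated size now aborts instead of silently writing out of bounds. A hedged illustration of both behaviors (illustrate is not a real API):

#include "ggml.h"

// Assumes gf was created with ggml_new_graph_custom(ctx, /*size =*/ 2, false)
// and already holds two nodes.
static void illustrate(struct ggml_cgraph * gf, struct ggml_tensor * t) {
    // two equivalent ways to reach the last node
    struct ggml_tensor * a = ggml_graph_node(gf, -1);
    struct ggml_tensor * b = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
    GGML_ASSERT(a == b);

    // would trip GGML_ASSERT(cgraph->size > cgraph->n_nodes):
    // ggml_graph_add_node(gf, t); // graph is already at capacity
    (void) t;
}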
@@ -9877,8 +9877,8 @@ struct llm_build_context {
     struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
         // find result_norm tensor for input
         struct ggml_tensor * inp = nullptr;
-        for (int i = gf->n_nodes - 1; i >= 0; --i) {
-            inp = gf->nodes[i];
+        for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+            inp = ggml_graph_node(gf, i);
             if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
                 break;
             } else {
@@ -16207,8 +16207,8 @@ static int llama_decode_internal(
     ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);

     // the output is always the last tensor in the graph
-    struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
+    struct ggml_tensor * res  = ggml_graph_node(gf, -1);
+    struct ggml_tensor * embd = ggml_graph_node(gf, -2);

     if (lctx.n_outputs == 0) {
         // no output
@@ -16217,9 +16217,9 @@ static int llama_decode_internal(
     } else if (cparams.embeddings) {
         res  = nullptr; // do not extract logits for embedding case
         embd = nullptr;
-        for (int i = gf->n_nodes - 1; i >= 0; --i) {
-            if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
-                embd = gf->nodes[i];
+        for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+            if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
+                embd = ggml_graph_node(gf, i);
                 break;
             }
         }
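This backwards scan for a named output recurs across the llama.cpp call sites in this diff; the wanted tensor is normally one of the last nodes, so searching from the end is cheap. The idiom as a standalone helper (find_node_from_back is an illustrative name, not part of llama.cpp):

#include <string.h>
#include "ggml.h"

// Illustrative helper: search the graph from the last node backwards for an
// exact tensor name; returns NULL when no node matches.
static struct ggml_tensor * find_node_from_back(struct ggml_cgraph * gf, const char * name) {
    for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
        struct ggml_tensor * node = ggml_graph_node(gf, i);
        if (strcmp(node->name, name) == 0) {
            return node;
        }
    }
    return NULL;
}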
@@ -16436,15 +16436,15 @@ static int llama_encode_internal(
     // there are two cases here
     if (llama_model_has_decoder(&lctx.model)) {
         // first case is an encoder-decoder T5 model where embeddings are passed to decoder
-        embd = gf->nodes[gf->n_nodes - 1];
+        embd = ggml_graph_node(gf, -1);
         GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
     } else {
         // second case is an encoder-only T5 model
         if (cparams.embeddings) {
             // only output embeddings if required
-            embd = gf->nodes[gf->n_nodes - 1];
+            embd = ggml_graph_node(gf, -1);
             if (strcmp(embd->name, "result_embd_pooled") != 0) {
-                embd = gf->nodes[gf->n_nodes - 2];
+                embd = ggml_graph_node(gf, -2);
             }
             GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
         }
@@ -18492,7 +18492,7 @@ struct llama_context * llama_new_context_with_model(

             // note: the number of splits during measure is higher than during inference due to the kv shift
             int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, gf->n_nodes);
+            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, ggml_graph_n_nodes(gf));
             LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
         }
     }
@@ -519,7 +519,7 @@ struct test_case {

         // add sentinels as graph nodes so that they are checked in the callback
         for (ggml_tensor * sentinel : sentinels) {
-            gf->nodes[gf->n_nodes++] = sentinel;
+            ggml_graph_add_node(gf, sentinel);
         }

         // randomize tensors
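The test harness was the main out-of-tree writer into gf->nodes; it now goes through ggml_graph_add_node, so the new capacity assert also guards the tests. The sentinel trick in isolation (add_sentinels is an illustrative name):

#include "ggml.h"

// Illustrative: append guard tensors after the real nodes so that a
// later callback can verify no op wrote past its output buffer.
static void add_sentinels(struct ggml_cgraph * gf,
                          struct ggml_tensor ** sentinels, int n) {
    for (int i = 0; i < n; ++i) {
        // asserts there is capacity left (cgraph->size > cgraph->n_nodes)
        ggml_graph_add_node(gf, sentinels[i]);
    }
}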
@@ -679,9 +679,9 @@ struct test_case {

         // duplicate the op
         size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
-        int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
+        int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
         for (int i = 1; i < n_runs; i++) {
-            gf->nodes[gf->n_nodes++] = out;
+            ggml_graph_add_node(gf, out);
         }

         // calculate memory
@@ -696,11 +696,11 @@ struct test_case {
             }
             return size;
         };
-        for (int i = 0; i < gf->n_nodes; i++) {
-            if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
+        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+            if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
                 continue;
             }
-            mem += tensor_op_size(gf->nodes[i]);
+            mem += tensor_op_size(ggml_graph_node(gf, i));
         }

         // run
@@ -804,7 +804,7 @@ struct test_case {
         ggml_graph_cpy(gf, gb);
         ggml_build_backward_expand(ctx, gf, gb, false);
         if (expect.size() != 1 || expect[0] != 0.0f) {
-            GGML_ASSERT(gb->n_nodes > gf->n_nodes);
+            GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
             for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
                 GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
             }