mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-13 14:29:52 +00:00
ggml : move CPU backend to a separate file (#10144)
Some checks are pending
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-musa.Dockerfile platforms:linux/amd64 tag:full-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-musa.Dockerfile platforms:linux/amd64 tag:light-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-musa.Dockerfile platforms:linux/amd64 tag:server-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Some checks are pending
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-musa.Dockerfile platforms:linux/amd64 tag:full-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-musa.Dockerfile platforms:linux/amd64 tag:light-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-musa.Dockerfile platforms:linux/amd64 tag:server-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
This commit is contained in:
parent
08828a6d7d
commit
9f40989351
21
Makefile
21
Makefile
@ -1,7 +1,6 @@
|
||||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = \
|
||||
libllava.a \
|
||||
llama-baby-llama \
|
||||
llama-batched \
|
||||
llama-batched-bench \
|
||||
llama-bench \
|
||||
@ -56,7 +55,6 @@ TEST_TARGETS = \
|
||||
tests/test-llama-grammar \
|
||||
tests/test-log \
|
||||
tests/test-model-load-cancel \
|
||||
tests/test-opt \
|
||||
tests/test-quantize-fns \
|
||||
tests/test-quantize-perf \
|
||||
tests/test-rope \
|
||||
@ -64,6 +62,7 @@ TEST_TARGETS = \
|
||||
tests/test-tokenizer-0 \
|
||||
tests/test-tokenizer-1-bpe \
|
||||
tests/test-tokenizer-1-spm
|
||||
# tests/test-opt \
|
||||
|
||||
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
|
||||
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
|
||||
@ -916,6 +915,7 @@ endif # GGML_METAL
|
||||
|
||||
OBJ_GGML += \
|
||||
ggml/src/ggml.o \
|
||||
ggml/src/ggml-cpu.o \
|
||||
ggml/src/ggml-alloc.o \
|
||||
ggml/src/ggml-backend.o \
|
||||
ggml/src/ggml-quants.o \
|
||||
@ -936,7 +936,6 @@ OBJ_COMMON = \
|
||||
common/console.o \
|
||||
common/ngram-cache.o \
|
||||
common/sampling.o \
|
||||
common/train.o \
|
||||
common/build-info.o \
|
||||
common/json-schema-to-grammar.o
|
||||
|
||||
@ -1048,6 +1047,12 @@ ggml/src/ggml.o: \
|
||||
ggml/include/ggml.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml/src/ggml-cpu.o: \
|
||||
ggml/src/ggml-cpu.c \
|
||||
ggml/include/ggml.h \
|
||||
ggml/src/ggml-common.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml/src/ggml-alloc.o: \
|
||||
ggml/src/ggml-alloc.c \
|
||||
ggml/include/ggml.h \
|
||||
@ -1213,11 +1218,6 @@ common/json-schema-to-grammar.o: \
|
||||
common/json-schema-to-grammar.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/train.o: \
|
||||
common/train.cpp \
|
||||
common/train.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/ngram-cache.o: \
|
||||
common/ngram-cache.cpp \
|
||||
common/ngram-cache.h
|
||||
@ -1390,11 +1390,6 @@ llama-bench: examples/llama-bench/llama-bench.cpp \
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-baby-llama: examples/baby-llama/baby-llama.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-export-lora: examples/export-lora/export-lora.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
|
@ -10,6 +10,7 @@ var sources = [
|
||||
"src/unicode.cpp",
|
||||
"src/unicode-data.cpp",
|
||||
"ggml/src/ggml.c",
|
||||
"ggml/src/ggml-cpu.c",
|
||||
"ggml/src/ggml-alloc.c",
|
||||
"ggml/src/ggml-backend.cpp",
|
||||
"ggml/src/ggml-quants.c",
|
||||
|
@ -66,8 +66,6 @@ add_library(${TARGET} STATIC
|
||||
ngram-cache.h
|
||||
sampling.cpp
|
||||
sampling.h
|
||||
train.cpp
|
||||
train.h
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
|
@ -1951,6 +1951,8 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
|
||||
|
||||
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||
ggml_cpu_init(); // some ARM features are detected at runtime
|
||||
|
||||
const auto & sparams = params.sparams;
|
||||
|
||||
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
||||
|
1515
common/train.cpp
1515
common/train.cpp
File diff suppressed because it is too large
Load Diff
233
common/train.h
233
common/train.h
@ -1,233 +0,0 @@
|
||||
// Various helper functions and utilities for training
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
#define LLAMA_TRAIN_MAX_NODES 16384
|
||||
|
||||
typedef std::string mt19937_state;
|
||||
|
||||
struct train_state {
|
||||
struct ggml_opt_context * opt;
|
||||
|
||||
uint64_t train_its;
|
||||
uint64_t train_samples;
|
||||
uint64_t train_tokens;
|
||||
uint64_t train_epochs;
|
||||
|
||||
size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
|
||||
mt19937_state shuffle_rng_state_current;
|
||||
mt19937_state shuffle_rng_state_next;
|
||||
size_t shuffle_sample_count;
|
||||
size_t shuffle_next_sample;
|
||||
};
|
||||
|
||||
struct train_params_common {
|
||||
const char * fn_train_data;
|
||||
const char * fn_checkpoint_in;
|
||||
const char * fn_checkpoint_out;
|
||||
const char * pattern_fn_it;
|
||||
const char * fn_latest;
|
||||
|
||||
bool print_usage;
|
||||
|
||||
int save_every;
|
||||
|
||||
uint32_t seed;
|
||||
|
||||
int n_ctx;
|
||||
int n_threads;
|
||||
int n_batch;
|
||||
int n_gradient_accumulation;
|
||||
int n_epochs;
|
||||
int n_gpu_layers;
|
||||
|
||||
bool custom_n_ctx;
|
||||
|
||||
bool use_flash;
|
||||
bool use_checkpointing;
|
||||
|
||||
std::string sample_start;
|
||||
bool include_sample_start;
|
||||
bool escape;
|
||||
bool overlapping_samples;
|
||||
bool fill_with_next_samples;
|
||||
bool separate_with_eos;
|
||||
bool separate_with_bos;
|
||||
bool sample_random_offsets;
|
||||
|
||||
bool force_reshuffle;
|
||||
|
||||
int warmup;
|
||||
int cos_decay_steps;
|
||||
float cos_decay_restart;
|
||||
float cos_decay_min;
|
||||
bool enable_restart;
|
||||
|
||||
int opt_past;
|
||||
float opt_delta;
|
||||
int opt_max_no_improvement;
|
||||
|
||||
int adam_n_iter;
|
||||
float adam_alpha;
|
||||
float adam_min_alpha;
|
||||
float adam_decay;
|
||||
int adam_decay_min_ndim;
|
||||
float adam_beta1;
|
||||
float adam_beta2;
|
||||
float adam_gclip;
|
||||
float adam_eps_f;
|
||||
};
|
||||
|
||||
typedef void (*save_train_files_callback)(void * data, struct train_state * train);
|
||||
|
||||
struct train_opt_callback_data {
|
||||
struct train_params_common * params;
|
||||
struct train_state * train;
|
||||
save_train_files_callback save_cb;
|
||||
void * save_data;
|
||||
struct llama_context * lctx;
|
||||
int last_save_iter;
|
||||
llama_token * tokens_data;
|
||||
size_t tokens_size;
|
||||
size_t * samples_begin;
|
||||
size_t * samples_size;
|
||||
size_t * shuffled_samples_offs;
|
||||
size_t * shuffled_samples_begin;
|
||||
size_t * shuffled_samples_size;
|
||||
size_t samples_count;
|
||||
struct ggml_tensor * tokens_input;
|
||||
struct ggml_tensor * target_probs;
|
||||
int first_iter;
|
||||
int first_epoch;
|
||||
int iter_at_last_epoch;
|
||||
int64_t last_time;
|
||||
double millis_per_iter;
|
||||
};
|
||||
|
||||
struct train_state * init_train_state();
|
||||
void free_train_state(struct train_state * state);
|
||||
|
||||
struct train_params_common get_default_train_params_common();
|
||||
void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
|
||||
|
||||
bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
|
||||
void finish_processing_train_args(struct train_params_common * params);
|
||||
|
||||
struct random_normal_distribution;
|
||||
struct random_uniform_distribution;
|
||||
|
||||
struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
|
||||
struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
|
||||
|
||||
void free_random_normal_distribution (struct random_normal_distribution * rnd);
|
||||
void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
|
||||
|
||||
struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
|
||||
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
|
||||
|
||||
// generate random float in interval [0,1)
|
||||
float frand();
|
||||
float frand_normal (struct random_normal_distribution * rnd);
|
||||
float frand_uniform(struct random_uniform_distribution * rnd);
|
||||
|
||||
int clamp (const int v, const int min, const int max);
|
||||
float fclamp(const float v, const float min, const float max);
|
||||
|
||||
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
|
||||
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
|
||||
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
|
||||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
|
||||
|
||||
size_t tokenize_file(
|
||||
struct llama_context * lctx,
|
||||
const char * filename,
|
||||
const std::string & sample_start,
|
||||
bool include_sample_start,
|
||||
bool overlapping_samples,
|
||||
unsigned context_length,
|
||||
std::vector<llama_token> & out_tokens,
|
||||
std::vector<size_t> & out_samples_begin,
|
||||
std::vector<size_t> & out_samples_size);
|
||||
|
||||
int64_t get_example_targets_batch(
|
||||
struct llama_context * lctx,
|
||||
struct ggml_tensor * tokens_input,
|
||||
struct ggml_tensor * target_probs,
|
||||
int64_t example_id,
|
||||
const size_t * samples_offs,
|
||||
const size_t * samples_begin,
|
||||
const size_t * samples_size,
|
||||
size_t samples_count,
|
||||
const llama_token * train_data,
|
||||
size_t n_train_data,
|
||||
bool separate_with_eos,
|
||||
bool separate_with_bos,
|
||||
bool fill_with_next_samples,
|
||||
bool sample_random_offsets);
|
||||
|
||||
|
||||
void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
|
||||
mt19937_state mt19937_get_state(const std::mt19937& rng);
|
||||
mt19937_state mt19937_seed_to_state(unsigned seed);
|
||||
|
||||
mt19937_state shuffle_samples(
|
||||
const mt19937_state & rng_state,
|
||||
size_t * shuffled_offs,
|
||||
size_t * shuffled_begins,
|
||||
size_t * shuffled_sizes,
|
||||
const size_t * begins,
|
||||
const size_t * sizes,
|
||||
size_t count);
|
||||
|
||||
size_t hash_combine(size_t h1, size_t h2);
|
||||
|
||||
size_t compute_samples_hash(
|
||||
const char* fn,
|
||||
const size_t* samples_begin,
|
||||
const size_t* samples_size,
|
||||
size_t sample_count);
|
||||
|
||||
|
||||
std::string replace_str(const char * s, const char * needle, const char * replacement);
|
||||
|
||||
void print_duration(double milliseconds);
|
||||
|
||||
float cosine_decay(
|
||||
int64_t step,
|
||||
int64_t decay_steps,
|
||||
float minimum);
|
||||
|
||||
float cosine_decay_restart(
|
||||
int64_t step,
|
||||
int64_t decay_steps,
|
||||
float minimum,
|
||||
float restart_step_mult);
|
||||
|
||||
float learning_schedule(
|
||||
int64_t step,
|
||||
int64_t warmup_steps,
|
||||
int64_t decay_steps,
|
||||
float learning_rate,
|
||||
float overall_minimum,
|
||||
float cos_decay_minimum,
|
||||
float cos_decay_restart_step_mult,
|
||||
bool enable_restart);
|
||||
|
||||
void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
|
||||
|
||||
void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
|
||||
void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
|
||||
|
||||
bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
|
||||
void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
|
||||
|
||||
std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
|
||||
|
||||
void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
|
@ -13,7 +13,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(baby-llama)
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
|
@ -1,5 +0,0 @@
|
||||
set(TARGET llama-baby-llama)
|
||||
add_executable(${TARGET} baby-llama.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
File diff suppressed because it is too large
Load Diff
@ -4,6 +4,7 @@
|
||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||
#include "clip.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
|
@ -1,3 +1,5 @@
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
@ -305,27 +305,10 @@ extern "C" {
|
||||
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
||||
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
// CPU backend
|
||||
//
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
|
||||
// Create a backend buffer from an existing pointer
|
||||
// CPU buffer types are always available
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
150
ggml/include/ggml-cpu.h
Normal file
150
ggml/include/ggml-cpu.h
Normal file
@ -0,0 +1,150 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Scheduling priorities
|
||||
enum ggml_sched_priority {
|
||||
GGML_SCHED_PRIO_NORMAL,
|
||||
GGML_SCHED_PRIO_MEDIUM,
|
||||
GGML_SCHED_PRIO_HIGH,
|
||||
GGML_SCHED_PRIO_REALTIME
|
||||
};
|
||||
|
||||
// Threadpool params
|
||||
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||
struct ggml_threadpool_params {
|
||||
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||
int n_threads; // number of threads
|
||||
enum ggml_sched_priority prio; // thread priority
|
||||
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||
bool strict_cpu; // strict cpu placement
|
||||
bool paused; // start in paused state
|
||||
};
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
struct ggml_cplan {
|
||||
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
||||
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||
|
||||
int n_threads;
|
||||
struct ggml_threadpool * threadpool;
|
||||
|
||||
// abort ggml_graph_compute when true
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
// numa strategies
|
||||
enum ggml_numa_strategy {
|
||||
GGML_NUMA_STRATEGY_DISABLED = 0,
|
||||
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
||||
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
||||
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
||||
GGML_NUMA_STRATEGY_MIRROR = 4,
|
||||
GGML_NUMA_STRATEGY_COUNT
|
||||
};
|
||||
|
||||
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
||||
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||
|
||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||
|
||||
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
GGML_API struct ggml_cplan ggml_graph_plan(
|
||||
const struct ggml_cgraph * cgraph,
|
||||
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
|
||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||
|
||||
// TODO: move to backend interface
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_sve (void);
|
||||
GGML_API int ggml_cpu_has_matmul_int8(void);
|
||||
// get the sve vector length in bytes
|
||||
GGML_API int ggml_cpu_get_sve_cnt(void);
|
||||
|
||||
// Internal types and functions exposed for tests and benchmarks
|
||||
|
||||
typedef void (*ggml_from_float_to_mat_t)
|
||||
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
|
||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
|
||||
struct ggml_type_traits_cpu {
|
||||
ggml_from_float_to_mat_t from_float_to_mat;
|
||||
ggml_vec_dot_t vec_dot;
|
||||
enum ggml_type vec_dot_type;
|
||||
int64_t nrows; // number of rows to process simultaneously
|
||||
int64_t ncols; // number of columns to process simultaneously
|
||||
ggml_gemv_t gemv;
|
||||
ggml_gemm_t gemm;
|
||||
};
|
||||
|
||||
GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
|
||||
|
||||
GGML_API void ggml_cpu_init(void);
|
||||
|
||||
//
|
||||
// CPU backend
|
||||
//
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
@ -573,6 +573,13 @@ extern "C" {
|
||||
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
|
||||
};
|
||||
|
||||
struct ggml_init_params {
|
||||
// memory pool
|
||||
size_t mem_size; // bytes
|
||||
void * mem_buffer; // if NULL, memory will be allocated internally
|
||||
bool no_alloc; // don't allocate memory for the tensor data
|
||||
};
|
||||
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
@ -618,59 +625,6 @@ extern "C" {
|
||||
// If it returns true, the computation is aborted
|
||||
typedef bool (*ggml_abort_callback)(void * data);
|
||||
|
||||
// Scheduling priorities
|
||||
enum ggml_sched_priority {
|
||||
GGML_SCHED_PRIO_NORMAL,
|
||||
GGML_SCHED_PRIO_MEDIUM,
|
||||
GGML_SCHED_PRIO_HIGH,
|
||||
GGML_SCHED_PRIO_REALTIME
|
||||
};
|
||||
|
||||
// Threadpool params
|
||||
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||
struct ggml_threadpool_params {
|
||||
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||
int n_threads; // number of threads
|
||||
enum ggml_sched_priority prio; // thread priority
|
||||
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||
bool strict_cpu; // strict cpu placement
|
||||
bool paused; // start in paused state
|
||||
};
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
struct ggml_cplan {
|
||||
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
||||
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||
|
||||
int n_threads;
|
||||
struct ggml_threadpool * threadpool;
|
||||
|
||||
// abort ggml_graph_compute when true
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
struct ggml_init_params {
|
||||
// memory pool
|
||||
size_t mem_size; // bytes
|
||||
void * mem_buffer; // if NULL, memory will be allocated internally
|
||||
bool no_alloc; // don't allocate memory for the tensor data
|
||||
};
|
||||
|
||||
// numa strategies
|
||||
enum ggml_numa_strategy {
|
||||
GGML_NUMA_STRATEGY_DISABLED = 0,
|
||||
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
||||
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
||||
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
||||
GGML_NUMA_STRATEGY_MIRROR = 4,
|
||||
GGML_NUMA_STRATEGY_COUNT
|
||||
};
|
||||
|
||||
//
|
||||
// GUID
|
||||
@ -693,9 +647,6 @@ extern "C" {
|
||||
// accepts a UTF-8 path, even on Windows
|
||||
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
|
||||
|
||||
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
||||
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||
|
||||
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
||||
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
||||
|
||||
@ -797,8 +748,7 @@ extern "C" {
|
||||
int64_t ne2,
|
||||
int64_t ne3);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
||||
@ -808,35 +758,25 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
|
||||
// Converts a flat index into coordinates
|
||||
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
||||
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||
|
||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||
|
||||
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
||||
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
||||
GGML_ATTRIBUTE_FORMAT(2, 3)
|
||||
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
|
||||
|
||||
// Tensor flags
|
||||
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
// operations on tensors with backpropagation
|
||||
//
|
||||
@ -2052,9 +1992,6 @@ extern "C" {
|
||||
// automatic differentiation
|
||||
//
|
||||
|
||||
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
|
||||
|
||||
@ -2086,27 +2023,6 @@ extern "C" {
|
||||
GGML_API size_t ggml_graph_overhead(void);
|
||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
GGML_API struct ggml_cplan ggml_graph_plan(
|
||||
const struct ggml_cgraph * cgraph,
|
||||
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
|
||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
||||
|
||||
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
||||
@ -2277,6 +2193,8 @@ extern "C" {
|
||||
} lbfgs;
|
||||
};
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
|
||||
|
||||
// optimize the function defined by the tensor f
|
||||
@ -2308,12 +2226,6 @@ extern "C" {
|
||||
ggml_opt_callback callback,
|
||||
void * callback_data);
|
||||
|
||||
//
|
||||
// tensor flags
|
||||
//
|
||||
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
// quantization
|
||||
//
|
||||
@ -2482,8 +2394,6 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
||||
GGML_API int ggml_cpu_has_amx_int8 (void);
|
||||
GGML_API int ggml_cpu_has_fma (void);
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_sve (void);
|
||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||
GGML_API int ggml_cpu_has_metal (void);
|
||||
GGML_API int ggml_cpu_has_f16c (void);
|
||||
@ -2500,17 +2410,9 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_sycl (void);
|
||||
GGML_API int ggml_cpu_has_rpc (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
GGML_API int ggml_cpu_has_matmul_int8(void);
|
||||
GGML_API int ggml_cpu_has_cann (void);
|
||||
GGML_API int ggml_cpu_has_llamafile (void);
|
||||
|
||||
// get the sve vector length in bytes
|
||||
GGML_API int ggml_cpu_get_sve_cnt(void);
|
||||
|
||||
//
|
||||
// Internal types and functions exposed for tests and benchmarks
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
// restrict not standard in C++
|
||||
#define GGML_RESTRICT
|
||||
@ -2519,14 +2421,6 @@ extern "C" {
|
||||
#endif
|
||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_from_float_to_mat_t)
|
||||
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
|
||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
|
||||
struct ggml_type_traits {
|
||||
const char * type_name;
|
||||
@ -2537,13 +2431,6 @@ extern "C" {
|
||||
ggml_to_float_t to_float;
|
||||
ggml_from_float_t from_float;
|
||||
ggml_from_float_t from_float_ref;
|
||||
ggml_from_float_to_mat_t from_float_to_mat;
|
||||
ggml_vec_dot_t vec_dot;
|
||||
enum ggml_type vec_dot_type;
|
||||
int64_t nrows; // number of rows to process simultaneously
|
||||
int64_t ncols; // number of columns to process simultaneously
|
||||
ggml_gemv_t gemv;
|
||||
ggml_gemm_t gemm;
|
||||
};
|
||||
|
||||
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
|
||||
|
@ -1366,10 +1366,12 @@ endif()
|
||||
|
||||
add_library(ggml
|
||||
../include/ggml.h
|
||||
../include/ggml-cpu.h
|
||||
../include/ggml-alloc.h
|
||||
../include/ggml-backend.h
|
||||
../include/ggml-cpp.h
|
||||
ggml.c
|
||||
ggml-cpu.c
|
||||
ggml-alloc.c
|
||||
ggml-backend.cpp
|
||||
ggml-quants.c
|
||||
|
@ -7,6 +7,7 @@
|
||||
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
#include <math.h>
|
||||
|
File diff suppressed because it is too large
Load Diff
13715
ggml/src/ggml-cpu.c
Normal file
13715
ggml/src/ggml-cpu.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -8,6 +8,7 @@
|
||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@ -36,6 +37,20 @@ extern "C" {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static inline int ggml_up32(int n) {
|
||||
return (n + 31) & ~31;
|
||||
}
|
||||
|
||||
//static inline int ggml_up64(int n) {
|
||||
// return (n + 63) & ~63;
|
||||
//}
|
||||
|
||||
static inline int ggml_up(int n, int m) {
|
||||
// assert m is a power of 2
|
||||
GGML_ASSERT((m & (m - 1)) == 0);
|
||||
return (n + m - 1) & ~(m - 1);
|
||||
}
|
||||
|
||||
//
|
||||
// logging
|
||||
//
|
||||
@ -51,6 +66,74 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
|
||||
#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
|
||||
|
||||
#define GGML_DEBUG 0
|
||||
|
||||
#if (GGML_DEBUG >= 1)
|
||||
#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG(...)
|
||||
#endif
|
||||
|
||||
#if (GGML_DEBUG >= 5)
|
||||
#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG_5(...)
|
||||
#endif
|
||||
|
||||
#if (GGML_DEBUG >= 10)
|
||||
#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||
#else
|
||||
#define GGML_PRINT_DEBUG_10(...)
|
||||
#endif
|
||||
|
||||
// tensor params
|
||||
|
||||
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
|
||||
GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
|
||||
assert(params_size <= GGML_MAX_OP_PARAMS);
|
||||
memcpy(tensor->op_params, params, params_size);
|
||||
}
|
||||
|
||||
static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
|
||||
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
||||
return ((const int32_t *)(tensor->op_params))[i];
|
||||
}
|
||||
|
||||
static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
|
||||
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
||||
return ((const float *)(tensor->op_params))[i];
|
||||
}
|
||||
|
||||
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
|
||||
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
||||
((int32_t *)(tensor->op_params))[i] = value;
|
||||
}
|
||||
|
||||
static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
|
||||
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
|
||||
((float *)(tensor->op_params))[i] = value;
|
||||
}
|
||||
|
||||
struct ggml_map_custom1_op_params {
|
||||
ggml_custom1_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
|
||||
struct ggml_map_custom2_op_params {
|
||||
ggml_custom2_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
|
||||
struct ggml_map_custom3_op_params {
|
||||
ggml_custom3_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
// bitset
|
||||
|
||||
typedef uint32_t ggml_bitset_t;
|
||||
@ -204,6 +287,10 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
void * ggml_aligned_malloc(size_t size);
|
||||
void ggml_aligned_free(void * ptr, size_t size);
|
||||
|
||||
// TODO: move to threading file
|
||||
void ggml_critical_section_start(void);
|
||||
void ggml_critical_section_end(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -1296,13 +1296,6 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b
|
||||
UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
|
||||
UNUSED(dev);
|
||||
UNUSED(max_tensor_size);
|
||||
}
|
||||
|
||||
static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
UNUSED(dev);
|
||||
UNUSED(op);
|
||||
@ -1328,7 +1321,7 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
|
||||
/* .init_backend = */ ggml_backend_rpc_device_init,
|
||||
/* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ NULL,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr,
|
||||
/* .buffer_from_host_ptr = */ NULL,
|
||||
/* .supports_op = */ ggml_backend_rpc_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_rpc_device_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
|
15264
ggml/src/ggml.c
15264
ggml/src/ggml.c
File diff suppressed because it is too large
Load Diff
@ -2,6 +2,7 @@
|
||||
#define LLAMA_H
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include <stddef.h>
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <type_traits>
|
||||
|
||||
#include <ggml.h>
|
||||
#include <ggml-cpu.h>
|
||||
|
||||
constexpr int kVecSize = 1 << 16;
|
||||
|
||||
@ -136,7 +137,7 @@ int main(int argc, char** argv) {
|
||||
|
||||
auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
|
||||
|
||||
const auto * funcs = ggml_get_type_traits(ggml_type);
|
||||
const auto * funcs = ggml_get_type_traits_cpu(ggml_type);
|
||||
|
||||
Stat simple, ggml;
|
||||
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <array>
|
||||
|
||||
#include <ggml.h>
|
||||
#include <ggml-cpu.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
@ -236,7 +237,8 @@ int main(int argc, char** argv) {
|
||||
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
|
||||
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
|
||||
|
||||
const auto * funcs = useQ4_1 ? ggml_get_type_traits(GGML_TYPE_Q4_1) : ggml_get_type_traits(GGML_TYPE_Q4_0);
|
||||
const auto * funcs = ggml_get_type_traits(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);
|
||||
const auto * funcs_cpu = ggml_get_type_traits_cpu(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);
|
||||
|
||||
std::vector<block_q4_0> q40;
|
||||
std::vector<block_q4_1> q41;
|
||||
@ -282,10 +284,10 @@ int main(int argc, char** argv) {
|
||||
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
|
||||
}
|
||||
else {
|
||||
const auto * vdot = ggml_get_type_traits(funcs->vec_dot_type);
|
||||
const auto * vdot = ggml_get_type_traits(funcs_cpu->vec_dot_type);
|
||||
vdot->from_float(y1.data(), q8.data(), kVecSize);
|
||||
if (useQ4_1) funcs->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
|
||||
else funcs->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
|
||||
if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
|
||||
else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
|
||||
}
|
||||
sumq += result;
|
||||
t2 = std::chrono::high_resolution_clock::now();
|
||||
|
1
spm-headers/ggml-cpu.h
Symbolic link
1
spm-headers/ggml-cpu.h
Symbolic link
@ -0,0 +1 @@
|
||||
../ggml/include/ggml-cpu.h
|
@ -21900,6 +21900,8 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
|
||||
}
|
||||
|
||||
const char * llama_print_system_info(void) {
|
||||
ggml_cpu_init(); // some ARM features are detected at runtime
|
||||
|
||||
static std::string s;
|
||||
|
||||
s = "";
|
||||
|
@ -16,6 +16,7 @@
|
||||
|
||||
|
||||
#include <ggml.h>
|
||||
#include <ggml-cpu.h>
|
||||
#include <ggml-alloc.h>
|
||||
#include <ggml-backend.h>
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include <chrono>
|
||||
|
@ -1,5 +1,6 @@
|
||||
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
|
@ -1,6 +1,7 @@
|
||||
// Unit tests for quantization specific functions - quantize, dequantize and dot product
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
@ -78,18 +79,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
|
||||
|
||||
// Total dot product error
|
||||
static float dot_product_error(
|
||||
const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2
|
||||
const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float *test_data2
|
||||
) {
|
||||
std::vector<uint8_t> tmp_q1(2*test_size);
|
||||
std::vector<uint8_t> tmp_q2(2*test_size);
|
||||
|
||||
const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
|
||||
const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
|
||||
|
||||
qfns->from_float(test_data1, tmp_q1.data(), test_size);
|
||||
vdot->from_float(test_data2, tmp_q2.data(), test_size);
|
||||
|
||||
float result = INFINITY;
|
||||
qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
||||
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
||||
|
||||
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
||||
|
||||
@ -132,6 +133,7 @@ int main(int argc, char * argv[]) {
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
const auto * qfns = ggml_get_type_traits(type);
|
||||
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
|
||||
|
||||
// deprecated - skip
|
||||
if (qfns->blck_size == 0) {
|
||||
@ -166,7 +168,7 @@ int main(int argc, char * argv[]) {
|
||||
printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
|
||||
}
|
||||
|
||||
const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
|
||||
const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
|
||||
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
|
||||
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
|
||||
? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
||||
|
@ -1,6 +1,7 @@
|
||||
// Benchmark quantization specific functions on synthetic data
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#undef NDEBUG
|
||||
#include <algorithm>
|
||||
@ -271,6 +272,7 @@ int main(int argc, char * argv[]) {
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
const auto * qfns = ggml_get_type_traits(type);
|
||||
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
|
||||
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
|
||||
continue;
|
||||
}
|
||||
@ -328,7 +330,7 @@ int main(int argc, char * argv[]) {
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
|
||||
const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
|
||||
vdot->from_float(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
@ -346,7 +348,7 @@ int main(int argc, char * argv[]) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
float result;
|
||||
qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
||||
qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
||||
return result;
|
||||
};
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
|
Loading…
Reference in New Issue
Block a user